-- lang-wrd.lua /size: 11 Kb    last modification: 2023-12-21 09:44
-- Register this module's metadata in the global 'modules' table; the table is
-- created first when no earlier file has done so.
if not modules then
    modules = { }
end

modules['lang-wrd'] = {
    version   = 1.001,
    comment   = "companion to lang-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9local next, tonumber = next, tonumber
10local lower = string.lower
11local utfchar = utf.char
12local concat, setmetatableindex = table.concat, table.setmetatableindex
13local lpegmatch = lpeg.match
14local P, S, Cs, Cc, C = lpeg.P, lpeg.S, lpeg.Cs, lpeg.Cc, lpeg.C
15
16local report_words = logs.reporter("languages","words")
17
18local nodes           = nodes
19local languages       = languages
20
21local implement       = interfaces.implement
22
23languages.words       = languages.words or { }
24local words           = languages.words
25
26words.data            = words.data or { }
27words.enables         = false
28words.threshold       = 4
29
30local numbers         = languages.numbers
31local registered      = languages.registered
32
33local nuts            = nodes.nuts
34
35----- getfield        = nuts.getfield
36local getnext         = nuts.getnext
37local getid           = nuts.getid
38----- getsubtype      = nuts.getsubtype
39local getchar         = nuts.getchar
40local setattr         = nuts.setattr
41----- getattr         = nuts.getattr
42local getlanguage     = nuts.getlanguage
43local ischar          = nuts.ischar
44
45local nextnode        = nuts.traversers.node
46----- nextglyph       = nuts.traversers.glyph
47
48local wordsdata       = words.data
49local chardata        = characters.data
50local enableaction    = nodes.tasks.enableaction
51
52local unsetvalue      = attributes.unsetvalue
53
54local nodecodes       = nodes.nodecodes
55----- kerncodes       = nodes.kerncodes
56
57local glyph_code      = nodecodes.glyph
58----- disc_code       = nodecodes.disc
59----- kern_code       = nodecodes.kern
60
61----- fontkern_code   = kerncodes.fontkern
62
63local lowerchar       = characters.lower
64
65local a_color         = attributes.private('color')
66local colist          = attributes.list[a_color]
67
68local is_letter       = characters.is_letter -- maybe is_character as variant
69
-- Patterns for reading plain text word lists. A word is a run of non-blank
-- characters from which hyphenation markup is stripped:
--
--   foo-bar  foo=bar     : the "-" and "=" markers are removed
--   {pre}{post}{replace} : only the content of the third group is kept
local spacing = S(" \n\r\t")
local markup  = S("-=") / ""
local lbrace  = P("{") / ""
local rbrace  = P("}") / ""
local snippet = lbrace * (1-rbrace)^0 * rbrace
-- '/' and '*' share one precedence level and associate to the left, so the
-- grouping is spelled out here: the pre and post groups are matched and
-- dropped as a pair, the replace group contributes its content without braces
local disc    = ((snippet/"" * snippet)/"") * snippet
local word    = Cs((markup + disc + (1-spacing))^1)
79
80-- lpegmatch((spacing + word/function(s) print(s) end)^0,"foo foo-bar bar{}{}{}foo  bar{}{}{foo}")
81
-- Files that have been read already, keyed by resolved name; the value is the
-- word set that was filled from the file (we share lists).
local loaded  = { }

-- Readers per file suffix. Each one receives the set to extend and the
-- resolved file name, and flags every word it finds with 'true'.
local loaders = { }

-- Plain text: words separated by blanks, markup is stripped by 'word'.
function loaders.txt(list,fullname)
    local content = io.loaddata(fullname)
    if not content or content == "" then
        return
    end
    local function register(entry)
        list[entry] = true
    end
    lpegmatch((spacing + word/register)^0,content)
end

-- Lua: the file returns a table; the keys of its 'words' subtable are taken.
function loaders.lua(list,fullname)
    local result = dofile(fullname)
    if not result or type(result) ~= "table" then
        return
    end
    local entries = result.words
    if entries then
        for key in next, entries do
            list[key] = true
        end
    end
end

-- the 'luc' suffix shares the lua reader
loaders.luc = loaders.lua
105
-- Loads the word list 'filename' and makes it the list of language 'tag'.
--
-- The file is located with resolvers.findfile and read by the loader that
-- matches its suffix (the plain text loader when the suffix is unknown). A
-- file is read only once: its word set is remembered under the resolved name
-- and reused when the same file is asked for again. On a first read the words
-- are added to the set that is already registered for 'tag', if there is one.
-- A file that cannot be found is reported and otherwise ignored.
function words.load(tag,filename)
    local fullname = resolvers.findfile(filename,'other text file') or ""
    if fullname == "" then
        report_words("missing word file %a",filename)
        return
    end
    report_words("loading word file %a",fullname)
    statistics.starttiming(languages)
    local list = loaded[fullname]
    if not list then
        list = wordsdata[tag] or { }
        local reader = loaders[file.suffix(fullname)] or loaders.txt
        reader(list,fullname)
        loaded[fullname] = list
    end
    wordsdata[tag] = list
    statistics.stoptiming(languages)
end
125
-- Looks up 'str' in the word list of the language with number 'id'.
--
-- Returns 1 when the word is listed as given, 2 when only its lowercased
-- (string.lower) form is listed, and nothing when the language has no tag,
-- the tag has no list, or the list has no matching entry.
function words.found(id, str)
    local tag  = languages.numbers[id]
    local data = tag and wordsdata[tag]
    if not data then
        return
    end
    if data[str] then
        return 1
    end
    if data[lower(str)] then
        return 2
    end
end
139
140-- The following code is an adaption of experimental code for hyphenating and
141-- spell checking.
142
143-- there is an n=1 problem somewhere in nested boxes
144
-- Walks the node list that starts at 'head', collects runs of letters into
-- words and hands each word to 'whenfound'.
--
-- whenfound(language,word) receives the language of the run (the value that
-- getlanguage gave for its characters) and the word as an utf string. When it
-- returns a function, that function is called once for every node that
-- contributed a character to the word; a false or nil result leaves the
-- nodes alone.
--
-- A pending word is flushed at a node that ischar does not accept, at a
-- character whose category is not a letter (is_letter), when the language
-- differs from the one of the pending word, and at the end of the list.
--
-- Returns 'head' as it was passed in.
local function mark_words(head,whenfound) -- can be optimized and shared
    -- the initializer list used to carry two surplus values (nil, 0) that Lua
    -- silently dropped; only these two variables were ever set up here
    local current, language = head, nil
    -- str[1..s] holds the characters of the pending word and nds[1..n] the
    -- nodes that carry them; both tables are reused for every word
    local str, s, nds, n = { }, 0, { }, 0 -- n could also be a table, saves calls
    -- flushes the pending word: asks whenfound for a marker, applies it to
    -- the collected nodes and starts a new word
    local function action()
        if s > 0 then
            local word = concat(str,"",1,s)
            local mark = whenfound(language,word)
            if mark then
                for i=1,n do
                    mark(nds[i])
                end
            end
        end
        n, s = 0, 0
    end
    -- we haven't done the fonts yet so we have characters (otherwise
    -- we'd have to use the tounicodes)
    while current do
        -- only the first result of ischar is needed; the second one was
        -- captured as 'id' for the disabled disc/kern branches below and has
        -- to be captured again when those are revived
        local code = ischar(current) -- not isglyph because otherwise we can run into
        if code then                 -- processed streams (\about[foo] does that)
            local a = getlanguage(current)
            if a then
                if a ~= language then
                    -- language switch: the pending word belongs to the old one
                    if s > 0 then
                        action()
                    end
                    language = a
                end
            elseif s > 0 then
                -- no language on this character: close the pending word
                action()
                language = a
            end
            local data = chardata[code]
            if is_letter[data.category] then
                n = n + 1
                nds[n] = current
                s = s + 1
                str[s] = utfchar(code)
            elseif s > 0 then
                action()
            end
     -- elseif id == disc_code then
     --     -- take the replace .. we kick in before we hyphenate so we're
     --     -- not yet seeing many discs and we only handle explicit ones
     --     -- in fact we could as well decide to ignore words with a disc
     --     -- because we then have a compound word
     --     if n > 0 then
     --         local r = getfield(current,"replace")
     --         if r then
     --             -- also disc itself
     --             n = n + 1
     --             nds[n] = current
     --             --
     --             for current in nextglyph, r do
     --                 local code = getchar(current)
     --                 n = n + 1
     --                 nds[n] = current
     --                 s = s + 1
     --                 str[s] = utfchar(code)
     --             end
     --         end
     --     end
     -- elseif id == kern_code and getsubtype(current) == fontkern_code and s > 0 then
     --     -- ok
        elseif s > 0 then
            action()
        end
        current = getnext(current)
    end
    if s > 0 then
        action()
    end
    return head
end
219
-- Checkers and their optional setup functions, both indexed by method number
-- and also reachable as words.methods and words.enablers.
local methods, enablers = { }, { }

words.methods  = methods
words.enablers = enablers

-- the method that words.check runs and whether checking is active at all
local wordmethod, enabled = 1, false
228
-- Runs the selected method over the list 'head' while checking is enabled and
-- returns whatever that method returns; otherwise 'head' is handed back
-- together with 'false'.
function words.check(head)
    if not enabled then
        -- same result with or without a list
        return head, false
    end
    return methods[wordmethod](head)
end
238
-- Selects a checking method and switches checking on.
--
-- settings.method : number (or numeric string) of the method to use; when it
--                   is absent or not numeric the current method is kept
-- settings        : is also passed to the enabler of the method, if any
--
-- A number without an entry in 'methods' is reported and ignored: accepting
-- it would make every later words.check call a nil value.
function words.enable(settings)
    local method    = settings.method
    local requested = method and tonumber(method)
    if not requested then
        wordmethod = wordmethod or 1
    elseif methods[requested] then
        wordmethod = requested
    else
        report_words("unknown method %a, keeping method %a",method,wordmethod)
    end
    local e = enablers[wordmethod]
    if e then
        e(settings)
    end
    enableaction("processors","languages.words.check")
    enabled = true
end
249
-- Switches checking off: from now on words.check hands lists back untouched.
-- Only the flag is cleared here, the selected method is kept.
function words.disable() enabled = false end
253
254-- colors
255
-- Markers per color, created on first use and kept afterwards. A key is
-- either a color name (string) or a language number; a number is mapped onto
-- the color "word:<tag>", with "word:unset" for negative numbers and for
-- numbers without a tag, and "word:unknown" when the tag has no color. The
-- value is a function that sets the color attribute of a node, or 'false'
-- when the color list has no matching entry.
local cache = { } -- can also be done with method 1 -- frozen colors once used

setmetatableindex(cache, function(t,k) -- k == language, numbers[k] == tag
    local color
    if type(k) == "string" then
        color = colist[k]
    elseif k < 0 then
        color = colist["word:unset"]
    else
        local tag = numbers[k] or "unset"
        color = colist["word:" .. tag] or colist["word:unknown"]
    end
    local marker = false
    if color then
        marker = function(n)
            setattr(n,a_color,color)
        end
    end
    t[k] = marker
    return marker
end)
271
272-- method 1
273
-- Method 1: picks the marker for a word by whether words.found knows it.
-- Words shorter than words.threshold (counted in bytes) get no marker.
local function sweep(language,str)
    if #str < words.threshold then
        return false
    end
    -- can become a local wordsfound, maybe variables.yes
    local key = words.found(language,str) and "word:yes" or "word:no"
    return cache[key]
end
283
-- Unsets the color attribute of every node that the node traverser yields
-- for 'head' and then lets mark_words color the words with the method 1 sweep.
methods[1] = function(head)
    for current in nextnode, head do
        -- hm, not that selective (reset color)
        setattr(current,a_color,unsetvalue)
    end
    return mark_words(head,sweep)
end
290
291-- method 2
292
-- State of method 2 (collecting the words that are used):
--
--   dumpname : file to save to, set by a directive or when saving
--   dumpthem : becomes true once method 2 has processed a list
--   listname : name of the category that is collected into
--   category : record of that category, replaced by enablers[2]
local dumpname, dumpthem, listname = nil, false, "document"

local category = { }
298
-- Records per list name, created on first access:
--
--   categories[listname] = { total = <n>, languages = { [tag] = <record> } }
--
-- A language record is created on first access as well; it holds the word
-- counts for that tag plus some details copied from the registered language,
-- when there is one.
local categories = setmetatableindex(function(all,listname)
    local perlanguage = setmetatableindex(function(records,tag)
        local r = registered[tag]
        local record = {
            -- this used to read the undefined global 'language' and so was
            -- always nil; NOTE(review): assumes that the registered record
            -- keeps the language number in 'number' -- confirm, when it does
            -- not the field simply stays absent as before
            number   = r and r.number   or nil,
            parent   = r and r.parent   or nil,
            patterns = r and r.patterns or nil,
            tag      = r and r.tag      or nil,
            list     = { }, -- word -> number of occurrences
            total    = 0,   -- words counted
            unique   = 0,   -- different words counted
        }
        records[tag] = record
        return record
    end)
    local entry = {
        languages = perlanguage,
        total     = 0,
    }
    all[listname] = entry
    return entry
end)
321
-- Everything that method 2 gathers; dumpusedwords serializes this table.
local collected = {
    total      = 0,          -- words counted over all categories
    version    = 1.000,
    categories = categories, -- records per list name
}
327
-- Setup for method 2: selects the category that words are collected into. It
-- is named by settings.list, or "document" when that is missing or empty.
enablers[2] = function(settings)
    local name = settings.list
    if not name or name == "" then
        name = "document"
    end
    listname = name
    category = collected.categories[name]
end
333
-- Method 2: counts a word instead of marking it. The word is lowercased with
-- characters.lower and counted in the record of its language (tag "unset"
-- when the language number has no tag) within the current category. Words
-- shorter than words.threshold (counted in bytes) are skipped. Nothing is
-- returned, so mark_words leaves the nodes alone.
local function sweep(language,str)
    if #str < words.threshold then
        return
    end
    str = lowerchar(str)
    local record = category.languages[numbers[language] or "unset"]
    local list   = record.list
    local count  = list[str]
    if count then
        list[str] = count + 1
    else
        list[str] = 1
        record.unique = record.unique + 1
    end
    record.total    = record.total + 1
    category.total  = category.total + 1
    collected.total = collected.total + 1
end
351
-- Collects the words of the list with the method 2 sweep and flags that there
-- is something for dumpusedwords to save.
methods[2] = function(head)
    dumpthem = true -- checked by dumpusedwords
    return mark_words(head,sweep)
end
356
-- Saves the collected words, but only when method 2 has been used. The
-- threshold in effect is stored along with the data. Without a name set by
-- the directive the file is named after the job, with suffix "words".
local function dumpusedwords()
    if not dumpthem then
        return
    end
    collected.threshold = words.threshold
    if not dumpname then
        dumpname = file.addsuffix(tex.jobname,"words")
    end
    report_words("saving list of used words in %a",dumpname)
    local serialized = table.serialize(collected,true)
    io.savedata(dumpname,serialized)
 -- table.tofile(dumpname,list,true)
end
366
-- The directive "languages.words.dump" sets the name of the file that the
-- used words are saved in; any value that is not a non-empty string is
-- ignored.
directives.register("languages.words.dump", function(v)
    if type(v) == "string" and v ~= "" then
        dumpname = v
    end
end)

-- dumpusedwords is registered as a stop action
luatex.registerstopactions(dumpusedwords)
372
373-- method 3
374
-- Method 3: the marker depends on the language of the word only; the word
-- itself ('str') is not looked at.
local function sweep(language,str)
    local marker = cache[language]
    return marker
end
378
-- Unsets the color attribute of every node that the node traverser yields
-- for 'head' and then lets mark_words color the words with the method 3 sweep.
methods[3] = function(head)
    for current in nextnode, head do
        setattr(current,a_color,unsetvalue)
    end
    return mark_words(head,sweep)
end
385
386-- for the moment we hook it into the attribute handler
387
388-- languagehacks = { }
389
390-- function languagehacks.process(namespace,attribute,head)
391--     return languages.check(head)
392-- end
393
394-- chars.plugins[chars.plugins+1] = {
395--     name = "language",
396--     namespace = languagehacks,
397--     processor = languagehacks.process
398-- }
399
400-- interface
401
-- enablespellchecking: calls words.enable; the argument specification names
-- the keys "method" and "list", which are the ones words.enable and
-- enablers[2] read from their settings table
implement {
    name      = "enablespellchecking",
    arguments = { { { "method" }, { "list" } } },
    actions   = words.enable,
}

-- disablespellchecking: calls words.disable, no arguments
implement {
    name    = "disablespellchecking",
    actions = words.disable,
}

-- loadspellchecklist: calls words.load, which takes (tag,filename)
implement {
    name      = "loadspellchecklist",
    arguments = "2 strings",
    actions   = words.load,
}
423