lang-wrd.lua /size: 11 Kb    last modification: 2021-10-28 13:50
1if not modules then modules = { } end modules ['lang-wrd'] = {
2    version   = 1.001,
3    comment   = "companion to lang-ini.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9local next, tonumber = next, tonumber
10local lower = string.lower
11local utfchar = utf.char
12local concat, setmetatableindex = table.concat, table.setmetatableindex
13local lpegmatch = lpeg.match
14local P, S, Cs, Cf, Cg, Cc, C = lpeg.P, lpeg.S, lpeg.Cs, lpeg.Cf, lpeg.Cg, lpeg.Cc, lpeg.C
15
16local report_words = logs.reporter("languages","words")
17
18local nodes           = nodes
19local languages       = languages
20
21local implement       = interfaces.implement
22
-- The spell checking namespace; a languages.words table that already exists is
-- reused.

local words     = languages.words or { }
languages.words = words

words.data      = words.data or { } -- word lists, indexed by language tag (see words.load)
words.enables   = false             -- NOTE(review): never read in this file, the active state
                                    -- lives in the local 'enabled' flag; maybe a typo
words.threshold = 4                 -- words with a smaller byte length (#str) are skipped
29
30local numbers         = languages.numbers
31local registered      = languages.registered
32
33local nuts            = nodes.nuts
34
35----- getfield        = nuts.getfield
36local getnext         = nuts.getnext
37local getid           = nuts.getid
38----- getsubtype      = nuts.getsubtype
39local getchar         = nuts.getchar
40local setattr         = nuts.setattr
41----- getattr         = nuts.getattr
42local getlanguage     = nuts.getlanguage
43local ischar          = nuts.ischar
44
45local nextnode        = nuts.traversers.node
46----- nextglyph       = nuts.traversers.glyph
47
48local wordsdata       = words.data
49local chardata        = characters.data
50local enableaction    = nodes.tasks.enableaction
51
52local unsetvalue      = attributes.unsetvalue
53
54local nodecodes       = nodes.nodecodes
55----- kerncodes       = nodes.kerncodes
56
57local glyph_code      = nodecodes.glyph
58----- disc_code       = nodecodes.disc
59----- kern_code       = nodecodes.kern
60
61----- fontkern_code   = kerncodes.fontkern
62
63local lowerchar       = characters.lower
64
65local a_color         = attributes.private('color')
66local colist          = attributes.list[a_color]
67
68local is_letter       = characters.is_letter -- maybe is_character as variant
69
-- Grammar for the entries of a plain text word list. Entries are separated by
-- whitespace. Inside an entry the markup characters - and = are removed and a
-- {pre}{post}{replace} triplet is reduced to the content of its replace part.

local spacing = S(" \n\r\t")
local markup  = S("-=") / ""
local lbrace  = P("{") / ""
local rbrace  = P("}") / ""
local snippet = lbrace * (1-rbrace)^0 * rbrace -- the braces vanish, the content stays
local disc    = (snippet / "")                 -- pre     : dropped
              * (snippet / "")                 -- post    : dropped
              * snippet                        -- replace : kept
local word    = Cs((markup + disc + (1-spacing))^1)
79
80-- lpegmatch((spacing + word/function(s) print(s) end)^0,"foo foo-bar bar{}{}{}foo  bar{}{}{foo}")
81
local loaded  = { } -- we share lists: one table per resolved filename

-- Loaders indexed by file suffix. Each one adds the words found in the file
-- 'fullname' as keys (with value true) to the table 'list'.

local loaders = { }

-- Plain text: whitespace separated entries that match the 'word' pattern. A file
-- that cannot be read or that is empty leaves the list untouched. (A folding
-- Cf/Cg/rawset based parser was tried and is not better.)

function loaders.txt(list,fullname)
    local data = io.loaddata(fullname)
    if not data or data == "" then
        return
    end
    local function register(s)
        list[s] = true
    end
    lpegmatch((spacing + word/register)^0,data)
end

-- Lua: the file is run with dofile and has to return a table; only the keys of
-- its 'words' subtable are used.

function loaders.lua(list,fullname)
    local data = dofile(fullname)
    if type(data) ~= "table" then
        return
    end
    local entries = data.words
    if entries then
        for k in next, entries do
            list[k] = true
        end
    end
end

-- files with suffix luc are handled by the same loader

loaders.luc = loaders.lua
106
-- Loads the word list 'filename' and makes it the list of language 'tag'. The
-- file is located with resolvers.findfile. A file that has been loaded before is
-- not read again: its list is shared. The file suffix selects the loader, with
-- the plain text loader as fallback. A file that cannot be found is only reported.

function words.load(tag,filename)
    local fullname = resolvers.findfile(filename,"other text file") or ""
    if fullname == "" then
        report_words("missing word file %a",filename)
        return
    end
    report_words("loading word file %a",fullname)
    statistics.starttiming(languages)
    local list = loaded[fullname]
    if not list then
        -- first time we see this file: extend the list that the tag already
        -- has, or start a fresh one
        list = wordsdata[tag] or { }
        local loader = loaders[file.suffix(fullname)] or loaders.txt
        loader(list,fullname)
        loaded[fullname] = list
    end
    wordsdata[tag] = list
    statistics.stoptiming(languages)
end
126
-- Checks if 'str' is a known word of the language with number 'id'.
--
-- Returns 1 when the word is in the list as given, 2 when only its lowercased
-- form is in the list, and nothing when the number has no tag, the tag has no
-- list, or the word is not known.
--
-- The lowercase lookup uses characters.lower, the same function that method 2
-- uses when it collects words. The string.lower function works bytewise, so it
-- leaves non ascii uppercase letters untouched and a capitalized word that
-- starts with such a letter would never match its lowercase list entry.

function words.found(id, str)
    local tag  = languages.numbers[id]
    local data = tag and wordsdata[tag]
    if not data then
        return
    end
    if data[str] then
        return 1
    elseif data[lowerchar(str)] then
        return 2
    end
end
140
141-- The following code is an adaption of experimental code for hyphenating and
142-- spell checking.
143
144-- there is an n=1 problem somewhere in nested boxes
145
-- Walks the node list 'head' and collects runs of letters into words. A run ends
-- at a character that is not a letter, at a change of language and at every node
-- for which 'ischar' gives no character. Each finished word is passed on as
-- whenfound(language,word); when that call returns a function, the function is
-- applied to every glyph node of the word. The head is returned as it came in.

local function mark_words(head,whenfound) -- can be optimized and shared
    local language        -- language of the word that is being collected
    local str, s = { }, 0 -- the characters (utf strings) of that word and their count
    local nds, n = { }, 0 -- the glyph nodes of that word; n could also be a table, saves calls

    -- Hands over the collected word (if any) and starts a new collection.
    local function action()
        if s > 0 then
            local mark = whenfound(language,concat(str,"",1,s))
            if mark then
                for i=1,n do
                    mark(nds[i])
                end
            end
        end
        n, s = 0, 0
    end

    -- we haven't done the fonts yet so we have characters (otherwise
    -- we'd have to use the tounicodes)
    local current = head
    while current do
        -- not isglyph because otherwise we can run into processed streams
        -- (\about[foo] does that)
        local char = ischar(current)
        if char then
            local lang = getlanguage(current)
            if lang then
                if lang ~= language then
                    -- another language: the word collected so far ends here
                    if s > 0 then
                        action()
                    end
                    language = lang
                end
            elseif s > 0 then
                action()
                language = lang
            end
            if is_letter[chardata[char].category] then
                n = n + 1
                nds[n] = current
                s = s + 1
                str[s] = utfchar(char)
            elseif s > 0 then
                action()
            end
     -- elseif id == disc_code then
     --     -- take the replace .. we kick in before we hyphenate so we're
     --     -- not yet seeing many discs and we only handle explicit ones
     --     -- in fact we could as well decide to ignore words with a disc
     --     -- because we then have a compound word
     --     if n > 0 then
     --         local r = getfield(current,"replace")
     --         if r then
     --             -- also disc itself
     --             n = n + 1
     --             nds[n] = current
     --             --
     --             for current in nextglyph, r do
     --                 local code = getchar(current)
     --                 n = n + 1
     --                 nds[n] = current
     --                 s = s + 1
     --                 str[s] = utfchar(code)
     --             end
     --         end
     --     end
     -- elseif id == kern_code and getsubtype(current) == fontkern_code and s > 0 then
     --     -- ok
        elseif s > 0 then
            -- anything else ends the word
            action()
        end
        current = getnext(current)
    end
    -- the list can end in the middle of a word
    if s > 0 then
        action()
    end
    return head
end
220
-- Registries that are filled further down: methods[n] processes a node list and
-- the optional enablers[n] is called by words.enable with the settings table.

local methods, enablers = { }, { }

words.methods  = methods
words.enablers = enablers

local wordmethod = 1     -- index of the method that words.check runs
local enabled    = false -- set by words.enable, reset by words.disable
229
-- The node list processor. When checking is enabled the selected method does
-- the work and its results are returned. Otherwise the list (or the lack of one)
-- is handed back together with false.

function words.check(head)
    if not enabled then
        return head, false
    end
    return methods[wordmethod](head)
end
239
-- Switches checking on. The 'method' entry of 'settings' (a number or a string
-- that tonumber accepts) selects the method; without a usable value the current
-- method is kept. When the method has an enabler, that one gets the settings.
-- Finally the processor action is enabled and the local flag is set.

function words.enable(settings)
    local method = settings.method
    local number = method and tonumber(method)
    wordmethod = number or wordmethod or 1
    local enabler = enablers[wordmethod]
    if enabler then
        enabler(settings)
    end
    enableaction("processors","languages.words.check")
    enabled = true
end
250
-- Switches checking off: from now on words.check hands lists back untouched.
-- Only the local flag is reset, the processor action is not disabled here.

function words.disable()
    enabled = false
end
254
255-- colors
256
-- Markers, made on demand and then kept (so colors are frozen once used); this
-- can also be done with method 1. A key is either the name of a color (string)
-- or a language number. The value is a function that sets the color attribute
-- of a node, or false when no matching color is defined.

local cache = { }

setmetatableindex(cache, function(t,k) -- k == language, numbers[k] == tag
    local color
    if type(k) == "string" then
        color = colist[k]
    elseif k < 0 then
        color = colist["word:unset"]
    else
        local tag = numbers[k] or "unset"
        color = colist["word:" .. tag] or colist["word:unknown"]
    end
    local marker = false
    if color then
        marker = function(n)
            setattr(n,a_color,color)
        end
    end
    t[k] = marker
    return marker
end)
272
273-- method 1
274
-- Word handler of method 1. Words with a byte length below words.threshold get
-- no marker (false); other words get the "word:yes" marker when words.found
-- knows them and the "word:no" marker when it doesn't.

local function sweep(language,str)
    if #str >= words.threshold then
        local known = words.found(language,str) -- can become a local wordsfound
        return cache[known and "word:yes" or "word:no"] -- maybe variables.yes
    end
    return false
end
284
-- Method 1: colors words depending on them being found in the loaded lists. The
-- color attribute of every node in the list is unset first (hm, not that
-- selective), after that the words are marked.

methods[1] = function(head)
    for current in nextnode, head do
        setattr(current,a_color,unsetvalue)
    end
    return mark_words(head,sweep)
end
291
292-- method 2
293
-- State of method 2, the one that collects the words that are used.

local dumpname   = nil        -- file to save to, can be set with the languages.words.dump directive
local dumpthem   = false      -- becomes true once method 2 has processed a list
local listname   = "document" -- name of the active category
local category   = { }        -- the active category, assigned by enablers[2]

-- Statistics per list name; entries are created on first access:
--
--   categories[listname] = { total = <count>, languages = <per language entries> }
--
-- The per language entries are created on first access too. They are indexed by
-- language tag (or "unset") and keep a word list with counts, a total and the
-- number of unique words, plus some properties of the registered language.
--
-- The number field used to be assigned from 'language', a name that is not
-- defined in this scope (so a global got looked up). It is now taken from the
-- registered language, just like parent, patterns and tag.
-- NOTE(review): this assumes that registered entries have a 'number' field;
-- when they don't, the field stays nil.

local categories = setmetatableindex(function(t,k)
    local perlanguage = setmetatableindex(function(t,k)
        local r = registered[k]
        local v = {
            number   = r and r.number   or nil,
            parent   = r and r.parent   or nil,
            patterns = r and r.patterns or nil,
            tag      = r and r.tag      or nil,
            list     = { },
            total    = 0,
            unique   = 0,
        }
        t[k] = v
        return v
    end)
    local v = {
        languages = perlanguage,
        total     = 0,
    }
    t[k] = v
    return v
end)

-- This table is what gets saved at the end of the run.

local collected  = {
    total      = 0,
    version    = 1.000,
    categories = categories,
}
328
-- Enabler of method 2: makes the category named by the 'list' entry of the
-- settings the active one; without a (non empty) name "document" is used.

enablers[2] = function(settings)
    local name = settings.list
    if name and name ~= "" then
        listname = name
    else
        listname = "document"
    end
    category = collected.categories[listname]
end
334
-- Word handler of method 2. A word with a byte length of at least
-- words.threshold is lowercased and counted in the active category under the
-- tag of its language ("unset" when the number has no tag). The totals of the
-- language, the category and the whole collection are bumped as well. Nothing
-- is returned, so no nodes get marked.

local function sweep(language,str)
    if #str < words.threshold then
        return
    end
    local word  = lowerchar(str)
    local entry = category.languages[numbers[language] or "unset"]
    local list  = entry.list
    local count = list[word]
    if count then
        list[word] = count + 1
    else
        -- first occurrence of this word
        list[word] = 1
        entry.unique = entry.unique + 1
    end
    entry.total     = entry.total + 1
    category.total  = category.total + 1
    collected.total = collected.total + 1
end
352
-- Method 2: collects the words of the list. It also flags that there is
-- something to save when the run ends.

methods[2] = function(head)
    dumpthem = true -- checked by dumpusedwords
    return mark_words(head,sweep)
end
357
-- Saves the collected words, but only when method 2 has been active. The
-- threshold in use is stored along with the data. Unless a name has been set
-- the file is named after the job, with suffix "words".

local function dumpusedwords()
    if not dumpthem then
        return
    end
    collected.threshold = words.threshold
    if not dumpname then
        dumpname = file.addsuffix(tex.jobname,"words")
    end
    report_words("saving list of used words in %a",dumpname)
    io.savedata(dumpname,table.serialize(collected,true))
 -- table.tofile(dumpname,list,true)
end
367
-- A non empty string value of this directive becomes the name of the file that
-- the collected words are saved in; other values leave the name as it is.

directives.register("languages.words.dump", function(v)
    if type(v) == "string" and v ~= "" then
        dumpname = v
    end
end)

-- saving is registered as a stop action

luatex.registerstopactions(dumpusedwords)
373
374-- method 3
375
-- Word handler of method 3: the marker is looked up by language number only,
-- the word itself ('str') plays no role.

local function sweep(language,str)
    local marker = cache[language]
    return marker
end
379
-- Method 3: colors words by language. The color attribute of every node in the
-- list is unset first, after that each word gets the marker of its language.

methods[3] = function(head)
    for current in nextnode, head do
        setattr(current,a_color,unsetvalue)
    end
    return mark_words(head,sweep)
end
386
387-- for the moment we hook it into the attribute handler
388
389-- languagehacks = { }
390
391-- function languagehacks.process(namespace,attribute,head)
392--     return languages.check(head)
393-- end
394
395-- chars.plugins[chars.plugins+1] = {
396--     name = "language",
397--     namespace = languagehacks,
398--     processor = languagehacks.process
399-- }
400
401-- interface
402
-- enablespellchecking: words.enable gets a table with the keys method and list

implement {
    name      = "enablespellchecking",
    arguments = { { { "method" }, { "list" } } },
    actions   = words.enable,
}

-- disablespellchecking: no arguments

implement {
    name    = "disablespellchecking",
    actions = words.disable,
}

-- loadspellchecklist: words.load gets two strings (tag and filename)

implement {
    name      = "loadspellchecklist",
    arguments = "2 strings",
    actions   = words.load,
}
424