mtx-spell.lua /size: 12 Kb    last modification: 2025-02-21 11:03
-- Registers this script's metadata in the global modules table, creating the
-- table when it does not exist yet. The key names this script (mtx-spell), so
-- the entry no longer lands in the slot of the unrelated mtx-patterns script.
if not modules then modules = { } end modules ['mtx-spell'] = {
    version   = 1.001,
    comment   = "companion to mtxrun.lua",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9local find, gsub, match = string.find, string.gsub, string.match
10local concat = table.concat
11local P, R, S, C, Ct, Cmt, Cc, Cs =  lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Cs
12local patterns = lpeg.patterns
13local lpegmatch = lpeg.match
14
-- XML description of the script (name, flags, examples); it is handed to
-- logs.application below. The file extensions mentioned in the flag
-- descriptions and in the example agree with each other: .dic for the word
-- file and .aff for the affix specification.
local helpinfo = [[
<?xml version="1.0"?>
<application>
 <metadata>
  <entry name="name">mtx-spell</entry>
  <entry name="detail">ConTeXt Word Filtering</entry>
  <entry name="version">0.10</entry>
 </metadata>
 <flags>
  <category name="basic">
   <subcategory>
    <flag name="expand"><short>expand hunspell dics and aff files</short></flag>
    <flag name="dictionary"><short>word file (.dic)</short></flag>
    <flag name="specification"><short>affix specification file (.aff)</short></flag>
    <flag name="result"><short>destination file</short></flag>
   </subcategory>
  </category>
 </flags>
 <examples>
  <category>
   <title>Examples</title>
   <subcategory>
    <example><command>mtxrun --script spell --expand --dictionary="en_US.dic" --specification="en_US.aff" --result="data-us.txt"</command></example>
   </subcategory>
  </category>
 </examples>
</application>
]]
43
44
-- Application object for this script: it carries the name, the banner and the
-- help text defined above.
local application = logs.application({
    name     = "mtx-spell",
    banner   = "ConTeXt Word Filtering 0.10",
    helpinfo = helpinfo,
})

-- Message reporter of the application, used for the "missing --..." messages.
local report = application.report

-- When set to true the affix transformers print every expansion they make.
local trace = false

-- Make sure the namespace that receives the entry point exists.
if not scripts then
    scripts = { }
end
if not scripts.spell then
    scripts.spell = { }
end
56
57---------------
58
59require("char-def")
60require("char-utf")
61
-- nl: ij ligature (U+0133) => ij
63
64do
65
-- State shared by the functions below; it is (re)created by resetall.
--
--   prefixes / suffixes / affixes : per flag, a list of transformer functions
--   continue                      : per flag, the "Y" marker of the class header
--   collected                     : set of resulting words (word -> true)
local prefixes, suffixes, affixes, continue, collected

-- Starts a fresh run by replacing all shared state with empty tables.
-- NOTE(review): table.setmetatableindex("table") presumably yields a table that
-- creates missing subtables on access; registerprefix and registersuffix rely
-- on prefixes[tag] and friends existing without creating them first.
local function resetall()
    local autotable = table.setmetatableindex
    prefixes, suffixes, affixes = autotable("table"), autotable("table"), autotable("table")
    continue, collected = { }, { }
end
75
-- Set of all characters that the character database marks with category "lu",
-- keyed by the utf-8 string of the character.
local uppers   = { }
local chardata = characters.data
for unicode, entry in next, chardata do
    if entry.category == "lu" then
        uppers[utf.char(unicode)] = true
    end
end

-- Building blocks for the dictionary scanner in getwords.
local newline = patterns.newline
local digit   = patterns.digit
local ignored = P(1) - newline  -- anything on a line
local garbage = S("-'")         -- characters that end a word
-- A line that starts with a digit or with one of the collected characters is
-- skipped as a whole by getwords.
local skipped = digit + lpeg.utfchartabletopattern(uppers)
89
-- Normalizes raw file content before it is parsed.
--
-- Currently one fix is applied (for Dutch): the single character "latin small
-- ligature ij" (U+0133) is replaced by the two letters "ij". The ligature is
-- written with decimal byte escapes (utf-8: 0xC4 0xB3) on purpose, because a
-- literal ligature in the source is easily flattened to plain "ij" by editors
-- or converters, which turns the replacement into a no-op.
--
-- data   : string, the content of a dictionary or affix file
-- return : string, the normalized content (one value only)
local function fixeddata(data)
    data = gsub(data,"\196\179","ij")
    return data
end
94
-- Appends the transformer f for flag tag to the suffix list of that flag and
-- to the combined affix list of that flag.
local function registersuffix(tag,f)
    local bysuffix = suffixes[tag]
    local byaffix  = affixes[tag]
    bysuffix[#bysuffix+1] = f
    byaffix [#byaffix +1] = f
end
99
-- Appends the transformer f for flag tag to the prefix list of that flag and
-- to the combined affix list of that flag.
local function registerprefix(tag,f)
    local byprefix = prefixes[tag]
    local byaffix  = affixes[tag]
    byprefix[#byprefix+1] = f
    byaffix [#byaffix +1] = f
end
104
-- Reads an affix file and registers a transformer function for every PFX/SFX
-- rule that this script knows how to apply.
--
-- Two kinds of lines are recognized:
--
--   PFX|SFX <flag> <Y|N> <count>                      class header
--   PFX|SFX <flag> <strip> <add>[/flags] <condition>  rule, <count> of them
--
-- The header marker is remembered as continue[flag] (true for "Y"); expand
-- uses it to decide whether more than one rule of a flag may fire for a word.
-- Rule lines that contain a "-" or "'" are skipped.
--
-- Handled rules:
--
--   PFX, strip "0", condition "."  : prepend <add>
--   SFX, strip "0", condition "."  : append <add>
--   SFX, strip "0", other condition: append <add> when the word end matches
--   SFX, other strip               : when the word end matches the condition,
--                                    replace the trailing <strip> by <add>
--
-- Other prefix rules are only shown when tracing is enabled.
--
-- Compared to a straightforward reading of the file this function is
-- defensive in three places: the rule count of an SFX header has to be
-- numeric (as for PFX), so a stray rule line is never taken for a header; a
-- class that announces more rules than there are lines left stops at the end
-- of the file; and a condition that cannot be compiled skips the rule instead
-- of raising an error.
--
-- specification : string, name of the affix file; an unreadable file is
--                 treated as empty
-- return        : nothing; results go through registerprefix and
--                 registersuffix, and into continue
local function getfixes(specification)

    local data  = fixeddata(io.loaddata(specification) or "")
    local lines = string.splitlines(data)

    -- A condition is compiled piece by piece; every recognized piece appends
    -- its own pattern to the accumulator p0:
    --
    --   [^...]  succeeds when no character of the set follows
    --   [...]   a character of the set
    --   ...     a literal run of characters (no brackets)

    local p0 = nil

    local function append(p)
        p0 = p0 and (p0 * p) or p
    end

    local p1 = P("[^") * Cs((1-P("]"))^1) * P("]") / function(s)
        append(1 - lpeg.utfchartabletopattern(utf.split(s)))
    end
    local p2 = P("[") * Cs((1-P("]"))^1) * P("]") / function(s)
        append(lpeg.utfchartabletopattern(utf.split(s)))
    end
    local p3 = (patterns.utf8char - S("[]"))^1 / function(s)
        append(P(s))
    end

    local p = (p1 + p2 + p3)^1

    -- Returns the compiled pattern for s, or nil when no piece was recognized.
    local function makepattern(s)
        p0 = nil
        lpegmatch(p,s)
        return p0
    end

    -- Shows a rule that is not turned into a transformer (tracing only).
    local function unhandled(label,line)
        if trace then
            print(label,line)
        end
    end

    -- Turns one prefix rule into a transformer, when it is a simple one.
    local function addprefix(tag,strip,add,condition,line)
        if strip ~= "0" then
            unhandled('3',line)
        elseif condition ~= "." then
            -- strip begin
            unhandled('2',line)
        else
            -- simple case: PFX A 0 re .
            registerprefix(tag,function(str)
                local new = add .. str
                if trace then
                    print("p 1",str,new)
                end
                return new
            end)
        end
    end

    -- Turns one suffix rule into a transformer.
    local function addsuffix(tag,strip,add,condition,line)
        if strip == "0" and condition == "." then
            -- SFX Y 0 ly .
            registersuffix(tag,function(str)
                local new = str .. add
                if trace then
                    print("s 1",str,new)
                end
                return new
            end)
            return
        end
        local tail = makepattern(condition)
        if not tail then
            -- nothing usable in the condition
            unhandled('4',line)
            return
        end
        local final = tail * P(-1)
        if strip == "0" then
            -- SFX G 0 ing [^e]
            local check = (1 - final)^0 * final
            registersuffix(tag,function(str)
                if lpegmatch(check,str) then
                    local new = str .. add
                    if trace then
                        print("s 2",str,new)
                    end
                    return new
                end
            end)
        else
            -- SFX G match$ suffix old$ (dutch has sloppy matches, use english as reference)
            local old = makepattern(strip)
            if not old then
                unhandled('4',line)
                return
            end
            local check   = (1 - final)^1 * final
            local ending  = old * P(-1)
            local replace = Cs((1 - ending)^1 * (ending/add))
            registersuffix(tag,function(str)
                if lpegmatch(check,str) then
                    local new = lpegmatch(replace,str)
                    if new then
                        if trace then
                            print("s 3",str,new)
                        end
                        return new
                    end
                end
            end)
        end
    end

    local headerfields = "%s+(%S+)%s+(%S+)%s+(%d+)"
    local rulefields   = "%s+(%S+)%s+(%S+)%s+([^%s/]+)%S*%s+(%S+)"

    -- When lines[i] is a class header of the given kind ("PFX" or "SFX"), the
    -- announced number of following lines is consumed and every usable rule
    -- is passed to handler. Returns the index of the last line that was
    -- looked at (i itself when lines[i] is no such header).
    local function readclass(kind,i,handler)
        local tag, continuation, n = match(lines[i],kind .. headerfields)
        if tag then
            continue[tag] = continuation == "Y"
            local rule = kind .. rulefields
            for j=1,tonumber(n) or 0 do
                local line = lines[i+1]
                if not line then
                    -- fewer lines left than the header announced
                    break
                end
                i = i + 1
                if not find(line,"[-']") then
                    local tag, strip, add, condition = match(line,rule)
                    if tag then
                        handler(tag,strip,add,condition,line)
                    end
                end
            end
        end
        return i
    end

    local i = 1
    while i <= #lines do
        i = readclass("PFX",i,addprefix)
        i = readclass("SFX",i,addsuffix)
        i = i + 1
    end
end
241
-- Match-time callback of the dictionary scanner (the first two arguments, the
-- subject and the position, are not used).
--
-- word : string, the captured word
-- spec : optional list of flags that followed the word
--
-- With flags, the word itself is collected plus what the transformers of each
-- flag make of it; per flag the first transformer that delivers a result ends
-- the flag, unless continue[flag] is set. Without flags only the part in front
-- of the last "/" is collected, provided it is longer than one byte; a word
-- without any "/" is not collected in that case.
--
-- Always returns true, so the enclosing match goes on.
local function expand(_,_,word,spec)
    if not spec then
        local stem = match(word,"^(.+)/")
        if stem and #stem > 1 then
            collected[stem] = true
        end
        return true
    end
    collected[word] = true
    for index=1,#spec do
        local flag  = spec[index]
        local rules = affixes[flag]
        if rules then
            local once = not continue[flag]
            for r=1,#rules do
                local derived = rules[r](word)
                if derived then
                    collected[derived] = true
                    if once then
                        break
                    end
                end
            end
        end
    end
    return true
end
276
-- Scans a dictionary file and fills collected with the expanded words.
--
-- A flag is valid when at least one prefix or suffix rule was registered for
-- it. Per position the scanner tries, in this order: empty lines, a line that
-- starts with a digit or one of the collected "lu" characters (skipped as a
-- whole), a word with optional "/flags" (handed to expand), and finally the
-- rest of a line.
--
-- dictionary : string, name of the word file; an unreadable file is treated
--              as empty
-- return     : the sorted list of collected words; collected itself is
--              replaced by this list
local function getwords(dictionary)
    local data = fixeddata(io.loaddata(dictionary) or "")
    local keys = { }
    for tag in next, prefixes do
        keys[tag] = true
    end
    for tag in next, suffixes do
        keys[tag] = true
    end
    local validkeys = lpeg.utfchartabletopattern(keys)
    local flags     = Ct(C(validkeys)^1)
    local specifier = P("/") * flags^0 * newline
    local wordpart  = 1 - specifier - newline - garbage
    local entry     = Cmt(C(wordpart^1) * specifier^0, expand)
    local pattern   = (
        newline^1
      + skipped * (1-newline)^0
      + entry
      + ignored^1 * newline^1
    )^0
    lpegmatch(pattern,data)
    -- from set to sorted list
    collected = table.keys(collected)
    table.sort(collected)
    return collected
end
299
-- Writes the collected words to the file named result, one word per line.
-- Nothing happens when no name is given.
local function saveall(result)
    if not result then
        return
    end
    local content = concat(collected,"\n")
    io.savedata(result,content)
end
305
-- Expands a dictionary with the help of an affix specification and saves the
-- resulting word list.
--
-- arguments : when false or nil nothing is done. When a table, its fields
--             dictionary, specification and result are used; a field that is
--             absent is taken from the command line (environment.arguments),
--             so passing environment.arguments itself works as before. Any
--             other true value means: take everything from the command line.
--
-- All three options have to be non-empty strings; otherwise the first missing
-- one is reported and nothing is done.
--
-- return    : the sorted list of collected words, or nothing when the run was
--             not started
function scripts.spell.expand(arguments)
    if not arguments then
        return
    end
    local given = type(arguments) == "table" and arguments or { }
    local function option(name)
        local value = given[name]
        if value == nil then
            value = environment.arguments[name]
        end
        return value
    end
    local dictionary    = option("dictionary")
    local specification = option("specification")
    local result        = option("result")
    if type(dictionary) ~= "string" or dictionary == "" then
        report("missing --dictionary=name")
    elseif type(specification) ~= "string" or specification == "" then
        report("missing --specification=name")
    elseif type(result) ~= "string" or result == "" then
        report("missing --result=name")
    else
        resetall()
        getfixes(specification)
        getwords(dictionary)
        saveall(result)
        return collected
    end
end
326
327end
328
329-- spell.dicaff {
330--     dictionary    = "e:/context/spell/lo/en_US.dic.txt",
331--     specification = "e:/context/spell/lo/en_US.aff.txt",
332--     result        = "e:/context/spell/lo/data-en.txt",
333-- }
334
335-- spell.dicaff {
336--     dictionary    = "e:/context/spell/lo/en_GB.dic.txt",
337--     specification = "e:/context/spell/lo/en_GB.aff.txt",
338--     result        = "e:/context/spell/lo/data-uk.txt",
339-- }
340
341-- spell.dicaff {
342--     dictionary    = "e:/context/spell/lo/nl_NL.dic.txt",
343--     specification = "e:/context/spell/lo/nl_NL.aff.txt",
344--     result        = "e:/context/spell/lo/data-nl.txt",
345-- }
346
-- Command line dispatch: --expand runs the expansion, --exporthelp exports the
-- help information, anything else shows the help.
local argument = environment.argument

if argument("expand") then
    scripts.spell.expand(environment.arguments)
else
    local exporthelp = argument("exporthelp")
    if exporthelp then
        application.export(exporthelp,environment.files[1])
    else
        application.help()
    end
end
354