-- mtx-spell.lua /size: 11 Kb    last modification: 2021-10-28 13:50
-- Register this script's metadata in the global `modules` registry (created
-- here when it does not exist yet). The entry is keyed by this script's own
-- name; the previous key 'mtx-patterns' looks like a leftover from the script
-- this file was copied from.
if not modules then modules = { } end modules ['mtx-spell'] = {
    version   = 1.001,
    comment   = "companion to mtxrun.lua",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9local find, gsub, match = string.find, string.gsub, string.match
10local concat = table.concat
11local P, R, S, C, Ct, Cmt, Cc, Cs =  lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Cs
12local patterns = lpeg.patterns
13local lpegmatch = lpeg.match
14
-- Help information in XML form; it is handed to logs.application as the
-- `helpinfo` field. The dictionary flag names the ".dic" extension that the
-- example uses, and the example passes an ".aff" file to --specification, in
-- line with the description of that flag.
local helpinfo = [[
<?xml version="1.0"?>
<application>
 <metadata>
  <entry name="name">mtx-spell</entry>
  <entry name="detail">ConTeXt Word Filtering</entry>
  <entry name="version">0.10</entry>
 </metadata>
 <flags>
  <category name="basic">
   <subcategory>
    <flag name="expand"><short>expand hunspell dics and aff files</short></flag>
    <flag name="dictionary"><short>word file (.dic)</short></flag>
    <flag name="specification"><short>affix specification file (.aff)</short></flag>
    <flag name="result"><short>destination file</short></flag>
   </subcategory>
  </category>
 </flags>
 <examples>
  <category>
   <title>Examples</title>
   <subcategory>
    <example><command>mtxrun --script spell --expand --dictionary="en_US.dic" --specification="en_US.aff" --result="data-us.txt"</command></example>
   </subcategory>
  </category>
 </examples>
</application>
]]
43
44
-- Set up the application object with the script name, banner and help text.
local application = logs.application({
    name     = "mtx-spell",
    banner   = "ConTeXt Word Filtering 0.10",
    helpinfo = helpinfo,
})

-- `report` is the application's message function; `trace` switches on the
-- print statements in the affix functions.
local report, trace = application.report, false

-- Make sure the global namespace for this script exists.
if not scripts then
    scripts = { }
end
if not scripts.spell then
    scripts.spell = { }
end
56
57---------------
58
59require("char-def")
60require("char-utf")
61
-- nl: ĳ (U+0133 LATIN SMALL LIGATURE IJ, one character) => ij (two letters)
63
64do
65
66    local prefixes, suffixes, affixes, continue, collected
67
68    local function resetall()
69        prefixes  = table.setmetatableindex("table")
70        suffixes  = table.setmetatableindex("table")
71        affixes   = table.setmetatableindex("table")
72        continue  = { }
73        collected = { }
74    end
75
76    local uppers   = { }
77    local chardata = characters.data
78    for k, v in next, chardata do
79        if v.category == "lu" then
80            uppers[utf.char(k)] = true
81        end
82    end
83
84    local newline = patterns.newline
85    local digit   = patterns.digit
86    local skipped = digit + lpeg.utfchartabletopattern(uppers)
87    local ignored = 1 - newline
88    local garbage = S("'-")
89
90    local function fixeddata(data)
91        data = gsub(data,"ij","ij")
92        return data
93    end
94
95    local function registersuffix(tag,f)
96        table.insert(suffixes[tag],f)
97        table.insert(affixes [tag],f)
98    end
99
100    local function registerprefix(tag,f)
101        table.insert(prefixes[tag],f)
102        table.insert(affixes [tag],f)
103    end
104
105    local function getfixes(specification)
106
107        local data  = fixeddata(io.loaddata(specification) or "")
108        local lines = string.splitlines(data)
109
110        -- /* in two
111        -- Y/N continuation
112
113        -- [^...] [...] ...
114
115        local p0 = nil
116
117        local p1 = P("[^") * Cs((1-P("]"))^1) * P("]") / function(s)
118            local t = utf.split(s)
119            local p = 1 - lpeg.utfchartabletopattern(t)
120            p0 = p0 and (p0 * p) or p
121        end
122        local p2 = P("[") * Cs((1-P("]"))^1) * P("]") / function(s)
123            local t = utf.split(s)
124            local p = lpeg.utfchartabletopattern(t)
125            p0 = p0 and (p0 * p) or p
126        end
127        local p3 = (patterns.utf8char - S("[]"))^1 / function(s)
128            local p = P(s)
129            p0 = p0 and (p0 * p) or p
130        end
131
132        local p = (p1 + p2 + p3)^1
133
134        local function makepattern(s)
135            p0 = nil
136            lpegmatch(p,s)
137            return p0
138        end
139
140        local i = 1
141        while i <= #lines do
142            local line = lines[i]
143            local tag, continuation, n = match(line,"PFX%s+(%S+)%s+(%S+)%s+(%d+)")
144            if tag then
145                n = tonumber(n) or 0
146                continue[tag] = continuation == "Y"
147                for j=1,n do
148                    i = i + 1
149                    line = lines[i]
150                    if not find(line,"[-']") then
151                        local tag, one, two, three = match(line,"PFX%s+(%S+)%s+(%S+)%s+([^%s/]+)%S*%s+(%S+)")
152                        if tag then
153                            if one == "0" and two and three == "." then
154                                -- simple case: PFX A 0 re .
155                                registerprefix(tag,function(str)
156                                    local new = two .. str
157                                    if trace then
158                                        print("p 1",str,new)
159                                    end
160                                    return new
161                                end)
162                            elseif one == "0" and two and three then
163                            -- strip begin
164                                if trace then
165                                    print('2',line)
166                                end
167                            elseif one and two and three then
168                                if trace then
169                                    print('3',line)
170                                end
171                            else
172                                if trace then
173                                    print('4',line)
174                                end
175                            end
176                        end
177                    end
178                end
179            end
180            local tag, continuation, n = match(line,"SFX%s+(%S+)%s+(%S+)%s+(%S+)")
181            if tag then
182                n = tonumber(n) or 0
183                continue[tag] = continuation == "Y"
184                for j=1,n do
185                    i = i + 1
186                    line = lines[i]
187                    if not find(line,"[-']") then
188                        local tag, one, two, three = match(line,"SFX%s+(%S+)%s+(%S+)%s+([^%s/]+)%S*%s+(%S+)")
189                        if tag then
190                            if one == "0" and two and three == "." then
191                                -- SFX Y 0 ly .
192                                registersuffix(tag,function(str)
193                                    local new = str .. two
194                                    if trace then
195                                        print("s 1",str,new)
196                                    end
197                                    return new
198                                end)
199                            elseif one == "0" and two and three then
200                                -- SFX G 0 ing [^e]
201                                local final = makepattern(three) * P(-1)
202                                local check = (1 - final)^0 * final
203                                registersuffix(tag,function(str)
204                                    if lpegmatch(check,str) then
205                                        local new = str .. two
206                                        if trace then
207                                            print("s 2",str,new)
208                                        end
209                                        return new
210                                    end
211                                end)
212                            elseif one and two and three then
213                                -- SFX G match$ suffix old$ (dutch has sloppy matches, use english as reference)
214                                local final   = makepattern(three) * P(-1)
215                                local check   = (1 - final)^1 * final
216                                local final   = makepattern(one) * P(-1)
217                                local replace = Cs((1 - final)^1 * (final/two))
218                                registersuffix(tag,function(str)
219                                    if lpegmatch(check,str) then
220                                        local new = lpegmatch(replace,str)
221                                        if new then
222                                            if trace then
223                                                print("s 3",str,new)
224                                            end
225                                            return new
226                                        end
227                                    end
228                                end)
229                            else
230                                if trace then
231                                    print('4',line)
232                                end
233                            end
234                        end
235                    end
236                end
237            end
238            i = i + 1
239        end
240    end
241
242    local function expand(_,_,word,spec)
243        if spec then
244            local w = { word }
245            local n = 1
246            for i=1,#spec do
247                local s = spec[i]
248                local affix = affixes[s]
249                if affix then
250                    for i=1,#affix do
251                        local ai = affix[i]
252                        local wi = ai(word)
253                        if wi then
254                            n = n + 1
255                            w[n] = wi
256                            if not continue[s] then
257                                break
258                            end
259                        end
260                    end
261                end
262            end
263            for i=1,n do
264                collected[w[i]] = true
265            end
266        elseif not find(word,"/") then
267            collected[word] = true
268        end
269        return true
270    end
271
272    local function getwords(dictionary)
273        local data = fixeddata(io.loaddata(dictionary) or "")
274        local keys = { }
275        for k, v in next, prefixes do
276            keys[k] = true
277        end
278        for k, v in next, suffixes do
279            keys[k] = true
280        end
281        local validkeys = lpeg.utfchartabletopattern(keys)
282        local specifier = P("/") * Ct(C(validkeys)^1)^0 * newline
283        local pattern   = (
284            newline^1
285          + skipped * (1-newline)^0
286          + Cmt(C((1-specifier-newline-garbage)^1) * specifier^0, expand)
287          + ignored^1 * newline^1
288        )^0
289        lpegmatch(pattern,data)
290        collected = table.keys(collected)
291        table.sort(collected)
292        return collected
293    end
294
295    local function saveall(result)
296        if result then
297            io.savedata(result,concat(collected,"\n"))
298        end
299    end
300
301    function scripts.spell.expand(arguments)
302        if arguments then
303            local dictionary    = environment.arguments.dictionary
304            local specification = environment.arguments.specification
305            local result        = environment.arguments.result
306            if type(dictionary) ~= "string" or dictionary == "" then
307                report("missing --dictionary=name")
308            elseif type(specification) ~= "string" or specification == "" then
309                report("missing --specification=name")
310            elseif type(result) ~= "string" or result == "" then
311                resetall()
312                getfixes(specification)
313                getwords(dictionary)
314                saveall(result)
315                return collected
316            end
317        end
318    end
319
320end
321
322-- spell.dicaff {
323--     dictionary    = "e:/context/spell/lo/en_US.dic.txt",
324--     specification = "e:/context/spell/lo/en_US.aff.txt",
325--     result        = "e:/context/spell/lo/data-en.txt",
326-- }
327
328-- spell.dicaff {
329--     dictionary    = "e:/context/spell/lo/en_GB.dic.txt",
330--     specification = "e:/context/spell/lo/en_GB.aff.txt",
331--     result        = "e:/context/spell/lo/data-uk.txt",
332-- }
333
334-- spell.dicaff {
335--     dictionary    = "e:/context/spell/lo/nl_NL.dic.txt",
336--     specification = "e:/context/spell/lo/nl_NL.aff.txt",
337--     result        = "e:/context/spell/lo/data-nl.txt",
338-- }
339
-- Command line dispatch: --expand runs the expansion, --exporthelp exports
-- the help information, and without either the help is shown.
do
    local argument = environment.argument
    if argument("expand") then
        scripts.spell.expand(environment.arguments)
    else
        local exporthelp = argument("exporthelp")
        if exporthelp then
            application.export(exporthelp,environment.files[1])
        else
            application.help()
        end
    end
end
347