lang-rep.lmt /size: 17 Kb    last modification: 2023-12-21 09:44
1if not modules then modules = { } end modules ['lang-rep'] = {
2    version   = 1.001,
3    comment   = "companion to lang-rep.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
-- A BachoTeX 2013 experiment, probably not that useful. Eventually I used a simpler
-- more generic example. I'm sure no one ever notices or even needs this code.
11--
12-- As a follow up on a question by Alan about special treatment of dropped caps I wonder
13-- if I can make this one more clever (probably in a few more dev steps). For instance
14-- injecting nodes or replacing nodes. It's a prelude to a kind of lpeg for nodes,
15-- although (given experiences so far) we don't really need that. After all, each problem
16-- is somewhat unique.
17
18local type, tonumber, next = type, tonumber, next
19local gmatch, gsub = string.gmatch, string.gsub
20local utfbyte, utfsplit = utf.byte, utf.split
21local P, C, U, Cc, Ct, Cs, lpegmatch = lpeg.P, lpeg.C, lpeg.patterns.utf8character, lpeg.Cc, lpeg.Ct, lpeg.Cs, lpeg.match
22local find = string.find
23
-- Patterns used to parse replacement specifications.
--
-- zwnj     : code point inserted for an empty group "{}"
-- grouped  : one "{...}" group, captured as a table of code points, or false
--            when the group is empty
-- splitter : turns a replacement string into a list holding code points,
--            { "discretionary", pre, post, replace } tables (three groups in
--            a row) and { "noligature", codes } tables (a single group)
-- stripper : removes one pair of braces that wraps the whole string

local zwnj = 0x200C

local grouped, splitter, stripper

do

    local lbrace    = P("{")
    local rbrace    = P("}")
    local codepoint = U/utfbyte

    grouped = lbrace * ( Ct((codepoint-rbrace)^1) + Cc(false) ) * rbrace

    local emptygroup    = P("{}") * Cc(zwnj)
    local discretionary = Ct(Cc("discretionary") * grouped * grouped * grouped)
    local noligature    = Ct(Cc("noligature")    * grouped)

    -- the lookahead avoids trying the group variants on ordinary characters
    local special = #lbrace * ( emptygroup + discretionary + noligature )

    splitter = Ct(( special + codepoint )^1)

    -- (1-P(-2))^0 eats everything but the last character, which has to be "}"
    stripper = lbrace * Cs((1-P(-2))^0) * rbrace * P(-1)

end
36
37local trace_replacements = false  trackers.register("languages.replacements",         function(v) trace_replacements = v end)
38local trace_details      = false  trackers.register("languages.replacements.details", function(v) trace_details      = v end)
39
40local report_replacement = logs.reporter("languages","replacements")
41
42local glyph_code         = nodes.nodecodes.glyph
43local glue_code          = nodes.nodecodes.glue
44
45local spaceskip_code     = nodes.gluecodes.spaceskip
46local xspaceskip_code    = nodes.gluecodes.xspaceskip
47
48local nuts               = nodes.nuts
49
50local getnext            = nuts.getnext
51local getprev            = nuts.getprev
52local getattr            = nuts.getattr
53local getid              = nuts.getid
54local getsubtype         = nuts.getsubtype
55local getchar            = nuts.getchar
56local isglyph            = nuts.isglyph
57
58local setlink            = nuts.setlink
59local setnext            = nuts.setnext
60local setprev            = nuts.setprev
61local setchar            = nuts.setchar
62local setattrlist        = nuts.setattrlist
63local setoptions         = nuts.setoptions
64
65local glyphoptioncodes       = tex.glyphoptioncodes
66local norightligature_option = glyphoptioncodes.norightligature
67local noleftligature_option  = glyphoptioncodes.noleftligature
68
69local insertbefore       = nuts.insertbefore
70local insertafter        = nuts.insertafter
71local remove_node        = nuts.remove
72local copy_node          = nuts.copy
73local flushlist          = nuts.flushlist
74
75local nodepool           = nuts.pool
76local new_disc           = nodepool.disc
77
78local texsetattribute    = tex.setattribute
79local unsetvalue         = attributes.unsetvalue
80
81local enableaction       = nodes.tasks.enableaction
82
83local v_reset            = interfaces.variables.reset
84
85local implement          = interfaces.implement
86
87local processors         = typesetters.processors
88local splitprocessor     = processors.split
89
90local replacements       = languages.replacements or { }
91languages.replacements   = replacements
92
93local a_replacements     = attributes.private("replacements")
94
-- Registry of replacement categories. Asking 'lists' for an unknown name
-- creates a category on the fly: it gets the next attribute value and an empty
-- trie, and is stored under both its name and its number. 'trees' maps the
-- attribute value directly onto the trie.

local lists = { }
local last  = 0
local trees = { }

table.setmetatableindex(lists,function(t,name)
    local index = last + 1
    local list  = { }
    local data  = { name = name, list = list, attribute = index }
    last         = index
    t[index]     = data
    t[name]      = data
    trees[index] = list
    return data
end)

-- the reset category is created right away and maps onto the unset attribute

lists[v_reset].attribute = unsetvalue -- so we discard 0
110
111-- todo: glue kern attr
112
-- Register 'word' in the trie 'root'. The trie is keyed by code point; the
-- node that belongs to the last character of the word gets a 'final' record
-- that describes the replacement.
--
-- root        : trie (table) of a replacement category
-- word        : utf string that has to be matched
-- replacement : either a function (stored as 'replacer' and called at
--               replacement time) or a specification string, optionally
--               prefixed by a processor and optionally wrapped in braces
--
-- An empty word registers nothing.

local function add(root,word,replacement)
    local final
    if type(replacement) == "function" then
        -- there is no processor in this case (the old code picked up a
        -- global with that name by accident)
        final = {
            word     = word,
            replacer = replacement,
        }
    else
        local processor, specification = splitprocessor(replacement,true) -- no check
        specification = lpegmatch(stripper,specification) or specification
        final = {
            word        = word,
            replacement = specification,
            processor   = processor,
            newcodes    = lpegmatch(splitter,specification),
            special     = find(specification,"{",1,true),
        }
    end
    local list = utfsplit(word) -- ,true)
    local size = #list
    if size == 0 then
        return
    end
    final.oldlength = size
    -- walk down the trie and create missing branches on the way
    local node = root
    for i=1,size do
        local l      = utfbyte(list[i])
        local branch = node[l]
        if not branch then
            branch  = { }
            node[l] = branch
        end
        node = branch
    end
    node.final = final
end
158
-- Add one replacement, or a whole table of word/replacement pairs, to the
-- given category. A missing replacement for a single word becomes the empty
-- string.

function replacements.add(category,word,replacement)
    local root = lists[category].list
    if type(word) ~= "table" then
        add(root,word,replacement or "")
        return
    end
    for old, new in next, word do
        add(root,old,new)
    end
end
169
170-- local strip = lpeg.stripper("{}")
171
-- Add a list of specifications to a category. The list is either a string
-- with whitespace separated entries or an indexed table. The word that gets
-- matched is the entry with all braces removed, the entry itself is the
-- replacement.

function languages.replacements.addlist(category,list)
    local root = lists[category].list
    local function register(new)
        local old = gsub(new,"[{}]","")
     -- local old = lpegmatch(strip,new)
        add(root,old,new)
    end
    if type(list) == "string" then
        for new in gmatch(list,"%S+") do
            register(new)
        end
    else
        for i=1,#list do
            register(list[i])
        end
    end
end
189
-- Build a node list from a list of character codes. Each node is a copy of
-- 'template' with its character set; the head of the new list is returned
-- (nil for an empty list).

local function tonodes(list,template)
    local head, tail
    for i=1,#list do
        local glyph = copy_node(template)
        setchar(glyph,list[i])
        if not head then
            head, tail = glyph, glyph
        else
            head, tail = insertafter(head,tail,glyph)
        end
    end
    return head
end
203
204local ispunctuation = characters.is_punctuation
205
-- We can try to be clever and use the fact that there is no match to skip
-- over to the next word, but that gives fuzzy code, so for now I removed
-- that optimization (when I really need a high performance version myself
-- I will look into it, but so far I never used this mechanism myself).
--
-- We used to have the hit checker as a function but it got messy when checks
-- for punctuation were added.
213
-- Replace the matched run first..last in the list that starts at 'head' by
-- what the trie record 'final' prescribes.
--
-- head     : head of the node list
-- first    : first node of the match
-- last     : last node of the match
-- final    : record made by 'add' (word, replacement or replacer, newcodes,
--            special, oldlength)
-- hasspace : true when the match spans a space glue
-- overload : optional function, called afterwards with the record and the
--            nodes that now sit between the old neighbours of the match
--
-- Returns the (possibly new) head and the node that followed 'last'. These
-- two values are also returned, with the list untouched, when a replacer
-- function doesn't come up with a string. Callers assign the result to their
-- head (and sometimes to their current node), so returning nothing here
-- would make them lose the list.

local function replace(head,first,last,final,hasspace,overload)
    local current     = first
    local prefirst    = getprev(first) or head
    local postlast    = getnext(last)
    local oldlength   = final.oldlength
    local newcodes    = final.newcodes
    local word        = final.word
    local replacement = final.replacement
    local replacer    = final.replacer
    local special     = final.special
    if type(replacer) == "function" then
        replacement = replacer(word)
        if type(replacement) == "string" then
            special  = find(replacement,"{",1,true)
            newcodes = lpegmatch(splitter,replacement)
        else
            -- no usable replacement, keep the list as it is
            return head, postlast
        end
    end
    local newlength = newcodes and #newcodes or 0
    if trace_replacements then
        report_replacement("replacing word %a by %a",word,replacement)
    end
    if hasspace or special then
        -- It's easier to delete and insert so we do just that. On the todo list is
        -- turn injected spaces into glue but easier might be to let the char break
        -- handler do that ...
        local before  = getprev(current)
        local after   = getnext(last)
        local oldlist = current
        -- cut the old run loose and close the gap
        setnext(last)
        setlink(before,after)
        current = before
        if not current then
            -- NOTE(review): the match started the list, so the nodes inserted
            -- below start a new list; it looks as if they are not linked to
            -- 'after' then, presumably there is always a node in front of a
            -- match -- confirm
            head = nil
        end
        for i=1,newlength do
            local codes = newcodes[i]
            if type(codes) == "table" then
                local method = codes[1]
                if method == "discretionary" then
                    local pre, post, replacing = codes[2], codes[3], codes[4]
                    if pre then
                        pre = tonodes(pre,first)
                    end
                    if post then
                        post = tonodes(post,first)
                    end
                    if replacing then
                        replacing = tonodes(replacing,first)
                    end
                    -- todo: also set attr
                    local new = new_disc(pre,post,replacing)
                    setattrlist(new,first)
                    head, current = insertafter(head,current,new)
                elseif method == "noligature" then
                    -- not that efficient to copy but ok for testing
                    local group = codes[2]
                    if group then
                        local n = #group
                        for j=1,n do
                            local new = copy_node(first)
                            setchar(new,group[j])
                            if j == 1 then
                                setoptions(new,norightligature_option)
                            elseif j == n then
                                setoptions(new,noleftligature_option | norightligature_option)
                            else
                                setoptions(new,noleftligature_option)
                            end
                            head, current = insertafter(head,current,new)
                        end
                    else
                     -- local new = copy_node(first)
                     -- setchar(new,zwnj)
                     -- head, current = insertafter(head,current,new)
                        setoptions(current,norightligature_option)
                    end
                else
                    report_replacement("unknown method %a",method or "?")
                end
            else
                local new = copy_node(first)
                setchar(new,codes)
                head, current = insertafter(head,current,new)
            end
        end
        -- 'first' served as template till here so we can only free the run now
        flushlist(oldlist)
    elseif newlength == 0 then
        -- we overload
    elseif oldlength == newlength then
        -- same size: just change the characters
        if word ~= replacement then
            for i=1,newlength do
                setchar(current,newcodes[i])
                current = getnext(current)
            end
        end
     -- current = getnext(final) -- some left over? no next!
    elseif oldlength < newlength then
        -- longer: put the extra characters in front and reuse the old nodes
        for i=1,newlength-oldlength do
            local n = copy_node(current)
            setchar(n,newcodes[i])
            head, current = insertbefore(head,current,n)
            current = getnext(current)
        end
        for i=newlength-oldlength+1,newlength do
            setchar(current,newcodes[i])
            current = getnext(current)
        end
    else
        -- shorter: drop the surplus nodes and reuse the remaining ones
        for i=1,oldlength-newlength do
            head, current = remove_node(head,current,true)
        end
        for i=1,newlength do
            setchar(current,newcodes[i])
            current = getnext(current)
        end
    end
    if overload then
        overload(final,getnext(prefirst),getprev(postlast))
    end
    return head, postlast
end
339
340-- we handle just one space
341
-- The node list handler. Glyphs that carry the replacements attribute are fed
-- character by character into the trie of their category. A pending match is
-- turned into a replacement when we go from word to punctuation (or the other
-- way around), when a non glyph is seen, or at the end of the list. A space
-- glue can be part of a match when the trie has an entry for character 32; in
-- that case the shorter match that ended before the space is kept as fallback.
--
-- The (possibly new) head of the list is returned.

function replacements.handler(head)
    local overload  = attributes.applyoverloads
    local current   = head
    local mode      = false -- false, "word" or "punc"
    local tree      = false -- trie of the current category
    local root      = false -- position in the trie
    local wordstart = false
    local wordend   = false
    local prevend   = false -- end of the fallback match
    local prevfinal = false -- record of the fallback match
    local hasspace  = false

    -- apply what is pending (the full match wins over the fallback) and
    -- forget about it
    local function flush()
        if root then
            local final = root.final
            if final then
                head = replace(head,wordstart,wordend,final,hasspace,overload)
            elseif prevfinal then
                head = replace(head,wordstart,prevend,prevfinal,hasspace,overload)
            end
            prevfinal = false
            root      = false
        end
    end

    while current do
        local id = getid(current) -- or use the char getter
        if id == glyph_code then
            local a = getattr(current,a_replacements)
            if not a then
                tree = false
            else
                -- we have a run
                tree = trees[a]
                if not tree then
                    root = false
                else
                    local char  = getchar(current)
                    local class = ispunctuation[char] and "punc" or "word"
                    if mode ~= class then
                        if mode then
                            -- we go from word to punctuation or reverse
                            flush()
                        end
                        mode = class
                    end
                    if root then
                        -- continue the match
                        root = root[char]
                        if root then
                            wordend = current
                        end
                    else
                        if prevfinal then
                            head = replace(head,wordstart,prevend,prevfinal,hasspace,overload)
                            prevfinal = false
                        end
                        -- maybe a new match starts here
                        root = tree[char]
                        if root then
                            wordstart = current
                            wordend   = current
                            prevend   = false
                            hasspace  = false
                        end
                    end
                end
            end
            current = getnext(current)
        elseif root then
            local final    = root.final
            local extended = false
            if mode == "word" and id == glue_code then
                local subtype = getsubtype(current)
                if subtype == spaceskip_code or subtype == xspaceskip_code then
                    extended = root[32] -- maybe more types
                end
            end
            if extended then
                -- the space is part of a longer match, remember the first
                -- complete match as fallback
                if final and not prevend then
                    prevend   = wordend
                    prevfinal = final
                end
                wordend  = current
                root     = extended
                hasspace = true
            else
                if final then
                    head, current = replace(head,wordstart,wordend,final,hasspace,overload)
                elseif prevfinal then
                    head, current = replace(head,wordstart,prevend,prevfinal,hasspace,overload)
                end
                prevfinal = false
                root      = false
            end
            current = getnext(current)
        else
            current = getnext(current)
        end
    end
    -- there can be a match that runs till the end of the list
    flush()
    return head
end
463
-- Select the replacement category for what follows by setting the attribute.
-- The reset keyword unsets the attribute. The handler is enabled the first
-- time that a real category is chosen.

local enabled = false

function replacements.set(n)
    local attribute
    if n == v_reset then
        attribute = unsetvalue
    else
        attribute = lists[n].attribute
        if not enabled then
            enableaction("processors","languages.replacements.handler")
            if trace_replacements then
                report_replacement("enabling replacement handler")
            end
            enabled = true
        end
    end
    texsetattribute(a_replacements,attribute)
end
481
482-- interface
483
-- the three entry points that get registered, all take string arguments

implement { name = "setreplacements",     actions = replacements.set,     arguments = "string"    }
implement { name = "addreplacements",     actions = replacements.add,     arguments = "3 strings" }
implement { name = "addreplacementslist", actions = replacements.addlist, arguments = "2 strings" }
501