lang-rep.lmt /size: 17 Kb    last modification: 2025-02-21 11:03
1if not modules then modules = { } end modules ['lang-rep'] = {
2    version   = 1.001,
3    comment   = "companion to lang-rep.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
-- A BachoTeX 2013 experiment, probably not that useful. Eventually I used a simpler
-- more generic example. I'm sure no one ever notices or even needs this code.
11--
12-- As a follow up on a question by Alan about special treatment of dropped caps I wonder
13-- if I can make this one more clever (probably in a few more dev steps). For instance
14-- injecting nodes or replacing nodes. It's a prelude to a kind of lpeg for nodes,
15-- although (given experiences so far) we don't really need that. After all, each problem
16-- is somewhat unique.
17
18local type, tonumber, next = type, tonumber, next
19local gmatch, gsub = string.gmatch, string.gsub
20local utfbyte, utfsplit = utf.byte, utf.split
21local P, C, U, Cc, Ct, Cs, lpegmatch = lpeg.P, lpeg.C, lpeg.patterns.utf8character, lpeg.Cc, lpeg.Ct, lpeg.Cs, lpeg.match
22local find = string.find
23
local zwnj <const> = 0x200C

-- Building blocks: a single utf character is captured as its code point.

local leftbrace  = P("{")
local rightbrace = P("}")
local codepoint  = U/utfbyte

-- A braced group results in a table of code points, or in false when there is
-- nothing between the braces.

local grouped = leftbrace * (Ct((codepoint - rightbrace)^1) + Cc(false)) * rightbrace

-- The splitter converts a replacement string into a list. An entry is one of:
--
--   a code point                                 : a regular character
--   the zwnj code point                          : for "{}"
--   { "discretionary", group, group, group }     : for three consecutive groups
--   { "noligature", group }                      : for a single group
--
-- The alternatives are tried in this order, so "{}" wins over an empty group.

local braced   = P("{}") * Cc(zwnj)
               + Ct(Cc("discretionary") * grouped * grouped * grouped)
               + Ct(Cc("noligature")    * grouped)

local splitter = Ct((#leftbrace * braced + codepoint)^1)

-- The stripper removes one pair of outer braces: the match has to start with "{"
-- and the very last character has to be "}" (everything up to that last character
-- is consumed by the substitution capture).

local stripper = leftbrace * Cs((1 - P(-2))^0) * rightbrace * P(-1)
37
38local trace_replacements = false  trackers.register("languages.replacements",         function(v) trace_replacements = v end)
39local trace_details      = false  trackers.register("languages.replacements.details", function(v) trace_details      = v end)
40
41local report_replacement = logs.reporter("languages","replacements")
42
43local glyph_code         <const> = nodes.nodecodes.glyph
44local glue_code          <const> = nodes.nodecodes.glue
45
46local spaceskip_code     <const> = nodes.gluecodes.spaceskip
47local xspaceskip_code    <const> = nodes.gluecodes.xspaceskip
48
49local nuts               = nodes.nuts
50
51local getnext            = nuts.getnext
52local getprev            = nuts.getprev
53local getattr            = nuts.getattr
54local getid              = nuts.getid
55local getsubtype         = nuts.getsubtype
56local getchar            = nuts.getchar
57local isglyph            = nuts.isglyph
58
59local setlink            = nuts.setlink
60local setnext            = nuts.setnext
61local setprev            = nuts.setprev
62local setchar            = nuts.setchar
63local setattrlist        = nuts.setattrlist
64local setoptions         = nuts.setoptions
65
66local glyphoptioncodes       = tex.glyphoptioncodes
67
68local norightligature_option <const> = glyphoptioncodes.norightligature
69local noleftligature_option  <const> = glyphoptioncodes.noleftligature
70
71local insertbefore       = nuts.insertbefore
72local insertafter        = nuts.insertafter
73local remove_node        = nuts.remove
74local copy_node          = nuts.copy
75local flushlist          = nuts.flushlist
76
77local nodepool           = nuts.pool
78local new_disc           = nodepool.disc
79
80local texsetattribute    = tex.setattribute
81
82local unsetvalue         <const> = attributes.unsetvalue
83
84local enableaction       = nodes.tasks.enableaction
85
86local v_reset            <const> = interfaces.variables.reset
87
88local implement          = interfaces.implement
89
90local processors         = typesetters.processors
91local splitprocessor     = processors.split
92
93local replacements       = languages.replacements or { }
94languages.replacements   = replacements
95
96local a_replacements     <const> = attributes.private("replacements")
97
local lists = { }
local last  = 0
local trees = { }

-- Categories are created on demand: indexing 'lists' with an unknown key bumps
-- the counter, creates an empty character tree and registers the new entry under
-- both its number and the key that was asked for. The tree is also made available
-- by number in 'trees'.

table.setmetatableindex(lists,function(registry,category)
    local index = last + 1
    local tree  = { }
    local entry = {
        name      = category,
        list      = tree,
        attribute = index,
    }
    last               = index
    registry[index]    = entry
    registry[category] = entry
    trees[index]       = tree
    return entry
end)

-- This creates the reset category first and gives it the unset value as attribute.

lists[v_reset].attribute = unsetvalue -- so we discard 0
113
114-- todo: glue kern attr
115
-- Registers a word in a character tree.
--
-- root        : the tree (a table indexed by code point), nodes are created on demand
-- word        : the utf string that has to be matched
-- replacement : either a function (stored as 'replacer') or a string; a string is
--               passed through splitprocessor (which gives a processor and the
--               remaining replacement) and loses one pair of outer braces
--
-- The node that belongs to the last character of the word gets a 'final' table that
-- describes the match: word, oldlength (number of characters) and either the replacer
-- or replacement, processor, newcodes (the split replacement) and special (set when
-- the replacement contains a brace). Nothing is registered for an empty word.

local function add(root,word,replacement)
    local list = utfsplit(word) -- ,true)
    local size = #list
    if size == 0 then
        return
    end
    local final
    if type(replacement) == "function" then
        -- A replacer has no processor; there is no local with that name in scope here
        -- so we must not refer to it (that would pick up some global).
        final = {
            word      = word,
            replacer  = replacement,
            oldlength = size,
        }
    else
        local processor, new = splitprocessor(replacement,true) -- no check
        new = lpegmatch(stripper,new) or new
        final = {
            word        = word,
            replacement = new,
            processor   = processor,
            oldlength   = size,
            newcodes    = lpegmatch(splitter,new),
            special     = find(new,"{",1,true),
        }
    end
    -- walk (and extend) the tree, one level per character
    for i=1,size do
        local l    = utfbyte(list[i])
        local node = root[l]
        if not node then
            node    = { }
            root[l] = node
        end
        root = node
    end
    -- an already registered word gets overwritten
    root.final = final
end
161
-- Adds one or more replacements to a category. When 'word' is a table it is taken as
-- a mapping from words to replacements and the third argument is ignored, otherwise
-- a missing replacement defaults to the empty string.

function replacements.add(category,word,replacement)
    local root = lists[category].list
    if type(word) ~= "table" then
        add(root,word,replacement or "")
        return
    end
    for old, new in next, word do
        add(root,old,new)
    end
end
172
173-- local strip = lpeg.stripper("{}")
174
-- Adds a list of replacements to a category. The list is either an indexed table or
-- a string with whitespace separated entries. Each entry is the replacement; the
-- word to be matched is that entry with all braces removed.

function languages.replacements.addlist(category,list)
    local root  = lists[category].list
    local words = list
    if type(list) == "string" then
        -- collect the entries so that both input types share one loop
        words = { }
        for word in gmatch(list,"%S+") do
            words[#words+1] = word
        end
    end
    for i=1,#words do
        local new = words[i]
        local old = gsub(new,"[{}]","")
     -- local old = lpegmatch(strip,new)
        add(root,old,new)
    end
end
192
-- Builds a node list from a list of character codes. Each node is a copy of the
-- given template with its character set to the next code. The head of the new list
-- is returned (nil when the list is empty).

local function tonodes(list,template)
    local head, tail
    for i=1,#list do
        local glyph = copy_node(template)
        setchar(glyph,list[i])
        if not head then
            head, tail = glyph, glyph
        else
            head, tail = insertafter(head,tail,glyph)
        end
    end
    return head
end
206
207local ispunctuation = characters.is_punctuation
208
-- We can try to be clever and use the fact that there is no match to skip
-- over to the next word, but that gives fuzzy code, so for now I removed
-- that optimization. When I really need a high performance version myself
-- I will look into it (but so far I never used this mechanism myself).
--
-- We used to have the hit checker as a function but it got messy when checks
-- for punctuation were added.
216
-- Replaces the matched run of nodes first .. last by what 'final' describes.
--
-- head     : head of the list that contains the run
-- first    : first node of the match
-- last     : last node of the match
-- final    : the match description as stored by 'add': word and oldlength plus either
--            replacement, newcodes and special, or a replacer function that is called
--            with the word and has to return the replacement string
-- hasspace : set when the match includes a space glue
-- overload : optional function, called as overload(final,first,last) with the nodes
--            after the original predecessor and before the original successor
--
-- Returns the head of the list and the node that followed 'last' at the time of
-- the call. This is also what is returned when a replacer gives no string, in which
-- case the list is left untouched.

local function replace(head,first,last,final,hasspace,overload)
    local current     = first
    local prefirst    = getprev(first) or head
    local postlast    = getnext(last)
    local oldlength   = final.oldlength
    local newcodes    = final.newcodes
    local word        = final.word
    local replacement = final.replacement
    local replacer    = final.replacer
    local special     = final.special
    if type(replacer) == "function" then
        replacement = replacer(word)
        if type(replacement) == "string" then
            special  = find(replacement,"{",1,true)
            newcodes = lpegmatch(splitter,replacement)
        else
            -- Nothing to do, but the callers assign our results to their head (and
            -- current) so we have to hand back something useful.
            return head, postlast
        end
    end
    local newlength = newcodes and #newcodes or 0
    if trace_replacements then
        report_replacement("replacing word %a by %a",word,replacement)
    end
    if hasspace or special then
        -- It's easier to delete and insert so we do just that. On the todo list is
        -- turn injected spaces into glue but easier might be to let the char break
        -- handler do that ...
        local before = getprev(current)
        local after  = getnext(last)
        local list   = current
        -- cut the old run out of the list, it is flushed when we're done with it
        setnext(last)
        setlink(before,after)
        current = before
        if not current then
            -- NOTE(review): the match started the list; what happens to the nodes
            -- after the match then depends on insertafter with a nil head - verify.
            head = nil
        end
        for i=1,newlength do
            local codes = newcodes[i]
            if type(codes) == "table" then
                local method = codes[1]
                if method == "discretionary" then
                    local pre, post, rep = codes[2], codes[3], codes[4]
                    if pre then
                        pre = tonodes(pre,first)
                    end
                    if post then
                        post = tonodes(post,first)
                    end
                    if rep then
                        rep = tonodes(rep,first)
                    end
                    -- todo: also set attr
                    local new = new_disc(pre,post,rep)
                    setattrlist(new,first)
                    head, current = insertafter(head,current,new)
                elseif method == "noligature" then
                    -- not that efficient to copy but ok for testing
                    local chars = codes[2]
                    if chars then
                        local n = #chars
                        for j=1,n do
                            local new = copy_node(first)
                            setchar(new,chars[j])
                            if j == 1 then
                                setoptions(new,norightligature_option)
                            elseif j == n then
                                setoptions(new,noleftligature_option | norightligature_option)
                            else
                                setoptions(new,noleftligature_option)
                            end
                            head, current = insertafter(head,current,new)
                        end
                    else
                     -- local new = copy_node(first)
                     -- setchar(new,zwnj)
                     -- head, current = insertafter(head,current,new)
                        setoptions(current,norightligature_option)
                    end
                else
                    report_replacement("unknown method %a",method or "?")
                end
            else
                local new = copy_node(first)
                setchar(new,codes)
                head, current = insertafter(head,current,new)
            end
        end
        flushlist(list)
    elseif newlength == 0 then
        -- we overload
    elseif oldlength == newlength then
        -- same size: we only have to change the characters
        if word ~= replacement then
            for i=1,newlength do
                setchar(current,newcodes[i])
                current = getnext(current)
            end
        end
     -- current = getnext(final) -- some left over? no next!
    elseif oldlength < newlength then
        -- longer: inject the surplus as copies in front and then reuse the old nodes
        for i=1,newlength-oldlength do
            local n = copy_node(current)
            setchar(n,newcodes[i])
            head, current = insertbefore(head,current,n)
            current = getnext(current)
        end
        for i=newlength-oldlength+1,newlength do
            setchar(current,newcodes[i])
            current = getnext(current)
        end
    else
        -- shorter: remove the surplus and reuse what is left
        for i=1,oldlength-newlength do
            head, current = remove_node(head,current,true)
        end
        for i=1,newlength do
            setchar(current,newcodes[i])
            current = getnext(current)
        end
    end
    if overload then
        overload(final,getnext(prefirst),getprev(postlast))
    end
    return head, postlast
end
342
343-- we handle just one space
344
-- Runs over a node list and replaces the words that are registered for the category
-- that is set (by attribute) on the glyphs. Matching walks the character tree of that
-- category glyph by glyph; a space glue can be part of a match when the tree has an
-- entry for character 32, in which case a match that ended right before the space is
-- remembered as a fallback. A pending match is applied when we switch between word
-- and punctuation characters, when we run into a node that cannot be part of the
-- match, and at the end of the list. The (possibly new) head is returned.

function replacements.handler(head)
    local overload  = attributes.applyoverloads
    local current   = head
    local mode      = false -- false (not yet known), "word" or "punc"
    local wordstart = false
    local wordend   = false
    local prevend   = false -- end of the match before a space got included
    local prevfinal = false -- and the description that goes with it
    local tree      = false
    local root      = false -- where we are in the tree, false when not matching
    local hasspace  = false
    while current do
        local id = getid(current) -- or use the char getter
        if id == glyph_code then
            local a = getattr(current,a_replacements)
            if not a then
                tree = false
            else
                -- we have a run
                tree = trees[a]
                if not tree then
                    root = false
                else
                    local char    = getchar(current)
                    local newmode = ispunctuation[char] and "punc" or "word"
                    if mode ~= newmode then
                        -- a switch between word and punctuation ends a pending match
                        if mode and root then
                            local final = root.final
                            if final then
                                head = replace(head,wordstart,wordend,final,hasspace,overload)
                            elseif prevfinal then
                                head = replace(head,wordstart,prevend,prevfinal,hasspace,overload)
                            end
                            prevfinal = false
                            root      = false
                        end
                        mode = newmode
                    end
                    if root then
                        -- one step deeper
                        root = root[char]
                        if root then
                            wordend = current
                        end
                    else
                        if prevfinal then
                            head = replace(head,wordstart,prevend,prevfinal,hasspace,overload)
                            prevfinal = false
                        end
                        -- maybe a new match starts here
                        root = tree[char]
                        if root then
                            wordstart = current
                            wordend   = current
                            prevend   = false
                            hasspace  = false
                        end
                    end
                end
            end
        elseif root then
            local final    = root.final
            local extended = false
            if mode == "word" and id == glue_code then
                local s = getsubtype(current)
                if s == spaceskip_code or s == xspaceskip_code then
                    local r = root[32] -- maybe more types
                    if r then
                        -- the space is part of a longer candidate, so we remember
                        -- what we have so far (only once) and carry on
                        if not prevend and final then
                            prevend   = wordend
                            prevfinal = final
                        end
                        wordend  = current
                        root     = r
                        hasspace = true
                        extended = true
                    end
                end
            end
            if not extended then
                if final then
                    head, current = replace(head,wordstart,wordend,final,hasspace,overload)
                elseif prevfinal then
                    head, current = replace(head,wordstart,prevend,prevfinal,hasspace,overload)
                end
                prevfinal = false
                root      = false
            end
        end
        current = getnext(current)
    end
    if root then
        -- a match that runs up to the end of the list
        local final = root.final
        if final then
            head = replace(head,wordstart,wordend,final,hasspace,overload)
        elseif prevfinal then
            head = replace(head,wordstart,prevend,prevfinal,hasspace,overload)
        end
    end
    return head
end
466
local enabled = false

-- Selects the category that applies from now on by setting the replacements
-- attribute. The reset keyword unsets the attribute; any other name is looked up in
-- (and when needed added to) the registry. The handler is enabled the first time a
-- category gets selected.

function replacements.set(n)
    if n == v_reset then
        texsetattribute(a_replacements,unsetvalue)
        return
    end
    local attribute = lists[n].attribute
    if not enabled then
        enableaction("processors","languages.replacements.handler")
        if trace_replacements then
            report_replacement("enabling replacement handler")
        end
        enabled = true
    end
    texsetattribute(a_replacements,attribute)
end
484
485-- interface
486
-- The three entry points are registered with interfaces.implement, each with its
-- name, the function to call and an argument specification.

implement { name = "setreplacements",     actions = replacements.set,     arguments = "string"    }
implement { name = "addreplacements",     actions = replacements.add,     arguments = "3 strings" }
implement { name = "addreplacementslist", actions = replacements.addlist, arguments = "2 strings" }
504