lang-rep.lua /size: 15 Kb    last modification: 2021-10-28 13:50
1if not modules then modules = { } end modules ['lang-rep'] = {
2    version   = 1.001,
3    comment   = "companion to lang-rep.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9-- A BachoTeX 2013 experiment, probably not that useful. Eventually I used a simpler
10-- more generic example. I'm sure no one ever notices of even needs this code.
11--
12-- As a follow up on a question by Alan about special treatment of dropped caps I wonder
13-- if I can make this one more clever (probably in a few more dev steps). For instance
14-- injecting nodes or replacing nodes. It's a prelude to a kind of lpeg for nodes,
15-- although (given experiences so far) we don't really need that. After all, each problem
16-- is somewhat unique.
17
18local type, tonumber, next = type, tonumber, next
19local gmatch, gsub = string.gmatch, string.gsub
20local utfbyte, utfsplit = utf.byte, utf.split
21local P, C, U, Cc, Ct, Cs, lpegmatch = lpeg.P, lpeg.C, lpeg.patterns.utf8character, lpeg.Cc, lpeg.Ct, lpeg.Cs, lpeg.match
22local find = string.find
23
24local zwnj     =  0x200C
25local grouped  = P("{") * ( Ct((U/utfbyte-P("}"))^1) + Cc(false) ) * P("}")-- grouped
26local splitter = Ct((
27                    #P("{") * (
28                        P("{}") / function() return zwnj end
29                      + Ct(Cc("discretionary") * grouped * grouped * grouped)
30                      + Ct(Cc("noligature")    * grouped)
31                    )
32                  + U/utfbyte
33                )^1)
34
35local stripper = P("{") * Cs((1-P(-2))^0) * P("}") * P(-1)
36
37local trace_replacements = false  trackers.register("languages.replacements",         function(v) trace_replacements = v end)
38local trace_details      = false  trackers.register("languages.replacements.details", function(v) trace_details      = v end)
39
40local report_replacement = logs.reporter("languages","replacements")
41
42local glyph_code         = nodes.nodecodes.glyph
43local glue_code          = nodes.nodecodes.glue
44
45local spaceskip_code     = nodes.gluecodes.spaceskip
46local xspaceskip_code    = nodes.gluecodes.xspaceskip
47
48local nuts               = nodes.nuts
49
50local getnext            = nuts.getnext
51local getprev            = nuts.getprev
52local getattr            = nuts.getattr
53local getid              = nuts.getid
54local getsubtype         = nuts.getsubtype
55local getchar            = nuts.getchar
56local isglyph            = nuts.isglyph
57
58local setattr            = nuts.setattr
59local setlink            = nuts.setlink
60local setnext            = nuts.setnext
61local setprev            = nuts.setprev
62local setchar            = nuts.setchar
63local setattrlist        = nuts.setattrlist
64
65local insertbefore       = nuts.insertbefore
66local insertafter        = nuts.insertafter
67local remove_node        = nuts.remove
68local copy_node          = nuts.copy
69local flushlist          = nuts.flushlist
70
71local nodepool           = nuts.pool
72local new_disc           = nodepool.disc
73
74local texsetattribute    = tex.setattribute
75local unsetvalue         = attributes.unsetvalue
76
77local enableaction       = nodes.tasks.enableaction
78
79local v_reset            = interfaces.variables.reset
80
81local implement          = interfaces.implement
82
83local processors         = typesetters.processors
84local splitprocessor     = processors.split
85
86local replacements       = languages.replacements or { }
87languages.replacements   = replacements
88
89local a_replacements     = attributes.private("replacements")
90local a_noligature       = attributes.private("noligature")      -- to be adapted to lmtx !
91
92local lists = { }
93local last  = 0
94local trees = { }
95
96table.setmetatableindex(lists,function(lists,name)
97    last = last + 1
98    local list = { }
99    local data = { name = name, list = list, attribute = last }
100    lists[last] = data
101    lists[name] = data
102    trees[last] = list
103    return data
104end)
105
106lists[v_reset].attribute = unsetvalue -- so we discard 0
107
108-- todo: glue kern attr
109
110local function add(root,word,replacement)
111    local processor, replacement = splitprocessor(replacement,true) -- no check
112    replacement = lpegmatch(stripper,replacement) or replacement
113    local list = utfsplit(word) -- ,true)
114    local size = #list
115    for i=1,size do
116        local l = utfbyte(list[i])
117        if not root[l] then
118            root[l] = { }
119        end
120        if i == size then
121            local special = find(replacement,"{",1,true)
122            local newlist = lpegmatch(splitter,replacement)
123            root[l].final = {
124                word        = word,
125                replacement = replacement,
126                processor   = processor,
127                oldlength   = size,
128                newcodes    = newlist,
129                special     = special,
130            }
131        end
132        root = root[l]
133    end
134end
135
136function replacements.add(category,word,replacement)
137    local root = lists[category].list
138    if type(word) == "table" then
139        for word, replacement in next, word do
140            add(root,word,replacement)
141        end
142    else
143        add(root,word,replacement or "")
144    end
145end
146
147-- local strip = lpeg.stripper("{}")
148
149function languages.replacements.addlist(category,list)
150    local root = lists[category].list
151    if type(list) == "string" then
152        for new in gmatch(list,"%S+") do
153            local old = gsub(new,"[{}]","")
154         -- local old = lpegmatch(strip,new)
155            add(root,old,new)
156        end
157    else
158        for i=1,#list do
159            local new = list[i]
160            local old = gsub(new,"[{}]","")
161         -- local old = lpegmatch(strip,new)
162            add(root,old,new)
163        end
164    end
165end
166
167local function tonodes(list,template)
168    local head, current
169    for i=1,#list do
170        local new = copy_node(template)
171        setchar(new,list[i])
172        if head then
173            head, current = insertafter(head,current,new)
174        else
175            head, current = new, new
176        end
177    end
178    return head
179end
180
181local is_punctuation = characters.is_punctuation
182
183-- We can try to be clever and use the fact that there is no match to skip
184-- over to the next word but it is gives fuzzy code so for now I removed
185-- that optimization (when I really need a high performance version myself
186-- I will look into it (but so far I never used this mechanism myself).
187--
188-- We used to have the hit checker as function but is got messy when checks
189-- for punctuation was added.
190
191local function replace(head,first,last,final,hasspace,overload)
192    local current   = first
193    local prefirst  = getprev(first) or head
194    local postlast  = getnext(last)
195    local oldlength = final.oldlength
196    local newcodes  = final.newcodes
197    local newlength = newcodes and #newcodes or 0
198    if trace_replacements then
199        report_replacement("replacing word %a by %a",final.word,final.replacement)
200    end
201    if hasspace or final.special then
202        -- It's easier to delete and insert so we do just that. On the todo list is
203        -- turn injected spaces into glue but easier might be to let the char break
204        -- handler do that ...
205        local prev = getprev(current)
206        local next = getnext(last)
207        local list = current
208        setnext(last)
209        setlink(prev,next)
210        current = prev
211        if not current then
212            head = nil
213        end
214        local i = 1
215        while i <= newlength do
216            local codes = newcodes[i]
217            if type(codes) == "table" then
218                local method = codes[1]
219                if method == "discretionary" then
220                    local pre, post, replace = codes[2], codes[3], codes[4]
221                    if pre then
222                        pre = tonodes(pre,first)
223                    end
224                    if post then
225                        post = tonodes(post,first)
226                    end
227                    if replace then
228                        replace = tonodes(replace,first)
229                    end
230                    -- todo: also set attr
231                    local new = new_disc(pre,post,replace)
232                    setattrlist(new,first)
233                    head, current = insertafter(head,current,new)
234                elseif method == "noligature" then
235                    -- not that efficient to copy but ok for testing
236                    local list = codes[2]
237                    if list then
238                        for i=1,#list do
239                            local new = copy_node(first)
240                            setchar(new,list[i])
241                            setattr(new,a_noligature,1)
242                            head, current = insertafter(head,current,new)
243                        end
244                    else
245                        local new = copy_node(first)
246                        setchar(new,zwnj)
247                        head, current = insertafter(head,current,new)
248                    end
249                else
250                    report_replacement("unknown method %a",method or "?")
251                end
252            else
253                local new = copy_node(first)
254                setchar(new,codes)
255                head, current = insertafter(head,current,new)
256            end
257            i = i + 1
258        end
259        flushlist(list)
260    elseif newlength == 0 then
261        -- we overload
262    elseif oldlength == newlength then
263        if final.word ~= final.replacement then
264            for i=1,newlength do
265                setchar(current,newcodes[i])
266                current = getnext(current)
267            end
268        end
269        current = getnext(final)
270    elseif oldlength < newlength then
271        for i=1,newlength-oldlength do
272            local n = copy_node(current)
273            setchar(n,newcodes[i])
274            head, current = insertbefore(head,current,n)
275            current = getnext(current)
276        end
277        for i=newlength-oldlength+1,newlength do
278            setchar(current,newcodes[i])
279            current = getnext(current)
280        end
281    else
282        for i=1,oldlength-newlength do
283            head, current = remove_node(head,current,true)
284        end
285        for i=1,newlength do
286            setchar(current,newcodes[i])
287            current = getnext(current)
288        end
289    end
290    if overload then
291        overload(final,getnext(prefirst),getprev(postlast))
292    end
293    return head, postlast
294end
295
296-- we handle just one space
297
298function replacements.handler(head)
299    local current   = head
300    local overload  = attributes.applyoverloads
301    local mode      = false -- we're in word or punctuation mode
302    local wordstart = false
303    local wordend   = false
304    local prevend   = false
305    local prevfinal = false
306    local tree      = false
307    local root      = false
308    local hasspace  = false
309    while current do
310        local id = getid(current) -- or use the char getter
311        if id == glyph_code then
312            local a = getattr(current,a_replacements)
313            if a then
314                -- we have a run
315                tree = trees[a]
316                if tree then
317                    local char = getchar(current)
318                    local punc = is_punctuation[char]
319                    if mode == "punc" then
320                        if not punc then
321                            if root then
322                                local final = root.final
323                                if final then
324                                    head = replace(head,wordstart,wordend,final,hasspace,overload)
325                                elseif prevfinal then
326                                    head = replace(head,wordstart,prevend,prevfinal,hasspace,overload)
327                                end
328                                prevfinal = false
329                                root = false
330                            end
331                            mode = "word"
332                        end
333                    elseif mode == "word" then
334                        if punc then
335                            if root then
336                                local final = root.final
337                                if final then
338                                    head = replace(head,wordstart,wordend,final,hasspace,overload)
339                                elseif prevfinal then
340                                    head = replace(head,wordstart,prevend,prevfinal,hasspace,overload)
341                                end
342                                prevfinal = false
343                                root = false
344                            end
345                            mode = "punc"
346                        end
347                    else
348                        mode = punc and "punc" or "word"
349                    end
350                    if root then
351                        root = root[char]
352                        if root then
353                            wordend = current
354                        end
355                    else
356                        if prevfinal then
357                            head = replace(head,wordstart,prevend,prevfinal,hasspace,overload)
358                            prevfinal = false
359                        end
360                        root = tree[char]
361                        if root then
362                            wordstart = current
363                            wordend   = current
364                            prevend   = false
365                            hasspace  = false
366                        end
367                    end
368                else
369                    root= false
370                end
371            else
372                tree = false
373            end
374            current = getnext(current)
375        elseif root then
376            local final = root.final
377            if mode == "word" and id == glue_code then
378                local s = getsubtype(current)
379                if s == spaceskip_code or s == xspaceskip_code then
380                    local r = root[32] -- maybe more types
381                    if r then
382                        if not prevend then
383                            local f = root.final
384                            if f then
385                                prevend   = wordend
386                                prevfinal = f
387                            end
388                        end
389                        wordend  = current
390                        root     = r
391                        hasspace = true
392                        goto moveon
393                    end
394                end
395            end
396            if final then
397                head, current = replace(head,wordstart,wordend,final,hasspace,overload)
398            elseif prevfinal then
399                head, current = replace(head,wordstart,prevend,prevfinal,hasspace,overload)
400            end
401            prevfinal = false
402            root = false
403          ::moveon::
404            current = getnext(current)
405        else
406            current = getnext(current)
407        end
408    end
409    if root then
410        local final = root.final
411        if final then
412            head = replace(head,wordstart,wordend,final,hasspace,overload)
413        elseif prevfinal then
414            head = replace(head,wordstart,prevend,prevfinal,hasspace,overload)
415        end
416    end
417    return head
418end
419
420local enabled = false
421
422function replacements.set(n)
423    if n == v_reset then
424        n = unsetvalue
425    else
426        n = lists[n].attribute
427        if not enabled then
428            enableaction("processors","languages.replacements.handler")
429            if trace_replacements then
430                report_replacement("enabling replacement handler")
431            end
432            enabled = true
433        end
434    end
435    texsetattribute(a_replacements,n)
436end
437
438-- interface
439
440implement {
441    name      = "setreplacements",
442    actions   = replacements.set,
443    arguments = "string"
444}
445
446implement {
447    name      = "addreplacements",
448    actions   = replacements.add,
449    arguments = "3 strings",
450}
451
452implement {
453    name      = "addreplacementslist",
454    actions   = replacements.addlist,
455    arguments = "2 strings",
456}
457