char-utf.lua /size: 13 Kb    last modification: 2024-01-16 09:02
1if not modules then modules = { } end modules ['char-utf'] = {
2    version   = 1.001,
3    comment   = "companion to char-utf.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9-- When a sequence of UTF characters enters the application, it may be
-- necessary to collapse subsequences into their composed variant.
11--
12-- This module implements methods for collapsing and expanding UTF sequences. We
13-- also provide means to deal with characters that are special to TeX as well as
14-- 8-bit characters that need to end up in special kinds of output (for instance
15-- PDF).
16--
17-- We implement these manipulations as filters. One can run multiple filters over a
18-- string.
19--
20-- The old code has now been moved to char-obs.lua which we keep around for
21-- educational purposes.
22
23local next, type = next, type
24local gsub, find = string.gsub, string.find
25local concat, sortedhash, keys, sort = table.concat, table.sortedhash, table.keys, table.sort
26local utfchar, utfbyte, utfcharacters = utf.char, utf.byte, utf.characters
27local P, Cs, Cmt, Ct = lpeg.P, lpeg.Cs, lpeg.Cmt, lpeg.Ct
28
29if not characters        then require("char-def") end
30if not characters.blocks then require("char-ini") end
31
32local lpegmatch             = lpeg.match
33local lpegpatterns          = lpeg.patterns
34local p_utf8character       = lpegpatterns.utf8character
35local p_utf8byte            = lpegpatterns.utf8byte
36local utfchartabletopattern = lpeg.utfchartabletopattern
37
38local formatters            = string.formatters
39
40local allocate              = utilities.storage.allocate or function() return { } end
41local mark                  = utilities.storage.mark     or allocate
42
43local charfromnumber        = characters.fromnumber
44
45characters                  = characters or { }
46local characters            = characters
47
48local filters               = allocate()
49characters.filters          = filters
50
51local utffilters            = { }
52characters.filters.utf      = utffilters
53
54local data                  = characters.data
55
56-- It only makes sense to collapse at runtime, since we don't expect source code to
57-- depend on collapsing.
58
59-- for the moment, will be entries in char-def.lua .. this is just a subset that for
60-- typographic (font) reasons we want to have split ... if we decompose all, we get
61-- problems with fonts
62
-- Compatibility characters that get split into their components by
-- utffilters.decompose. Each key is a single composed character, each value
-- the sequence it is replaced by.
--
-- The keys are built from code points instead of literal characters, so that
-- an editor or conversion step that normalizes text cannot silently turn an
-- entry into an identity mapping (e.g. "IJ" -> "IJ") or into an empty key.
--
-- NOTE(review): the code points were derived from the replacement strings
-- (Latin and Armenian presentation-form ligatures) -- confirm against
-- char-def.lua.
local decomposed = allocate {
    [utfchar(0x0132)] = "IJ",  -- latin capital ligature ij
    [utfchar(0x0133)] = "ij",  -- latin small ligature ij
    [utfchar(0x0587)] = "եւ",  -- armenian small ligature ech yiwn
    [utfchar(0xFB00)] = "ff",  -- latin small ligature ff
    [utfchar(0xFB01)] = "fi",  -- latin small ligature fi
    [utfchar(0xFB02)] = "fl",  -- latin small ligature fl
    [utfchar(0xFB03)] = "ffi", -- latin small ligature ffi
    [utfchar(0xFB04)] = "ffl", -- latin small ligature ffl
    [utfchar(0xFB05)] = "ſt",  -- latin small ligature long s t
    [utfchar(0xFB06)] = "st",  -- latin small ligature st
    [utfchar(0xFB13)] = "մն",  -- armenian small ligature men now
    [utfchar(0xFB14)] = "մե",  -- armenian small ligature men ech
    [utfchar(0xFB15)] = "մի",  -- armenian small ligature men ini
    [utfchar(0xFB16)] = "վն",  -- armenian small ligature vew now
    [utfchar(0xFB17)] = "մխ",  -- armenian small ligature men xeh
}

characters.decomposed = decomposed
82
83local graphemes = characters.graphemes
84local collapsed = characters.collapsed
85local combined  = characters.combined
86local mathlists = characters.mathlists
87
88if graphemes then
89
90    mark(graphemes)
91    mark(collapsed)
92    mark(combined)
93    mark(mathlists)
94
95else
96
97    graphemes = allocate()
98    collapsed = allocate()
99    combined  = allocate()
100    mathlists = allocate()
101
102    characters.graphemes = graphemes
103    characters.collapsed = collapsed
104    characters.combined  = combined
105    characters.mathlists = mathlists
106
107    local function backtrack(v,last,target)
108        local vs = v.specials
109        if vs and #vs == 3 then
110            local kind = vs[1]
111            if  kind == "char" or kind == "with" then
112                local one = vs[2]
113                local two = vs[3]
114                local first  = utfchar(one)
115                local second = utfchar(two) .. last
116                collapsed[first..second] = target
117                backtrack(data[one],second,target)
118            end
119        end
120    end
121
122    local function setlist(unicode,list,start,category)
123        if list[start] ~= 0x20 then
124            local t = mathlists
125            for i=start,#list do
126                local l = list[i]
127                local f = t[l]
128                if f then
129                    t = f
130                else
131                    f = { }
132                    t[l] = f
133                    t = f
134                end
135            end
136            t[category] = unicode
137        end
138    end
139
140    local mlists = { }
141
142    for unicode, v in next, data do
143        local vs = v.specials
144        if vs then
145            local kind = vs[1]
146            local size = #vs
147            if kind == "char" or char == "with" then -- with added
148                if size == 3 then
149                    local one = vs[2]
150                    local two = vs[3]
151                    local first       = utfchar(one)
152                    local second      = utfchar(two)
153                    local combination = utfchar(unicode)
154                    --
155                    collapsed[first..second] = combination
156                    backtrack(data[one],second,combination)
157                    -- sort of obsolete:
158                    local cgf = graphemes[first]
159                    if not cgf then
160                        cgf = { [second] = combination }
161                        graphemes[first] = cgf
162                    else
163                        cgf[second] = combination
164                    end
165                end
166                if size > 2 and (v.mathclass or v.mathspec) then
167                    setlist(unicode,vs,2,"specials")
168                end
169            elseif kind == "with" then
170                if size == 3 then
171                 -- combined[utfchar(vs[2])..utfchar(vs[3])] = utfchar(unicode)
172                    combined[utfchar(vs[2],vs[3])] = utfchar(unicode)
173                end
174            elseif kind == "compat" then
175                if size == 3 then
176                 -- combined[utfchar(vs[2])..utfchar(vs[3])] = utfchar(unicode)
177                    combined[utfchar(vs[2],vs[3])] = utfchar(unicode)
178                end
179                if size > 2 and (v.mathclass or v.mathspec) then
180                    setlist(unicode,vs,2,"specials")
181                end
182            end
183        end
184        local ml = v.mathlist
185        if ml then
186            mlists[unicode] = ml
187        end
188    end
189
190    -- these win:
191
192    for unicode, ml in next, mlists do
193        setlist(unicode,ml,1,"mathlist")
194    end
195
196    mlists = nil
197
198    if storage then
199        storage.register("characters/graphemes", graphemes, "characters.graphemes")
200        storage.register("characters/collapsed", collapsed, "characters.collapsed")
201        storage.register("characters/combined",  combined,  "characters.combined")
202        storage.register("characters/mathlists", mathlists, "characters.mathlists")
203    end
204
205end
206
-- A dummy: this function takes no arguments, does nothing and returns nothing.
function characters.initialize()
end
208
209local skippable  = { }
210local filesuffix = file.suffix
211
-- Marks one file suffix, or each suffix in an indexed table of suffixes, in
-- the 'skippable' table. When 'value' is nil the suffix is set to true. The
-- collapse and reorder filters return their input untouched for files with
-- a suffix that is flagged this way.
function utffilters.setskippable(suffix,value)
    local flag = value
    if flag == nil then
        flag = true
    end
    if type(suffix) ~= "table" then
        skippable[suffix] = flag
        return
    end
    for index=1,#suffix do
        skippable[suffix[index]] = flag
    end
end
224
local p_collapse = nil -- built on first use, so we can reset if needed

-- Builds the collapse pattern: a substitution capture in which every sequence
-- that is a key in 'collapsed' is replaced by its value while any other utf
-- character is passed on. There is no end-of-input check here.
local function prepare()
 -- p_collapse = Cs((tree/collapsed + p_utf8character)^0 * P(-1))
    local tree = utfchartabletopattern(collapsed)
    local step = tree/collapsed + p_utf8character
    p_collapse = Cs(step^0)
end

-- Replaces the sequences known in 'collapsed' by their composed character.
--
-- str      : the string to filter; nil, "" and one byte strings are returned
--            as they are
-- filename : optional; when its suffix is flagged as skippable the string is
--            returned untouched
--
-- When the pattern does not match, the original string is returned.
function utffilters.collapse(str,filename)
    if not p_collapse then
        prepare()
    end
    local untouched = not str or str == "" or #str == 1
    if not untouched and filename then
        -- we could hash the collapsables or do a quicker test
        untouched = skippable[filesuffix(filename)]
    end
    if untouched then
        return str
    end
    return lpegmatch(p_collapse,str) or str
end
245
local p_combine = nil -- only for tex, built on first use

-- Builds the combine pattern: sequences that are a key in 'combined' are
-- replaced by their value, any other utf character is passed on.
local function prepare()
    local tree = utfchartabletopattern(combined)
    local step = tree/combined + p_utf8character
    p_combine = Cs(step^0)
end

-- Replaces the sequences known in 'combined' by their composed character.
-- This one is not meant for files, so there is no filename argument and no
-- skippable check. A nil, empty or one byte string is returned as it is, and
-- so is a string on which the pattern fails.
function utffilters.combine(str) -- not in files
    -- we could merge collapse into combine ... maybe
    if not p_combine then
        prepare()
    end
    if str and str ~= "" and #str ~= 1 then
        return lpegmatch(p_combine,str) or str
    end
    return str
end
264
local p_decompose = nil -- built on first use

-- Builds the decompose pattern: every character that is a key in 'decomposed'
-- is replaced by its value, any other utf character is passed on. The whole
-- input has to be consumed (P(-1)), otherwise the match fails.
local function prepare()
    local tree = utfchartabletopattern(decomposed)
    p_decompose = Cs((tree/decomposed + p_utf8character)^0 * P(-1))
end

-- Replaces the characters known in 'decomposed' by their components.
--
-- str      : the string to filter; nil, "" and one byte strings are returned
--            as they are
-- filename : optional; when its suffix is flagged as skippable (see
--            utffilters.setskippable) the string is returned untouched
--
-- There used to be an early 'return lpegmatch(p_decompose,str)' in front of
-- the tests below. It made the skippable check unreachable and returned nil
-- instead of the original string when the pattern failed (the pattern demands
-- that all input is consumed). Now this filter behaves like collapse and
-- reorder: it always gives back a string when it gets one.
function utffilters.decompose(str,filename)
    if not p_decompose then
        prepare()
    end
    if not str or str == "" or #str < 2 then
        return str
    elseif filename and skippable[filesuffix(filename)] then
        return str
    else
        return lpegmatch(p_decompose,str) or str
    end
end
288
289-- utffilters.addgrapheme(utfchar(318),'l','\string~')
290-- utffilters.addgrapheme('c','a','b')
291
-- Registers a user defined grapheme: the sequence first .. second collapses
-- into result.
--
-- result, first, second : can be U+ 0x string or utf or number; each of them
--                         is passed through characters.fromnumber
--
-- The entry in 'graphemes' is always (over)written. The pair is only added to
-- 'collapsed' when it is not known there yet; in that case the cached collapse
-- pattern is dropped, so that utffilters.collapse rebuilds it on its next call
-- and sees the new pair. (This used to assign nil to a global 'p_composed',
-- which left the cached pattern untouched.)
function utffilters.addgrapheme(result,first,second)
    local result = charfromnumber(result)
    local first  = charfromnumber(first)
    local second = charfromnumber(second)
    local known  = graphemes[first]
    if known then
        known[second] = result
    else
        graphemes[first] = { [second] = result }
    end
    local pair = first .. second
    if not collapsed[pair] then
        collapsed[pair] = result
        p_collapse = nil -- force a rebuild of the collapse pattern
    end
end
307
-- Only when the 'interfaces' table is present: make utffilters.addgrapheme
-- available under the name "addgrapheme", taking three string arguments.

if interfaces then -- eventually this goes to char-ctx.lua

    local implement = interfaces.implement

    implement {
        name      = "addgrapheme",
        arguments = "3 strings",
        actions   = utffilters.addgrapheme,
    }

end
317
318-- --
319
320local p_reorder = nil
321
322-- local sorter = function(a,b) return b[2] < a[2] end
323--
324-- local function swapper(s,p,t)
325--     local old = { }
326--     for i=1,#t do
327--         old[i] = t[i][1]
328--     end
329--     old = concat(old)
330--     sort(t,sorter)
331--     for i=1,#t do
332--         t[i] = t[i][1]
333--     end
334--     local new = concat(t)
335--     if old ~= new then
336--         print("reordered",old,"->",new)
337--     end
338--     return p, new
339-- end
340
-- -- the next one is not stable for similar weights
342
-- Order function: entries with a higher value in slot 2 come first.
local function sorter(a,b)
    return a[2] > b[2]
end

-- Match time helper. 't' is a list of entries that carry their string in
-- slot 1 and their sort key in slot 2. The list is sorted in place with
-- 'sorter', each entry is then replaced by its string, and the current
-- position 'p' is returned unchanged together with the concatenated strings.
-- The subject 's' is not used.
local function swapper(s,p,t)
    sort(t,sorter)
    for index, entry in ipairs(t) do
        t[index] = entry[1]
    end
    local reordered = concat(t)
    return p, reordered
end
354
355-- -- the next one keeps similar weights in the original order
356--
357-- local sorter = function(a,b)
358--     local b2, a2 = b[2], a[2]
359--     if a2 == b2 then
360--         return b[3] > a[3]
361--     else
362--         return b2 < a2
363--     end
364-- end
365--
366-- local function swapper(s,p,t)
367--     for i=1,#t do
368--         t[i][3] = i
369--     end
370--     sort(t,sorter)
371--     for i=1,#t do
372--         t[i] = t[i][1]
373--     end
374--     return p, concat(t)
375-- end
376
377-- at some point exceptions will become an option, for now it's an experiment
378-- to overcome bugs (that have become features) in unicode .. or we might decide
379-- for an extra ordering key in char-def that takes precedence over combining
380
-- Sequences that are replaced as a whole by utffilters.reorder, before the
-- regular reordering of combining characters is tried. Key and value are
-- built from code points: written as literal characters both sides look the
-- same and a text normalization step can make them equal, which would turn
-- the entry into a no-op.
local exceptions = {
    -- frozen unicode bug: U+64E .. U+651 => U+651 .. U+64E
    [utfchar(0x064E,0x0651)] = utfchar(0x0651,0x064E),
}
385
-- Builds the reorder pattern. For every character in characters.data that has
-- a 'combining' field an entry { utf string, combining value, 0 } is made.
-- The pattern then tries, in this order: a sequence from 'exceptions' (which
-- is replaced by its value), a run of at least two of these combining
-- characters (which is handed to 'swapper' as a list of entries), or any utf
-- character. The whole input has to be consumed.
local function prepare()
    local weights = { }
    for unicode, entry in sortedhash(characters.data) do
        local combining = entry.combining -- entry.ordering or entry.combining
        if combining then
            local character = utfchar(unicode)
            weights[character] = { character, combining, 0 } -- slot 3 can be used in sort
        end
    end
    local p_exception = utfchartabletopattern(exceptions)
    local p_combining = utfchartabletopattern(weights)
    local p_sequence  = Cmt(Ct((p_combining/weights)^2),swapper)
    p_reorder = Cs((p_exception/exceptions + p_sequence + p_utf8character)^0) * P(-1)
end
399
-- Applies the reorder pattern (built on first use) to a string.
--
-- str      : the string to filter; nil and strings shorter than two bytes are
--            returned as they are
-- filename : optional; when its suffix is flagged as skippable the string is
--            returned untouched
--
-- When the pattern does not match, the original string is returned.
function utffilters.reorder(str,filename)
    if not p_reorder then
        prepare()
    end
    if str and str ~= "" and #str >= 2 and not (filename and skippable[filesuffix(filename)]) then
        return lpegmatch(p_reorder,str) or str
    end
    return str
end
413
414-- local collapse   = utffilters.collapse
415-- local decompose  = utffilters.decompose
416-- local reorder    = utffilters.reorder
417--
418-- local c1, c2, c3 = "a", "̂", "̃"
419-- local r2, r3 = "â", "ẫ"
420-- local l1 = "ffl"
421--
422-- local str  = c1..c2..c3 .. " " .. c1..c2 .. " " .. l1
423-- local res  = r3 .. " " .. r2 .. " " .. "ffl"
424--
425-- local text  = io.loaddata("t:/sources/tufte.tex")
426--
427-- local function test(n)
428--     local data = text .. string.rep(str,100) .. text
429--     local okay = text .. string.rep(res,100) .. text
430--     local t = os.clock()
431--     for i=1,10000 do
432--         collapse(data)
433--         decompose(data)
434--      -- reorder(data)
435--     end
436--     print(os.clock()-t,decompose(collapse(data))==okay,decompose(collapse(str)))
437-- end
438--
439-- test(050)
440-- test(150)
441--
442-- local old = "foo" .. string.char(0xE1) .. "bar"
443-- local new = collapse(old)
444-- print(old,new)
445
446-- local one_old = "فَأَصَّدَّقَ دَّ" local one_new = utffilters.reorder(one_old)
447-- local two_old = "فَأَصَّدَّقَ دَّ" local two_new = utffilters.reorder(two_old)
448--
449-- print(one_old,two_old,one_old==two_old,false)
450-- print(one_new,two_new,one_new==two_new,true)
451--
452-- local test = "foo" .. utf.reverse("ؚ" .. "ً" .. "ٌ" .. "ٍ" .. "َ" .. "ُ" .. "ِ" .. "ّ" .. "ْ" ) .. "bar"
453-- local done = utffilters.reorder(test)
454--
455-- print(test,done,test==done,false)
456
local f_description = formatters["[%s] "]
local f_default     = formatters["[%U] "]

-- Turns a code point into a bracketed label: the 'description' field of its
-- entry in the character data when there is one, else the number itself
-- formatted with the "[%U] " formatter.
local function convert(n)
    local entry       = data[n]
    local description = entry and entry.description
    if description then
        return f_description(description)
    end
    return f_default(n)
end
469
-- One or more utf characters, each replaced by the label that 'convert'
-- makes for its code point.
local pattern = Cs((p_utf8byte / convert)^1)

-- Returns the string with every character replaced by its bracketed label.
-- An empty string is returned when there is no input or when the pattern
-- does not match.
function utffilters.verbose(data)
    if data then
        local result = lpegmatch(pattern,data)
        if result then
            return result
        end
    end
    return ""
end
475
476return characters
477