char-utf.lua /size: 13 Kb    last modification: 2021-10-28 13:50
1if not modules then modules = { } end modules ['char-utf'] = {
2    version   = 1.001,
3    comment   = "companion to char-utf.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9--[[ldx--
10<p>When a sequence of <l n='utf'/> characters enters the application, it may be
necessary to collapse subsequences into their composed variant.</p>
12
13<p>This module implements methods for collapsing and expanding <l n='utf'/>
14sequences. We also provide means to deal with characters that are special to
15<l n='tex'/> as well as 8-bit characters that need to end up in special kinds
16of output (for instance <l n='pdf'/>).</p>
17
18<p>We implement these manipulations as filters. One can run multiple filters
19over a string.</p>
20
21<p>The old code has now been moved to char-obs.lua which we keep around for
22educational purposes.</p>
23--ldx]]--
24
25local next, type = next, type
26local gsub, find = string.gsub, string.find
27local concat, sortedhash, keys, sort = table.concat, table.sortedhash, table.keys, table.sort
28local utfchar, utfbyte, utfcharacters = utf.char, utf.byte, utf.characters
29local P, Cs, Cmt, Ct = lpeg.P, lpeg.Cs, lpeg.Cmt, lpeg.Ct
30
31if not characters        then require("char-def") end
32if not characters.blocks then require("char-ini") end
33
34local lpegmatch             = lpeg.match
35local lpegpatterns          = lpeg.patterns
36local p_utf8character       = lpegpatterns.utf8character
37local p_utf8byte            = lpegpatterns.utf8byte
38local utfchartabletopattern = lpeg.utfchartabletopattern
39
40local formatters            = string.formatters
41
42local allocate              = utilities.storage.allocate or function() return { } end
43local mark                  = utilities.storage.mark     or allocate
44
45local charfromnumber        = characters.fromnumber
46
47characters                  = characters or { }
48local characters            = characters
49
50local filters               = allocate()
51characters.filters          = filters
52
53local utffilters            = { }
54characters.filters.utf      = utffilters
55
56local data                  = characters.data
57
58--[[ldx--
59<p>It only makes sense to collapse at runtime, since we don't expect source code
60to depend on collapsing.</p>
61--ldx]]--
62
63-- for the moment, will be entries in char-def.lua .. this is just a subset that for
64-- typographic (font) reasons we want to have split ... if we decompose all, we get
65-- problems with fonts
66
-- Ligature-like characters that are always split into their components (see the
-- remark above: this is a deliberately small subset, chosen for font reasons).
--
-- The keys are built from code points instead of being typed literally: the
-- presentation forms are easily dropped or normalized by editors and conversion
-- tools, which leaves empty or identity keys behind and silently disables the
-- decomposition.

local decomposed = allocate {
    [utfchar(0x0132)] = "IJ",  -- LATIN CAPITAL LIGATURE IJ
    [utfchar(0x0133)] = "ij",  -- LATIN SMALL LIGATURE IJ
    [utfchar(0x0587)] = "եւ",  -- ARMENIAN SMALL LIGATURE ECH YIWN
    [utfchar(0xFB00)] = "ff",  -- LATIN SMALL LIGATURE FF
    [utfchar(0xFB01)] = "fi",  -- LATIN SMALL LIGATURE FI
    [utfchar(0xFB02)] = "fl",  -- LATIN SMALL LIGATURE FL
    [utfchar(0xFB03)] = "ffi", -- LATIN SMALL LIGATURE FFI
    [utfchar(0xFB04)] = "ffl", -- LATIN SMALL LIGATURE FFL
    [utfchar(0xFB05)] = "ſt",  -- LATIN SMALL LIGATURE LONG S T
    [utfchar(0xFB06)] = "st",  -- LATIN SMALL LIGATURE ST
    [utfchar(0xFB13)] = "մն",  -- ARMENIAN SMALL LIGATURE MEN NOW
    [utfchar(0xFB14)] = "մե",  -- ARMENIAN SMALL LIGATURE MEN ECH
    [utfchar(0xFB15)] = "մի",  -- ARMENIAN SMALL LIGATURE MEN INI
    [utfchar(0xFB16)] = "վն",  -- ARMENIAN SMALL LIGATURE VEW NOW
    [utfchar(0xFB17)] = "մխ",  -- ARMENIAN SMALL LIGATURE MEN XEH
}

characters.decomposed = decomposed
86
87local graphemes = characters.graphemes
88local collapsed = characters.collapsed
89local combined  = characters.combined
90local mathlists = characters.mathlists
91
92if graphemes then
93
94    mark(graphemes)
95    mark(collapsed)
96    mark(combined)
97    mark(mathlists)
98
99else
100
101    graphemes = allocate()
102    collapsed = allocate()
103    combined  = allocate()
104    mathlists = allocate()
105
106    characters.graphemes = graphemes
107    characters.collapsed = collapsed
108    characters.combined  = combined
109    characters.mathlists = mathlists
110
111    local function backtrack(v,last,target)
112        local vs = v.specials
113        if vs and #vs == 3 then
114            local kind = vs[1]
115            if  kind == "char" or kind == "with" then
116                local one = vs[2]
117                local two = vs[3]
118                local first  = utfchar(one)
119                local second = utfchar(two) .. last
120                collapsed[first..second] = target
121                backtrack(data[one],second,target)
122            end
123        end
124    end
125
126    local function setlist(unicode,list,start,category)
127        if list[start] ~= 0x20 then
128            local t = mathlists
129            for i=start,#list do
130                local l = list[i]
131                local f = t[l]
132                if f then
133                    t = f
134                else
135                    f = { }
136                    t[l] = f
137                    t = f
138                end
139            end
140            t[category] = unicode
141        end
142    end
143
144    local mlists = { }
145
146    for unicode, v in next, data do
147        local vs = v.specials
148        if vs then
149            local kind = vs[1]
150            local size = #vs
151            if kind == "char" or char == "with" then -- with added
152                if size == 3 then
153                    local one = vs[2]
154                    local two = vs[3]
155                    local first       = utfchar(one)
156                    local second      = utfchar(two)
157                    local combination = utfchar(unicode)
158                    --
159                    collapsed[first..second] = combination
160                    backtrack(data[one],second,combination)
161                    -- sort of obsolete:
162                    local cgf = graphemes[first]
163                    if not cgf then
164                        cgf = { [second] = combination }
165                        graphemes[first] = cgf
166                    else
167                        cgf[second] = combination
168                    end
169                end
170                if size > 2 and (v.mathclass or v.mathspec) then
171                    setlist(unicode,vs,2,"specials")
172                end
173            elseif kind == "with" then
174                if size == 3 then
175                    combined[utfchar(vs[2])..utfchar(vs[3])] = utfchar(unicode)
176                end
177            elseif kind == "compat" then
178                if size == 3 then
179                    combined[utfchar(vs[2])..utfchar(vs[3])] = utfchar(unicode)
180                end
181                if size > 2 and (v.mathclass or v.mathspec) then
182                    setlist(unicode,vs,2,"specials")
183                end
184            end
185        end
186        local ml = v.mathlist
187        if ml then
188            mlists[unicode] = ml
189        end
190    end
191
192    -- these win:
193
194    for unicode, ml in next, mlists do
195        setlist(unicode,ml,1,"mathlist")
196    end
197
198    mlists = nil
199
200    if storage then
201        storage.register("characters/graphemes", graphemes, "characters.graphemes")
202        storage.register("characters/collapsed", collapsed, "characters.collapsed")
203        storage.register("characters/combined",  combined,  "characters.combined")
204        storage.register("characters/mathlists", mathlists, "characters.mathlists")
205    end
206
207end
208
-- Kept as a no-op so that existing calls to characters.initialize keep working.
characters.initialize = function() end -- dummy
210
211local skippable  = { }
212local filesuffix = file.suffix
213
-- Mark one suffix (a string) or several suffixes (an array of strings) as
-- skippable for the filters that accept a filename. The flag defaults to true
-- when 'value' is nil; passing false clears the mark again.
function utffilters.setskippable(suffix,value)
    local flag = value
    if flag == nil then
        flag = true
    end
    if type(suffix) ~= "table" then
        skippable[suffix] = flag
        return
    end
    for index=1,#suffix do
        skippable[suffix[index]] = flag
    end
end
226
227local p_collapse = nil -- so we can reset if needed
228
-- Build the substitution pattern for utffilters.collapse: every key of
-- 'collapsed' is replaced by its value, any other utf character is copied.
-- There is no end-of-input anchor (the anchored variant was
-- Cs((tree/collapsed + p_utf8character)^0 * P(-1))).
local function prepare()
    local replacer = utfchartabletopattern(collapsed) / collapsed
    p_collapse = Cs((replacer + p_utf8character)^0)
end
234
-- Replace the sequences registered in 'collapsed' by their composed character.
-- Nil, empty and one-byte strings are returned untouched, and so are strings
-- that come from a file with a skippable suffix. When the pattern does not
-- match, the original string is returned.
function utffilters.collapse(str,filename)
    if not p_collapse then
        prepare()
    end
    local trivial = not str or str == "" or #str == 1
    -- we could hash the collapsables or do a quicker test
    if trivial or (filename and skippable[filesuffix(filename)]) then
        return str
    end
    return lpegmatch(p_collapse,str) or str
end
247
248local p_combine = nil -- only for tex
249
-- Build the substitution pattern for utffilters.combine: every key of
-- 'combined' is replaced by its value, any other utf character is copied.
local function prepare()
    local replacer = utfchartabletopattern(combined) / combined
    p_combine = Cs((replacer + p_utf8character)^0)
end
254
-- Replace the sequences registered in 'combined' by their single character
-- (not in files). Nil, empty and one-byte strings are returned untouched; when
-- the pattern does not match, the original string is returned.
function utffilters.combine(str)
    -- we could merge collapse into combine ... maybe
    if not p_combine then
        prepare()
    end
    if str and str ~= "" and #str ~= 1 then
        return lpegmatch(p_combine,str) or str
    end
    return str
end
266
267local p_decompose = nil
268
-- Build the substitution pattern for utffilters.decompose: every key of
-- 'decomposed' is replaced by its value, any other utf character is copied.
-- The pattern is anchored: it only matches when the whole string is consumed.
local function prepare()
    local replacer = utfchartabletopattern(decomposed) / decomposed
    p_decompose = Cs((replacer + p_utf8character)^0 * P(-1))
end
273
-- Replace the characters listed in 'decomposed' by their expansion.
--
--   str      : the string to process (nil is passed through)
--   filename : optional; when its suffix has been registered with
--              utffilters.setskippable the string is returned untouched
--
-- Returns the processed string. Nil, empty and one-byte strings are returned as
-- they are, and so is any string the anchored pattern cannot consume completely
-- (for instance invalid utf).
--
-- An earlier version returned the raw match result before any of these checks
-- ran: the skippable suffixes were ignored and a failed match returned nil
-- instead of the input. The flow now mirrors utffilters.reorder.

function utffilters.decompose(str,filename)
    if not p_decompose then
        prepare()
    end
    if not str or str == "" or #str < 2 then
        return str
    elseif filename and skippable[filesuffix(filename)] then
        return str
    else
        return lpegmatch(p_decompose,str) or str
    end
end
290
291-- utffilters.addgrapheme(utfchar(318),'l','\string~')
292-- utffilters.addgrapheme('c','a','b')
293
-- Register an extra composition: 'first' followed by 'second' becomes 'result'.
-- Each argument is passed through characters.fromnumber first (can be U+ 0x
-- string or utf or number).
--
-- The pair is always stored in 'graphemes'. It is only added to 'collapsed'
-- when that sequence has no entry yet; in that case the cached collapse pattern
-- is dropped so that the next call to utffilters.collapse rebuilds it and sees
-- the new pair.

function utffilters.addgrapheme(result,first,second)
    local result = charfromnumber(result)
    local first  = charfromnumber(first)
    local second = charfromnumber(second)
    local known  = graphemes[first]
    if not known then
        graphemes[first] = { [second] = result }
    else
        known[second] = result
    end
    local pair = first .. second
    if not collapsed[pair] then
        collapsed[pair] = result
        -- this used to assign to an undeclared (global) p_composed, which left
        -- the already built collapse pattern in place
        p_collapse = nil
    end
end
309
-- Register addgrapheme through interfaces.implement, but only when the
-- interfaces table is available (eventually this goes to char-ctx.lua).
if interfaces then

    local implement = interfaces.implement

    implement {
        name      = "addgrapheme",
        arguments = "3 strings",
        actions   = utffilters.addgrapheme,
    }

end
319
320-- --
321
322local p_reorder = nil
323
324-- local sorter = function(a,b) return b[2] < a[2] end
325--
326-- local function swapper(s,p,t)
327--     local old = { }
328--     for i=1,#t do
329--         old[i] = t[i][1]
330--     end
331--     old = concat(old)
332--     sort(t,sorter)
333--     for i=1,#t do
334--         t[i] = t[i][1]
335--     end
336--     local new = concat(t)
337--     if old ~= new then
338--         print("reordered",old,"->",new)
339--     end
340--     return p, new
341-- end
342
-- -- the next one is not stable for similar weights
344
-- Order function for table.sort: entries are { character, weight, ... } and
-- the entry with the larger weight comes first.
local sorter = function(a,b)
    local weight_a, weight_b = a[2], b[2]
    return weight_a > weight_b
end
348
-- Match-time function (used with Cmt): 't' holds the captured entries of a run.
-- The entries are sorted with 'sorter', replaced in place by their first slot
-- and joined; the position 'p' is returned unchanged together with that string.
local function swapper(s,p,t)
    sort(t,sorter)
    for index=#t,1,-1 do
        local entry = t[index]
        t[index] = entry[1]
    end
    local reordered = concat(t)
    return p, reordered
end
356
357-- -- the next one keeps similar weights in the original order
358--
359-- local sorter = function(a,b)
360--     local b2, a2 = b[2], a[2]
361--     if a2 == b2 then
362--         return b[3] > a[3]
363--     else
364--         return b2 < a2
365--     end
366-- end
367--
368-- local function swapper(s,p,t)
369--     for i=1,#t do
370--         t[i][3] = i
371--     end
372--     sort(t,sorter)
373--     for i=1,#t do
374--         t[i] = t[i][1]
375--     end
376--     return p, concat(t)
377-- end
378
379-- at some point exceptions will become an option, for now it's an experiment
380-- to overcome bugs (that have become features) in unicode .. or we might decide
381-- for an extra ordering key in char-def that takes precedence over combining
382
383local exceptions = {
384    -- frozen unicode bug
385    ["َّ"] = "َّ", -- U+64E .. U+651 => U+651 .. U+64E
386}
387
-- Build the pattern for utffilters.reorder. Every character with a 'combining'
-- field in characters.data gets an entry { char, combining, 0 }. The pattern
-- tries, in this order: a sequence from 'exceptions' (replaced by its value), a
-- run of two or more such characters (handed to 'swapper'), any other utf
-- character (copied). The whole string has to be consumed.
local function prepare()
    local weights = { }
    for unicode, entry in sortedhash(characters.data) do
        local class = entry.combining -- entry.ordering or entry.combining
        if class then
            local char = utfchar(unicode)
            weights[char] = { char, class, 0 } -- slot 3 can be used in sort
        end
    end
    local p_exception = utfchartabletopattern(exceptions) / exceptions
    local p_mark      = utfchartabletopattern(weights) / weights
    local p_run       = Cmt(Ct(p_mark^2),swapper)
    p_reorder = Cs((p_exception + p_run + p_utf8character)^0) * P(-1)
end
401
-- Apply the reorder pattern to 'str'. Nil, empty and one-byte strings are
-- returned untouched, and so are strings that come from a file with a skippable
-- suffix. When the pattern does not match, the original string is returned.
function utffilters.reorder(str,filename)
    if not p_reorder then
        prepare()
    end
    local trivial = not str or str == "" or #str < 2
    if trivial or (filename and skippable[filesuffix(filename)]) then
        return str
    end
    return lpegmatch(p_reorder,str) or str
end
415
416-- local collapse   = utffilters.collapse
417-- local decompose  = utffilters.decompose
418-- local reorder    = utffilters.reorder
419--
420-- local c1, c2, c3 = "a", "̂", "̃"
421-- local r2, r3 = "â", "ẫ"
422-- local l1 = "ffl"
423--
424-- local str  = c1..c2..c3 .. " " .. c1..c2 .. " " .. l1
425-- local res  = r3 .. " " .. r2 .. " " .. "ffl"
426--
427-- local text  = io.loaddata("t:/sources/tufte.tex")
428--
429-- local function test(n)
430--     local data = text .. string.rep(str,100) .. text
431--     local okay = text .. string.rep(res,100) .. text
432--     local t = os.clock()
433--     for i=1,10000 do
434--         collapse(data)
435--         decompose(data)
436--      -- reorder(data)
437--     end
438--     print(os.clock()-t,decompose(collapse(data))==okay,decompose(collapse(str)))
439-- end
440--
441-- test(050)
442-- test(150)
443--
444-- local old = "foo" .. string.char(0xE1) .. "bar"
445-- local new = collapse(old)
446-- print(old,new)
447
448-- local one_old = "فَأَصَّدَّقَ دَّ" local one_new = utffilters.reorder(one_old)
449-- local two_old = "فَأَصَّدَّقَ دَّ" local two_new = utffilters.reorder(two_old)
450--
451-- print(one_old,two_old,one_old==two_old,false)
452-- print(one_new,two_new,one_new==two_new,true)
453--
454-- local test = "foo" .. utf.reverse("ؚ" .. "ً" .. "ٌ" .. "ٍ" .. "َ" .. "ُ" .. "ِ" .. "ّ" .. "ْ" ) .. "bar"
455-- local done = utffilters.reorder(test)
456--
457-- print(test,done,test==done,false)
458
459local f_default     = formatters["[%U] "]
460local f_description = formatters["[%s] "]
461
-- Turn a character number into a bracketed label: the 'description' field of
-- its data entry when there is one, otherwise the number formatted with
-- f_default.
local function convert(n)
    local entry       = data[n]
    local description = entry and entry.description
    if not description then
        return f_default(n)
    end
    return f_description(description)
end
471
472local pattern = Cs((p_utf8byte / convert)^1)
473
-- Return 'data' with every character replaced by its bracketed label (see
-- convert). An empty string is returned for nil or false input and when the
-- pattern does not match.
function utffilters.verbose(data)
    if not data then
        return ""
    end
    return lpegmatch(pattern,data) or ""
end
477
478return characters
479