mtx-unicode.lua /size: 35 Kb    last modification: 2021-10-28 13:50
-- Register this script in the global modules table, creating that table when
-- this is the first script that registers itself.

modules = modules or { }

modules['mtx-unicode'] = {
    version   = 1.002,
    comment   = "companion to mtxrun.lua",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9-- This is very old code that I started writing in 2005 but occasionally
10-- extended. Don't use it yourself, it's just a sort of reference. The
11-- data that we use in ConTeXt is more extensive.
12--
13-- In my local tree I keep files in places like this:
14--
15--    e:/tex-context/tex/texmf-local/data/unicode/blocks.txt
16--
17-- curl -o arabicshaping.txt             http://www.unicode.org/Public/UNIDATA/ArabicShaping.txt
18-- curl -o bidibrackets.txt              http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
19-- curl -o bidicharactertest.txt         http://www.unicode.org/Public/UNIDATA/BidiCharacterTest.txt
20-- curl -o bidimirroring.txt             http://www.unicode.org/Public/UNIDATA/BidiMirroring.txt
21-- curl -o biditest.txt                  http://www.unicode.org/Public/UNIDATA/BidiTest.txt
22-- curl -o blocks.txt                    http://www.unicode.org/Public/UNIDATA/Blocks.txt
23-- curl -o cjkradicals.txt               http://www.unicode.org/Public/UNIDATA/CJKRadicals.txt
24-- curl -o casefolding.txt               http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
25-- curl -o compositionexclusions.txt     http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
26-- curl -o derivedage.txt                http://www.unicode.org/Public/UNIDATA/DerivedAge.txt
27-- curl -o derivedcoreproperties.txt     http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
28-- curl -o derivednormalizationprops.txt http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
29-- curl -o eastasianwidth.txt            http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
30-- curl -o emojisources.txt              http://www.unicode.org/Public/UNIDATA/EmojiSources.txt
31-- curl -o hangulsyllabletype.txt        http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt
32-- curl -o index.txt                     http://www.unicode.org/Public/UNIDATA/Index.txt
33-- curl -o indicpositionalcategory.txt   http://www.unicode.org/Public/UNIDATA/IndicPositionalCategory.txt
34-- curl -o indicsyllabiccategory.txt     http://www.unicode.org/Public/UNIDATA/IndicSyllabicCategory.txt
35-- curl -o jamo.txt                      http://www.unicode.org/Public/UNIDATA/Jamo.txt
36-- curl -o linebreak.txt                 http://www.unicode.org/Public/UNIDATA/LineBreak.txt
37-- curl -o namealiases.txt               http://www.unicode.org/Public/UNIDATA/NameAliases.txt
38-- curl -o namedsequences.txt            http://www.unicode.org/Public/UNIDATA/NamedSequences.txt
39-- curl -o namedsequencesprov.txt        http://www.unicode.org/Public/UNIDATA/NamedSequencesProv.txt
40-- curl -o nameslist.html                http://www.unicode.org/Public/UNIDATA/NamesList.html
41-- curl -o nameslist.txt                 http://www.unicode.org/Public/UNIDATA/NamesList.txt
42-- curl -o normalizationcorrections.txt  http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
43-- curl -o normalizationtest.txt         http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
44-- curl -o proplist.txt                  http://www.unicode.org/Public/UNIDATA/PropList.txt
45-- curl -o propertyaliases.txt           http://www.unicode.org/Public/UNIDATA/PropertyAliases.txt
46-- curl -o propertyvaluealiases.txt      http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
47-- curl -o readme.txt                    http://www.unicode.org/Public/UNIDATA/ReadMe.txt
48-- curl -o scriptextensions.txt          http://www.unicode.org/Public/UNIDATA/ScriptExtensions.txt
49-- curl -o scripts.txt                   http://www.unicode.org/Public/UNIDATA/Scripts.txt
50-- curl -o specialcasing.txt             http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt
51-- curl -o standardizedvariants.html     http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html
52-- curl -o standardizedvariants.txt      http://www.unicode.org/Public/UNIDATA/StandardizedVariants.txt
53-- curl -o tangutsources.txt             http://www.unicode.org/Public/UNIDATA/TangutSources.txt
54-- curl -o ucd.zip                       http://www.unicode.org/Public/UNIDATA/UCD.zip
55-- curl -o usourcedata.txt               http://www.unicode.org/Public/UNIDATA/USourceData.txt
56-- curl -o usourceglyphs.pdf             http://www.unicode.org/Public/UNIDATA/USourceGlyphs.pdf
57-- curl -o unicodedata.txt               http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
58-- curl -o unihan.zip                    http://www.unicode.org/Public/UNIDATA/Unihan.zip
59--
60-- curl -o emoji-data.txt                http://unicode.org/Public/emoji/12.0/emoji-data.txt
61-- curl -o emoji-sequences.txt           http://unicode.org/Public/emoji/12.0/emoji-sequences.txt
62-- curl -o emoji-variation-sequences.txt http://unicode.org/Public/emoji/12.0/emoji-variation-sequences.txt
63-- curl -o emoji-zwj-sequences.txt       http://unicode.org/Public/emoji/12.0/emoji-zwj-sequences.txt
64-- curl -o emoji-test.txt                http://unicode.org/Public/emoji/12.0/emoji-test.txt
65--
66-- todo:
67--
68--    specialcasing ?
69
-- The help information (in xml format) that is handed to the application
-- logger below. The detail entry names char-def.lua because that is the file
-- that scripts.unicode.load locates and that the banner mentions.

local helpinfo = [[
<?xml version="1.0"?>
<application>
 <metadata>
  <entry name="name">mtx-unicode</entry>
  <entry name="detail">Checker for char-def.lua</entry>
  <entry name="version">1.02</entry>
 </metadata>
 <flags>
  <category name="basic">
   <subcategory>
    <flag name="whatever"><short>do whatever</short></flag>
   </subcategory>
  </category>
 </flags>
</application>
]]

-- The application object provides the reporter that is used all over this file.

local application = logs.application {
    name     = "mtx-unicode",
    banner   = "Checker for char-def.lua 1.02",
    helpinfo = helpinfo,
}
93
94local gmatch, match, gsub, find, lower, upper, format = string.gmatch, string.match, string.gsub, string.find, string.lower, string.upper, string.format
95local concat, sort = table.concat, table.sort
96local split, splitlines, strip = string.split, string.splitlines, string.strip
97local are_equal = table.are_equal
98local tonumber, tostring, rawget = tonumber, tostring, rawget
99local lpegmatch = lpeg.match
100local P, C, S, R, Cs, Ct, Cg, Cf, Cc = lpeg.P, lpeg.C, lpeg.S, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cg, lpeg.Cf, lpeg.Cc
101local formatters = string.formatters
102local utfchar = utf.char
103
104local report = application.report
105
-- The global namespaces that this script works with. They are only created
-- when they are not there yet.

scripts         = scripts         or { }
scripts.unicode = scripts.unicode or { }

characters      = characters      or { }
characters.data = characters.data or { }

fonts           = fonts           or { }
fonts.encodings = fonts.encodings or { }

-- These are (re)assigned by scripts.unicode.load: the names of the unicode
-- text files, their content and the tables that result from splitting that
-- content. The last one is read by scripts.unicode.update. It is declared here
-- so that it stays private to this file instead of ending up as a global.

local textfiles  = { }
local textdata   = { }
local texttables = { }

-- When set, scripts.unicode.update doesn't store the direction "l".

local sparse = false

-- Splitters for the fields that we run into in the unicode text files.

local split_space_table = lpeg.tsplitat(" ")
local split_space_two   = lpeg.splitat (" ")
local split_range_two   = lpeg.splitat ("..")
local split_colon_table = lpeg.tsplitat(P(" ")^0 * P(";") * P(" ")^0)

-- Code points that scripts.unicode.update leaves alone.

local skipped = {
    [0x002C6] = true, -- MODIFIER LETTER CIRCUMFLEX ACCENT
    [0x002C7] = true, -- CARON
}

for i=0x0FE00,0x0FE0F do skipped[i] = true end -- variant selector
for i=0xE0100,0xE01EF do skipped[i] = true end -- variant selector extension
132
133-- This can be done:
134--
135--   for i=0x1B170,0x1B2FF do skipped[i] = true end -- nushu
136--
137-- but then also adapt char-cjk.lua bottom part!
138
-- Bring one simple property of an existing entry in line with the value that
-- we derived from the unicode files. A different value gets stored and a value
-- that is no longer there gets removed. Both cases are reported, where 'label'
-- is the text used for the property in the message.

local function syncproperty(char,key,label,value,unicode,description)
    local current = char[key]
    if value then
        if value ~= current then
            report("%U : setting " .. label .. " to %a, %a",unicode,value,description)
            char[key] = value
        end
    elseif current then
        report("%U : resetting " .. label .. " from %a, %a",unicode,current,description)
        char[key] = nil
    end
end

-- Store a case mapping (key is "lccode" or "uccode") in an existing entry when
-- it differs from what is there. The value is a number or a list of numbers;
-- lists are compared by content.

local function synccasecode(char,key,value,unicode,description)
    local current = char[key]
    if type(value) == "table" then
        if type(current) ~= "table" or not are_equal(value,current) then
            report("%U : setting " .. key .. " to % t, %a",unicode,value,description)
            char[key] = value
        end
    elseif current ~= value then
        report("%U : setting " .. key .. " to %a, %a",unicode,value,description)
        char[key] = value
    end
end

-- Convert the decomposition field of unicodedata.txt into a specials table: a
-- tag followed by code points. An empty field gives nil. When the field has no
-- <tag> in front, the tag becomes "char".

local function parsespecials(str)
    if str == "" then
        return nil
    end
    local specials = lpegmatch(split_space_table,str) -- split(specials," ")
    if tonumber(specials[1],16) then
        -- no tag, so we move the code points one slot up to make room
        for i=#specials,1,-1 do
            specials[i+1] = tonumber(specials[i],16)
        end
        specials[1] = "char"
    else
        specials[1] = lower((gsub(specials[1],"[<>]","")))
        for i=2,#specials do
            specials[i] = tonumber(specials[i],16)
        end
    end
    return specials
end

-- Derive case mappings from a casefolding.txt entry. We return lccode, uccode
-- where at most one of them is set: a number for the statuses C and S, a list
-- of numbers for status F. Other statuses are skipped, as are entries that
-- fold to themselves and entries that belong to a range.

local function casecodes(cf,unicode,category)
    if not cf or tonumber(cf[1],16) ~= unicode then
        return nil, nil
    end
    local how = cf[2]
    local fold
    if how == "C" or how == "S" then
        fold = tonumber(cf[3],16)
        if fold == unicode then
            return nil, nil
        end
    elseif how == "F" then
        -- we can use the first
        fold = { }
        for s in gmatch(cf[3],"%S+") do
            fold[#fold+1] = tonumber(s,16)
        end
    else
        -- we skip these
        return nil, nil
    end
    if category == "ll" then
        return nil, fold
    elseif category == "lu" then -- this was "ul" for the F case, which never matches
        return fold, nil
    end
    return nil, nil
end

-- Guess the direction in which an arrow can stretch from its description. The
-- result is one of "m" (mixed), "v", "u", "d", "h", "r" and "l", or nil when
-- the description is not about a plain arrow.

local arrowskipped = { "ABOVE", "ARROWHEAD", "HALFWIDTH", "ANGLE", "THROUGH" }

local function arrowextensible(description)
    for i=1,#arrowskipped do
        if find(description,arrowskipped[i]) then
            return nil
        end
    end
    if not find(description,"ARROW") then
        return nil
    end
    local u = find(description,"UP")
    local d = find(description,"DOWN")
    local l = find(description,"LEFT")
    local r = find(description,"RIGHT")
    if u or d then
        if l or r then
            return "m" -- mixed
        elseif u and d then
            return "v" -- vertical
        elseif u then
            return "u" -- up
        else
            return "d" -- down
        end
    elseif l and r then
        return "h" -- horizontal
    elseif r then
        return "r" -- right
    elseif l then
        return "l" -- left
    end
    return nil
end

-- Update characters.data (in place) from the tables that were made by
-- scripts.unicode.load. Entries that are missing get added and for existing
-- entries the properties that we can derive are checked and adapted. Every
-- change is reported. After that we add some missing specials, register the
-- standardized variants and remove entries that only carry variants.

function scripts.unicode.update()
    local unicodedata          = texttables.unicodedata
    local bidimirroring        = texttables.bidimirroring
    local linebreaks           = texttables.linebreak
    local eastasianwidth       = texttables.eastasianwidth
    local standardizedvariants = texttables.standardizedvariants
    local arabicshaping        = texttables.arabicshaping
    local casefolding          = texttables.casefolding
    local characterdata        = characters.data
    --
    local descriptions         = { }
    --
    for unicode, ud in table.sortedpairs(unicodedata) do
        if not skipped[unicode] then
            local char        = rawget(characterdata,unicode)
            local description = ud[2] or formatters["UNICODE ENTRY %U"](unicode)
            if not find(description,"^<") then
                local ld        = linebreaks[unicode]
                local bd        = bidimirroring[unicode]
                local ed        = eastasianwidth[unicode]
                local as        = arabicshaping[unicode] -- there are more than arabic
                local category  = lower(ud[3] or "?")
                local combining = tonumber(ud[4])
                local direction = lower(ud[5] or "l") -- we could omit 'l' being the default
                local linebreak = ld and lower(ld[2] or "xx")
                local specials  = parsespecials(ud[6] or "")
                local cjkwd     = ed and lower(ed[2] or "n")
                local mirror    = bd and tonumber(bd[2],16)
                local arabic    = as and lower(as[3])
                -- these are nil (and not false) when there is no mapping so that
                -- a new entry doesn't end up with lccode/uccode set to false
                local lccode, uccode = casecodes(casefolding[unicode],unicode,category)
                --
                descriptions[description] = unicode
                -- values that are the default are not stored
                if sparse and direction == "l" then
                    direction = nil
                end
                if linebreak == "xx" then
                    linebreak = nil
                end
                if cjkwd == "n" then
                    cjkwd = nil
                end
                if combining == 0 then
                    combining = nil
                end
                --
                if not char then
                    report("%U : adding entry %a",unicode,description)
                    local comment
                    if find(description,"MATHEMATICAL") then
                        comment = "check math properties"
                    end
                    char = {
                     -- adobename   = ,
                        category    = category,
                        comment     = comment,
                        cjkwd       = cjkwd,
                        description = description,
                        direction   = direction,
                        mirror      = mirror,
                        linebreak   = linebreak,
                        unicodeslot = unicode,
                        specials    = specials,
                        arabic      = arabic,
                        combining   = combining,
                        uccode      = uccode,
                        lccode      = lccode,
                    }
                    characterdata[unicode] = char
                else
                    -- we have more case mapping (e.g. cherokee)
                    if lccode then
                        synccasecode(char,"lccode",lccode,unicode,description)
                    end
                    if uccode then
                        synccasecode(char,"uccode",uccode,unicode,description)
                    end
                    syncproperty(char,"direction","direction",direction,unicode,description)
                    syncproperty(char,"mirror",   "mirror",   mirror,   unicode,description)
                    syncproperty(char,"linebreak","linebreak",linebreak,unicode,description)
                    syncproperty(char,"cjkwd",    "cjkwd of", cjkwd,    unicode,description)
                    syncproperty(char,"arabic",   "arabic",   arabic,   unicode,description)
                    -- the reset of combining was reported but not applied
                    syncproperty(char,"combining","combining",combining,unicode,description)
                    if specials then
                        if not char.specials or not are_equal(specials,char.specials) then
                            local t = { specials[1] }
                            for i=2,#specials do
                                t[i] = formatters["%U"](specials[i])
                            end
                            report("%U : setting specials to % + t, %a",unicode,t,description)
                            char.specials = specials
                        end
                    elseif char.specials then
                        -- we don't reset specials that we can't derive but flag
                        -- them in the comment so that they can be checked
                        local comment = char.comment
                        if not comment then
                            char.comment = "check special"
                        elseif not find(comment,"check special") then
                            char.comment = comment .. ", check special"
                        end
                    end
                end
                -- visual
                if not char.visual and find(description,"MATH") then
                    local visual
                    if find(description,"BOLD ITALIC") then
                        visual = "bi"
                    elseif find(description,"ITALIC") then
                        visual = "it"
                    elseif find(description,"BOLD") then
                        visual = "bf"
                    end
                    if visual then
                        report("%U : setting visual to %a, %a",unicode,visual,description)
                        char.visual = visual
                    end
                end
                -- mathextensible, only when not already set
                if (category == "sm" or (category == "so" and char.mathclass)) and not char.mathextensible then
                    local mathextensible = arrowextensible(description)
                    if mathextensible then
                        report("%U : setting mathextensible to %a, %a",unicode,mathextensible,description)
                        char.mathextensible = mathextensible
                    end
                end
            end
        end
    end
    -- we need the hash .. add missing specials
    for unicode, data in table.sortedhash(characterdata) do
        if not data.specials or data.comment and find(data.comment,"check special") then
            local description = data.description
            local b, m
            if description then
                b, m = match(description,"^(.+) WITH (.+)$")
            end
            if b and m and (find(b,"^LATIN") or find (b,"^CYRILLIC")) then
                local base = descriptions[b]
                local mark = descriptions[m]
                if not mark and m == "STROKE" then
                    mark = descriptions["SOLIDUS"] -- SLASH
                end
                if base and mark then
                 -- report("adding extra char special for %a",description)
                    data.specials = { "with", base, mark }
                    data.comment  = nil
                end
            end
        end
    end
    --
    for i=1,#standardizedvariants do
        local si = standardizedvariants[i]
        local pair, addendum = si[1], strip(si[2])
        local first, second = lpegmatch(split_space_two,pair) -- string.splitup(pair," ")
        first  = first  and tonumber(first,16)
        second = second and tonumber(second,16)
        -- without a valid selector we would index the variants table with nil
        if first and second then
            local d = characterdata[first]
            if d then
                -- a raw lookup, so only variants stored in the entry itself count
                local v = rawget(d,"variants")
                if not v then
                    v = { }
                    d.variants = v
                end
                if not v[second] then
                    report("%U : adding variant %U as %s, %a",first,second,addendum,d.description)
                    v[second] = addendum
                end
            end
        end
    end
    for unicode, ud in table.sortedpairs(characterdata) do
        if not rawget(ud,"category") and rawget(ud,"variants") then
         -- report("stripping %U (variant, takes from metacharacter)",unicode)
            characterdata[unicode] = nil
        end
    end
end
496
-- This one is set by scripts.unicode.load and used by scripts.unicode.save.

local preamble

-- Split the content of a unicode text file into records. Comments (a # and
-- what follows) and the lines that are empty after removing them are dropped;
-- the remaining lines are split at semicolons.
--
-- With 'index' set, the result is keyed by the code point in the first field,
-- where a first field of the form XXXX..YYYY makes every code point in that
-- range refer to the same record. Lines that we cannot make sense of are
-- reported. Without 'index' we get a list of records in file order.

local function splitdefinition(str,index)
    local lines  = splitlines(str)
    local result = { }
    local count  = 0
    for i=1,#lines do
        local line = gsub(lines[i]," *#.*$","")
        if line ~= "" then
            local fields = lpegmatch(split_colon_table,line) -- split(s,";")
            if not index then
                count = count + 1
                result[count] = fields
            else
                local first = fields[1]
                local code  = tonumber(first,16)
                if code then
                    result[code] = fields
                else
                 -- local b, e = match(o,"^([^%.]+)%.%.([^%.]+)$")
                    local b, e = lpegmatch(split_range_two,first)
                    -- both ends must be valid hex, otherwise the loop would fail
                    b = b and tonumber(b,16)
                    e = e and tonumber(e,16)
                    if b and e then
                        for k=b,e do
                            result[k] = fields
                        end
                    else
                        report("problem: %s",line)
                    end
                end
            end
        end
    end
    return result
end
538
-- Turn the content of index.txt into a table that maps names onto code points.
-- A line is expected to look like "a,b<tab>hex" (the comma part is optional)
-- and gives the name "b a" with its spaces collapsed. Lines that don't match
-- are ignored.

local function splitindex(str)
    -- ok, quick and dirty ... could be a nice lpeg instead
    local lookup = { }
    for _, line in ipairs(splitlines(str)) do
        local last, first, code = match(line,"([^%,]+)%,?(.-)\t(.*)")
        if last and first and code then
            local name = gsub(strip(first .. " " .. last),"%s+"," ")
            lookup[name] = tonumber(code,16)
        end
    end
    return lookup
end
554
-- Load char-def.lua and the character modules, then locate, load and split the
-- unicode text files. The part of char-def.lua before the characters.data
-- assignment is kept as preamble. We return true when this worked out. When
-- char-def.lua cannot be loaded or is not valid Lua, the preamble is reset and
-- we return false.

function scripts.unicode.load()
    local fullname = resolvers.findfile("char-def.lua")
    report("using: %s",fullname)
    local data = io.loaddata(fullname)
    if not data then
        preamble = nil
        return false
    end
    -- we check the result because otherwise a broken file makes us call nil
    local loader, message = loadstring(data)
    if not loader then
        report("invalid: %s (%s)",fullname,tostring(message))
        preamble = nil
        return false
    end
    loader()
    --
    local modulefiles = { "char-ini.lua", "char-utf.lua", "char-cjk.lua" }
    for i=1,#modulefiles do
        local modulename = resolvers.findfile(modulefiles[i])
        report("using: %s",modulename)
        dofile(modulename)
    end
    --
    preamble = gsub(data,"characters%.data%s*=%s*%{.*","")
    --
    -- the names double as basename of the text file and as key in the tables
    local names = {
        "unicodedata",
        "bidimirroring",
        "linebreak",
        "eastasianwidth",
        "standardizedvariants",
        "arabicshaping",
        "casefolding",
        "index",
    }
    local files  = { }
    local data   = { }
    local tables = { }
    for i=1,#names do
        local name     = names[i]
        local filename = resolvers.findfile(name .. ".txt") or ""
        local content  = filename ~= "" and io.loaddata(filename) or ""
        files[name] = filename
        data [name] = content
        if name == "index" then
            tables[name] = splitindex(content)
        else
            -- the variants are kept as list, the rest is indexed by code point
            tables[name] = splitdefinition(content,name ~= "standardizedvariants")
        end
    end
    textfiles  = files
    textdata   = data
    texttables = tables
    --
    for k, v in table.sortedhash(textfiles) do
        report("using: %s",v)
    end
    return true
end
617
618-- local variants_emoji={
619--   [0xFE0E]="text style",
620--   [0xFE0F]="emoji style",
621-- }
622--
623-- local variants_forms={
624--    [0xFE00]="corner-justified form",
625--    [0xFE01]="centered form",
626-- }
627
628-- local variants_style={
629--    [0xFE00]="chancery style",
630--    [0xFE01]="roundhand style",
631-- }
632
-- Serialize characters.data and save it, preceded by the preamble that was
-- taken from char-def.lua, in the given file. Nothing is done when there is no
-- preamble. Some variant tables that are serialized in full are replaced by a
-- name (presumably these names are defined in the preamble, to be verified).

function scripts.unicode.save(filename)
    if not preamble then
        return
    end
    local shared = {
        { "%{%s+%[0xFE0E%]=\"text style\",%s+%[0xFE0F%]=\"emoji style\",%s+%}",               "variants_emoji"  },
        { "%{%s+%[0xFE00%]=\"corner%-justified form\",%s+%[0xFE01%]=\"centered form\",%s+%}", "variants_forms"  },
        { "%{%s+%[0xFE00%]=\"chancery style\",%s+%[0xFE01%]=\"roundhand style\",%s+%}",       "variants_style"  },
        { "%{%s+%[0xFE00%]=\"dotted form\",%s+%}",                                            "variants_dotted" },
    }
    local serialized = table.serialize(characters.data,"characters.data", { hexify = true, noquotes = true })
    for i=1,#shared do
        local entry = shared[i]
        serialized = gsub(serialized,entry[1],entry[2])
    end
    io.savedata(filename,preamble .. serialized)
end
643
-- Does two independent housekeeping jobs on the loaded character tables:
--
-- (1) When blocks.txt can be found, each block in it is compared with
--     characters.blocks. Unknown blocks are added and reported with a "+"
--     prefix, blocks whose range or description differ are reported with a
--     "?" prefix, and blocks that we have but that the file does not list are
--     reported as obsolete.
--
-- (2) The lowercase names from texttables.index are merged into the synonyms
--     list of the character they point to, unless the name equals the
--     description of that character.
--
-- Takes no arguments and returns nothing; results end up in characters.blocks,
-- characters.data and the log.
function scripts.unicode.extras() -- old code
    --
    -- Lines in blocks.txt look like:
    --
    -- 0000..007F; Basic Latin
    -- 0080..00FF; Latin-1 Supplement
    -- 0100..017F; Latin Extended-A
    --
    local fullname = resolvers.findfile("blocks.txt") or ""
    if fullname ~= "" then
        local data   = io.loaddata(fullname) or "" -- a failed load is treated as an empty file
        local lines  = splitlines(data)
        local seen   = { } -- set of block names that occur in blocks.txt
        local blocks = characters.blocks
        local result = { }
        for i=1,#lines do
            local line = gsub(lines[i]," *#.*$","") -- drop trailing comments
            if line ~= "" then
                local specification = lpegmatch(split_colon_table,line) -- split(s,";")
                local range         = specification and specification[1]
                local description   = specification and specification[2]
                if range and description then
                    local start, stop = lpegmatch(split_range_two,range)
                    if start and stop then
                        local first = tonumber(start,16)
                        local last  = tonumber(stop,16)
                        -- the key is the description reduced to its lowercase letters
                        local name  = gsub(lower(description),"[^a-z]+","")
                        if first and last then
                            local b = blocks[name]
                            if not b then
                                result[#result+1] = formatters[ [[+ block: ["%s"] = { first = 0x%05X, last = 0x%05X, description = "%S" }]] ](name,first,last,description)
                                blocks[name] = { first = first, last = last, description = description }
                            elseif b.first ~= first or b.last ~= last or b.description ~= description then
                                result[#result+1] = formatters[ [[? block: ["%s"] = { first = 0x%05X, last = 0x%05X, description = "%S" }]] ](name,first,last,description)
                            end
                        end
                        seen[name] = true
                    end
                end
            end
        end
        sort(result)
        for i=1,#result do
            report(result[i])
        end
        -- Every name taken from the file is (by now) also present in blocks, so
        -- checking the file names against blocks can never find anything. What
        -- we want to know is the opposite: which of our blocks are gone.
        for name in table.sortedhash(blocks) do
            if not seen[name] then
                report("obsolete block %a",name)
            end
        end
    end
    --
    local index  = texttables.index
    local blocks = characters.blocks
    local data   = characters.data
    -- only the lowercase names are kept (removing keys while traversing is safe)
    for k, v in next, index do
        if k ~= lower(k) then
            index[k] = nil
        end
    end
 -- for k, v in next, data do
 --     v.synonym  = nil
 --     v.synonyms = nil
 -- end
    for k, v in table.sortedhash(index) do
        local d = data[v]
        -- a name that equals the description adds nothing
        if d and d.description ~= upper(k) then
            local synonyms = d.synonyms
            if synonyms then
                local n = #synonyms
                local f = false
                for i=1,n do
                    if synonyms[i] == k then
                        f = true
                        break
                    end
                end
                if not f then
                    synonyms[n+1] = k
                end
             -- synonyms = table.unique(synonyms)
             -- d.synonyms = synonyms
                sort(synonyms)
            else
                d.synonyms = { k }
            end
        end
    end
end
733
734do
735
736    local space       = P(" ")
737    local spaces      = space^0
738    local semicolon   = P(";")
739    local hash        = P("#")
740    local newline     = S("\n\r")
741
742    local unicode     = Cs(R("09","AF")^1)/function(n) return tonumber(n,16) end
743                      * spaces
744    local components  = Ct (unicode^1)
745
746 -- local rubish_a    = semicolon
747 --                   * spaces
748 --                   * P("Emoji_ZWJ_Sequence")
749 --                   * spaces
750 --                   * semicolon
751 --                   * spaces
752 -- local description = C((1 - (spaces * (hash+newline)))^1)
753 -- local rubish_b    = (1-newline)^0
754 --                   * newline^1
755 --
756 -- local pattern_1   = Ct ( (
757 --     Cf ( Ct("") *
758 --         Cg (Cc("components") * components)
759 --       * rubish_a
760 --       * Cg (Cc("description") * description )
761 --       * rubish_b
762 --     , rawset)
763 --     + P(1) )^1 )
764
765    local rubish_a    = semicolon
766                      * spaces
767                      * P("non-")^0 * P("fully-qualified")
768                      * spaces
769                      * hash
770                      * spaces
771    local textstring  = C((1 - space)^1)
772                      * spaces
773    local description = ((1 - (spaces * newline))^1) / string.lower
774    local rubish_b    = (1-newline)^0
775                      * newline^1
776
777    local pattern_2   = Ct ( (
778        Cf ( Ct("") *
779            Cg (Cc("components") * components)
780          * rubish_a
781          * Cg (Cc("textstring") * textstring)
782          * Cg (Cc("description") * description )
783          * rubish_b
784        , rawset)
785        + P(1) )^1 )
786
787    function scripts.unicode.emoji(filename)
788
789        local name = resolvers.findfile("emoji-test.txt") or ""
790        if name == "" then
791            return
792        end
793        local l = io.loaddata(name)
794        local t = lpegmatch(pattern_2,l)
795
796        local hash = { }
797
798        local crap = lpeg.P("e") * lpeg.R("09","..","09")^1 * lpeg.P(" ")^1
799
800        local replace = lpeg.replacer {
801            [crap] = "",
802            ["#"]  = "hash",
803            ["*"]  = "asterisk",
804        }
805
806        for i=1,#t do
807            local v = t[i]
808            local d = v.description
809            local k = lpegmatch(replace,d) or d
810            hash[k] = v.components
811        end
812        local new = table.serialize(hash,"return", { hexify = true })
813        local old = io.loaddata(resolvers.findfile("char-emj.lua"))
814        if old and old ~= "" then
815            new = gsub(old,"^(.-)return .*$","%1" .. new)
816        end
817        io.savedata(filename,new)
818    end
819
820end
821
822-- the action
823
-- Either export the help information or run the whole update: load the
-- data, update and extend the tables and save the two new files.

local filename   = environment.files[1]
local exporthelp = environment.arguments.exporthelp

if not exporthelp then
    local deffile = "char-def-new.lua"
    local emjfile = "char-emj-new.lua"
    report("start working on %a, input char-def.lua",lfs.currentdir())
    if not scripts.unicode.load() then
        report("nothing to do")
    else
        scripts.unicode.update()
        scripts.unicode.extras()
        scripts.unicode.save(deffile)
        scripts.unicode.emoji(emjfile)
        report("saved file %a",deffile)
        report("saved file %a (current 14.0, check for updates, see above!)",emjfile)
    end
    report("stop working on %a\n",lfs.currentdir())
else
    application.export(exporthelp,filename)
end
842