mtx-unicode.lua /size: 37 Kb    last modification: 2023-12-21 09:43
-- Make sure the global module registry exists and register this script's
-- metadata in it under its own name.
if not modules then
    modules = { }
end

modules["mtx-unicode"] = {
    version   = 1.002,
    comment   = "companion to mtxrun.lua",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}
8
9-- This is very old code that I started writing in 2005 but occasionally
10-- extended. Don't use it yourself, it's just a sort of reference. The
11-- data that we use in ConTeXt is more extensive.
12--
13-- In my local tree I keep files in places like this:
14--
15--    e:/tex-context/tex/texmf-local/data/unicode/blocks.txt
16--
17-- curl -o arabicshaping.txt             http://www.unicode.org/Public/UNIDATA/ArabicShaping.txt
18-- curl -o bidibrackets.txt              http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
19-- curl -o bidicharactertest.txt         http://www.unicode.org/Public/UNIDATA/BidiCharacterTest.txt
20-- curl -o bidimirroring.txt             http://www.unicode.org/Public/UNIDATA/BidiMirroring.txt
21-- curl -o biditest.txt                  http://www.unicode.org/Public/UNIDATA/BidiTest.txt
22-- curl -o blocks.txt                    http://www.unicode.org/Public/UNIDATA/Blocks.txt
23-- curl -o cjkradicals.txt               http://www.unicode.org/Public/UNIDATA/CJKRadicals.txt
24-- curl -o casefolding.txt               http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
25-- curl -o compositionexclusions.txt     http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
26-- curl -o derivedage.txt                http://www.unicode.org/Public/UNIDATA/DerivedAge.txt
27-- curl -o derivedcoreproperties.txt     http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
28-- curl -o derivednormalizationprops.txt http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
29-- curl -o eastasianwidth.txt            http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
30-- curl -o emojisources.txt              http://www.unicode.org/Public/UNIDATA/EmojiSources.txt
31-- curl -o hangulsyllabletype.txt        http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt
32-- curl -o index.txt                     http://www.unicode.org/Public/UNIDATA/Index.txt
33-- curl -o indicpositionalcategory.txt   http://www.unicode.org/Public/UNIDATA/IndicPositionalCategory.txt
34-- curl -o indicsyllabiccategory.txt     http://www.unicode.org/Public/UNIDATA/IndicSyllabicCategory.txt
35-- curl -o jamo.txt                      http://www.unicode.org/Public/UNIDATA/Jamo.txt
36-- curl -o linebreak.txt                 http://www.unicode.org/Public/UNIDATA/LineBreak.txt
37-- curl -o namealiases.txt               http://www.unicode.org/Public/UNIDATA/NameAliases.txt
38-- curl -o namedsequences.txt            http://www.unicode.org/Public/UNIDATA/NamedSequences.txt
39-- curl -o namedsequencesprov.txt        http://www.unicode.org/Public/UNIDATA/NamedSequencesProv.txt
40-- curl -o nameslist.html                http://www.unicode.org/Public/UNIDATA/NamesList.html
41-- curl -o nameslist.txt                 http://www.unicode.org/Public/UNIDATA/NamesList.txt
42-- curl -o normalizationcorrections.txt  http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
43-- curl -o normalizationtest.txt         http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
44-- curl -o proplist.txt                  http://www.unicode.org/Public/UNIDATA/PropList.txt
45-- curl -o propertyaliases.txt           http://www.unicode.org/Public/UNIDATA/PropertyAliases.txt
46-- curl -o propertyvaluealiases.txt      http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
47-- curl -o readme.txt                    http://www.unicode.org/Public/UNIDATA/ReadMe.txt
48-- curl -o scriptextensions.txt          http://www.unicode.org/Public/UNIDATA/ScriptExtensions.txt
49-- curl -o scripts.txt                   http://www.unicode.org/Public/UNIDATA/Scripts.txt
50-- curl -o specialcasing.txt             http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt
51-- curl -o standardizedvariants.html     http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html
52-- curl -o standardizedvariants.txt      http://www.unicode.org/Public/UNIDATA/StandardizedVariants.txt
53-- curl -o tangutsources.txt             http://www.unicode.org/Public/UNIDATA/TangutSources.txt
54-- curl -o ucd.zip                       http://www.unicode.org/Public/UNIDATA/UCD.zip
55-- curl -o usourcedata.txt               http://www.unicode.org/Public/UNIDATA/USourceData.txt
56-- curl -o usourceglyphs.pdf             http://www.unicode.org/Public/UNIDATA/USourceGlyphs.pdf
57-- curl -o unicodedata.txt               http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
58-- curl -o unihan.zip                    http://www.unicode.org/Public/UNIDATA/Unihan.zip
59--
60-- curl -o emoji-data.txt                http://unicode.org/Public/emoji/12.0/emoji-data.txt
61-- curl -o emoji-sequences.txt           http://unicode.org/Public/emoji/12.0/emoji-sequences.txt
62-- curl -o emoji-variation-sequences.txt http://unicode.org/Public/emoji/12.0/emoji-variation-sequences.txt
63-- curl -o emoji-zwj-sequences.txt       http://unicode.org/Public/emoji/12.0/emoji-zwj-sequences.txt
64-- curl -o emoji-test.txt                http://unicode.org/Public/emoji/12.0/emoji-test.txt
65--
66-- todo:
67--
68--    specialcasing ?
69
-- XML help description handed to logs.application below. The 'detail' text
-- names char-def.lua, the file that this script loads and checks (it matches
-- the application banner).
local helpinfo = [[
<?xml version="1.0"?>
<application>
 <metadata>
  <entry name="name">mtx-unicode</entry>
  <entry name="detail">Checker for char-def.lua</entry>
  <entry name="version">1.02</entry>
 </metadata>
 <flags>
  <category name="basic">
   <subcategory>
    <flag name="whatever"><short>do whatever</short></flag>
   </subcategory>
  </category>
 </flags>
</application>
]]
87
88local application = logs.application {
89    name     = "mtx-unicode",
90    banner   = "Checker for char-def.lua 1.02",
91    helpinfo = helpinfo,
92}
93
94local gmatch, match, gsub, find, lower, upper, format = string.gmatch, string.match, string.gsub, string.find, string.lower, string.upper, string.format
95local concat, sort = table.concat, table.sort
96local split, splitlines, strip = string.split, string.splitlines, string.strip
97local are_equal = table.are_equal
98local tonumber, tostring, rawget = tonumber, tostring, rawget
99local lpegmatch = lpeg.match
100local P, C, S, R, Cs, Ct, Cg, Cf, Cc = lpeg.P, lpeg.C, lpeg.S, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cg, lpeg.Cf, lpeg.Cc
101local formatters = string.formatters
102local utfchar = utf.char
103
104local report = application.report
105
106scripts         = scripts         or { }
107scripts.unicode = scripts.unicode or { }
108
109characters      = characters      or { }
110characters.data = characters.data or { }
111
112fonts           = fonts           or { }
113fonts.encodings = fonts.encodings or { }
114
-- State filled by scripts.unicode.load: resolved file names, raw file
-- contents and the parsed tables, each keyed by data set name.
local textfiles  = { }
local textdata   = { }
local texttables = { } -- declared here so that load() and update() share a local instead of a global

-- When true, update() drops the direction "l" instead of storing it.
local sparse = false
119
120local split_space_table = lpeg.tsplitat(" ")
121local split_space_two   = lpeg.splitat (" ")
122local split_range_two   = lpeg.splitat ("..")
123local split_colon_table = lpeg.tsplitat(P(" ")^0 * P(";") * P(" ")^0)
124
-- Code points that scripts.unicode.update leaves alone.
local skipped = { }

do
    -- Marks every code point in the closed range first..last as skipped.
    local function skiprange(first,last)
        for slot=first,last do
            skipped[slot] = true
        end
    end

    skipped[0x002C6] = true -- MODIFIER LETTER CIRCUMFLEX ACCENT
    skipped[0x002C7] = true -- CARON

    skiprange(0x0FE00,0x0FE0F) -- variant selector
    skiprange(0xE0100,0xE01EF) -- variant selector extension
end
132
133-- This can be done:
134--
135--   for i=0x1B170,0x1B2FF do skipped[i] = true end -- nushu
136--
137-- but then also adapt char-cjk.lua bottom part!
138
-- Merge the parsed Unicode data files into characters.data.
--
-- Expects scripts.unicode.load to have filled texttables. The work is done in
-- four passes:
--
--   1. for every code point in unicodedata that is not in 'skipped' and whose
--      name does not start with "<": create a new entry, or bring an existing
--      entry in sync (case codes, direction, mirror, linebreak, cjkwd, arabic,
--      combining, specials) and derive 'visual' and 'mathextensible' from the
--      description; every change is reported
--   2. add "with" specials for LATIN/CYRILLIC "X WITH Y" descriptions that
--      have no specials or are flagged with "check special"
--   3. register the standardized variants
--   4. remove entries that only have variants and no category
--
-- Nothing is returned; characters.data is modified in place.
function scripts.unicode.update()
    local unicodedata          = texttables.unicodedata
    local bidimirroring        = texttables.bidimirroring
    local linebreak            = texttables.linebreak
    local eastasianwidth       = texttables.eastasianwidth
    local standardizedvariants = texttables.standardizedvariants
    local arabicshaping        = texttables.arabicshaping
    local casefolding          = texttables.casefolding
    local characterdata        = characters.data
    -- description -> code point, used for the "with" specials in the second pass
    local descriptions         = { }
    --
    for unicode, ud in table.sortedpairs(unicodedata) do
        if not skipped[unicode] then
            -- rawget bypasses any metatable that is set on characters.data
            local char = rawget(characterdata,unicode)
            local description = ud[2] or formatters["UNICODE ENTRY %U"](unicode)
            if not find(description,"^<") then
                local ld        = linebreak[unicode]
                local bd        = bidimirroring[unicode]
                local ed        = eastasianwidth[unicode]
                local category  = lower(ud[3] or "?")
                local combining = tonumber(ud[4])
                local direction = lower(ud[5] or "l") -- we could omit 'l' being the default
                -- from here on 'linebreak' is the value for this code point (it shadows the table)
                local linebreak = ld and lower(ld[2] or "xx")
                local specials  = ud[6] or ""
                local cjkwd     = ed and lower(ed[2] or "n")
                local mirror    = bd and tonumber(bd[2],16)
                local arabic    = nil
                local lccode    = false
                local uccode    = false
                descriptions[description] = unicode
                if sparse and direction == "l" then
                    direction = nil
                end
                if linebreak == "xx" then
                    linebreak = nil
                end
                if specials == "" then
                    specials = nil
                else
                    specials = lpegmatch(split_space_table,specials) -- split(specials," ")
                    if tonumber(specials[1],16) then
                        -- untagged decomposition: shift the code points one up and tag with "char"
                        for i=#specials,1,-1 do
                            specials[i+1] = tonumber(specials[i],16)
                        end
                        specials[1] = "char"
                    else
                        -- tagged decomposition: strip the angle brackets from the tag
                        specials[1] = lower(gsub(specials[1],"[<>]",""))
                        for i=2,#specials do
                            specials[i] = tonumber(specials[i],16)
                        end
                    end
                end
                if cjkwd == "n" then
                    cjkwd = nil
                end
                local comment
                if find(description,"MATHEMATICAL") then
                    comment = "check math properties"
                end
                -- there are more than arabic
                local as = arabicshaping[unicode]
                if as then
                    arabic = lower(as[3])
                end
                --
                if not combining or combining == 0 then
                    combining = nil
                end
                --
                local cf = casefolding[unicode]
                if cf and tonumber(cf[1],16) == unicode then
                    local how = cf[2]
                    if how == "C" or how == "S" then
                        local fold = tonumber(cf[3],16)
                        if fold == unicode then
                         -- print("SKIPPING",description)
                        elseif category == "ll" then
                            uccode = fold
                        elseif category == "lu" then
                            lccode = fold
                        end
                    elseif how == "F" then
                        -- we can use the first
                        local folding = { }
                        for s in gmatch(cf[3],"%S+") do
                            folding[#folding+1] = tonumber(s,16)
                        end
                        if category == "ll" then
                            uccode = folding
                        elseif category == "lu" then
                            -- categories are lowercased above, so uppercase letters are
                            -- "lu" (the same test as in the C/S branch)
                            lccode = folding
                        end
                    else
                        -- we skip these
                     -- print(description)
                     -- inspect(cf)
                    end
                end
                --
-- if specials and specials[1] == "font" then
--     specials = nil
-- end
                if not char then
                    report("%U : adding entry %a",unicode,description)
                    char = {
                     -- adobename   = ,
                        category    = category,
                        comment     = comment,
                        cjkwd       = cjkwd,
                        description = description,
                        direction   = direction,
                        mirror      = mirror,
                        linebreak   = linebreak,
                        unicodeslot = unicode,
                        specials    = specials,
                        arabic      = arabic,
                        combining   = combining,
                        -- false means "not set" and must not end up in the entry
                        uccode      = uccode or nil,
                        lccode      = lccode or nil,
                    }
                    characterdata[unicode] = char
                else
                    -- we have more case mapping (e.g. cherokee)
                    if lccode then
                        if type(lccode) == "table" then
                            if type(char.lccode) ~= "table" or not are_equal(lccode,char.lccode) then
                                report("%U : setting lccode to % t, %a",unicode,lccode,description)
                                char.lccode = lccode
                            end
                        elseif char.lccode ~= lccode then
                            report("%U : setting lccode to %a, %a",unicode,lccode,description)
                            char.lccode = lccode
                        end
                    end
                    if uccode then
                        if type(uccode) == "table" then
                            if type(char.uccode) ~= "table" or not are_equal(uccode,char.uccode) then
                                report("%U : setting uccode to % t, %a",unicode,uccode,description)
                                char.uccode = uccode
                            end
                        elseif char.uccode ~= uccode then
                            report("%U : setting uccode to %a, %a",unicode,uccode,description)
                            char.uccode = uccode
                        end
                    end
                    if direction then
                        if char.direction ~= direction then
                            report("%U : setting direction to %a, %a",unicode,direction,description)
                            char.direction = direction
                        end
                    else
                        if char.direction then
                            report("%U : resetting direction from %a, %a",unicode,char.direction,description)
                            char.direction = nil
                        end
                    end
                    if mirror then
                        if mirror ~= char.mirror then
                            report("%U : setting mirror to %a, %a",unicode,mirror,description)
                            char.mirror = mirror
                        end
                    else
                        if char.mirror then
                            report("%U : resetting mirror from %a, %a",unicode,char.mirror,description)
                            char.mirror = nil
                        end
                    end
                    if linebreak then
                        if linebreak ~= char.linebreak then
                            report("%U : setting linebreak to %a, %a",unicode,linebreak,description)
                            char.linebreak = linebreak
                        end
                    else
                        if char.linebreak then
                            report("%U : resetting linebreak from %a, %a",unicode,char.linebreak,description)
                            char.linebreak = nil
                        end
                    end
                    if cjkwd then
                        if cjkwd ~= char.cjkwd then
                            report("%U : setting cjkwd of to %a, %a",unicode,cjkwd,description)
                            char.cjkwd = cjkwd
                        end
                    else
                        if char.cjkwd then
                            report("%U : resetting cjkwd of from %a, %a",unicode,char.cjkwd,description)
                            char.cjkwd = nil
                        end
                    end
                    if arabic then
                        if arabic ~= char.arabic then
                            report("%U : setting arabic to %a, %a",unicode,arabic,description)
                            char.arabic = arabic
                        end
                    else
                        if char.arabic then
                            report("%U : resetting arabic from %a, %a",unicode,char.arabic,description)
                            char.arabic = nil
                        end
                    end
                    if combining then
                        if combining ~= char.combining then
                            report("%U : setting combining to %a, %a",unicode,combining,description)
                            char.combining = combining
                        end
                    else
                        if char.combining then
                            report("%U : resetting combining from %a, %a",unicode,char.combining,description)
                            -- actually reset it, like the other properties
                            char.combining = nil
                        end
                    end
                    if specials then
                        if not char.specials or not are_equal(specials,char.specials) then
                            local t = { specials[1] } for i=2,#specials do t[i] = formatters["%U"](specials[i]) end
                            report("%U : setting specials to % + t, %a",unicode,t,description)
                            char.specials = specials
                        end
                    elseif char.specials then
                        -- The data file has no decomposition but the entry has specials:
                        -- these are kept and the entry is flagged for a manual check.
-- if specials and specials[1] ~= "font" then
                        local comment = char.comment
                        if not comment then
                            char.comment = "check special"
                        elseif not find(comment,"check special") then
                            char.comment = comment .. ", check special"
                        end
                    end
                end
                --
                local visual = char.visual
                if not visual and find(description,"MATH") then
                    if find(description,"BOLD ITALIC") then
                        visual = "bi"
                    elseif find(description,"ITALIC") then
                        visual = "it"
                    elseif find(description,"BOLD") then
                        visual = "bf"
                    end
                    if visual then
                        report("%U : setting visual to %a, %a",unicode,visual,description)
                        char.visual = visual
                    end
                end
                -- mathextensible
                if category == "sm" or (category == "so" and char.mathclass) then
                    local mathextensible = char.mathextensible
                    if mathextensible then
                        -- already done
                    elseif find(description,"ABOVE") then
                        -- skip
                    elseif find(description,"ARROWHEAD") then
                        -- skip
                    elseif find(description,"HALFWIDTH") then
                        -- skip
                    elseif find(description,"ANGLE") then
                        -- skip
                    elseif find(description,"THROUGH") then
                        -- skip
                    elseif find(description,"ARROW") then
                        -- ARROWHEAD and HALFWIDTH can't show up here: they are
                        -- filtered by the branches above
                        local u = find(description,"UP")
                        local d = find(description,"DOWN")
                        local l = find(description,"LEFT")
                        local r = find(description,"RIGHT")
                        if u and d then
                            if l or r then
                                mathextensible = 'm' -- mixed
                            else
                                mathextensible = 'v' -- vertical
                            end
                        elseif u then
                            if l or r then
                                mathextensible = 'm' -- mixed
                            else
                                mathextensible = "u" -- up
                            end
                        elseif d then
                            if l or r then
                                mathextensible = 'm' -- mixed
                            else
                                mathextensible = "d" -- down
                            end
                        elseif l and r then
                            mathextensible = "h"     -- horizontal
                        elseif r then
                            mathextensible = "r"     -- right
                        elseif l then
                            mathextensible = "l"     -- left
                        end
                        if mathextensible then
                            report("%U : setting mathextensible to %a, %a",unicode,mathextensible,description)
                            char.mathextensible = mathextensible
                        end
                    end
                end
            end
        end
    end
    -- we need the hash .. add missing specials
    for unicode, data in table.sortedhash(characterdata) do
        if not data.specials or data.comment and find(data.comment,"check special") then
            local description = data.description
            local b, m
            if description then -- an entry without description can't be matched
                b, m = match(description,"^(.+) WITH (.+)$")
            end
            if b and m and (find(b,"^LATIN") or find (b,"^CYRILLIC")) then
                local base = descriptions[b]
                local mark = descriptions[m]
                if not mark and m == "STROKE" then
                    mark = descriptions["SOLIDUS"] -- SLASH
                end
                if base and mark then
                 -- report("adding extra char special for %a",description)
                    data.specials = { "with", base, mark }
                    data.comment  = nil
                end
            end
        end
    end
    --
    for i=1,#standardizedvariants do
        local si = standardizedvariants[i]
        local pair, addendum = si[1], strip(si[2])
        local first, second = lpegmatch(split_space_two,pair) -- string.splitup(pair," ")
        first = tonumber(first,16)
        second = tonumber(second,16)
        -- both code points are needed: 'second' is used as table key below
        if first and second then
            local d = characterdata[first]
            if d then
             -- local v = d.variants
                local v = rawget(d,"variants")
                if not v then
                    v = { }
                    d.variants = v
                end
                if not v[second] then
                    report("%U : adding variant %U as %s, %a",first,second,addendum,d.description)
                    v[second] = addendum
                end
            end
        end
    end
    for unicode, ud in table.sortedpairs(characterdata) do
        if not rawget(ud,"category") and rawget(ud,"variants") then
         -- report("stripping %U (variant, takes from metacharacter)",unicode)
            characterdata[unicode] = nil
        end
    end
end
500
501local preamble
502
-- Parse the content of a semicolon separated Unicode data file.
--
--   filename : only used in problem reports
--   str      : the content of the file
--   index    : when true the result is keyed by code point; a first field of
--              the form "XXXX..YYYY" is expanded so that every code point in
--              that range refers to the same list of fields. Otherwise the
--              result is an array with one list of fields per data line.
--
-- Comments (from '#' till the end of the line) are stripped and lines that
-- are empty afterwards are skipped. In indexed mode a line whose first field
-- is neither a hexadecimal number nor a range of two hexadecimal numbers is
-- reported and ignored.
local function splitdefinition(filename,str,index)
    local lines  = splitlines(str)
    local result = { }
    local count  = 0
    for i=1,#lines do
        local s = gsub(lines[i]," *#.*$","")
        if s ~= "" then
            local fields = lpegmatch(split_colon_table,s) -- split(s,";")
            if not index then
                count = count + 1
                result[count] = fields
            else
                local first = fields[1]
                local u = tonumber(first,16)
                if u then
                    result[u] = fields
                else
                 -- local b, e = match(first,"^([^%.]+)%.%.([^%.]+)$")
                    local b, e = lpegmatch(split_range_two,first)
                    -- convert before testing: a non-hexadecimal boundary would
                    -- otherwise make the numeric for loop raise an error
                    b = b and tonumber(b,16)
                    e = e and tonumber(e,16)
                    if b and e then
                        for k=b,e do
                            result[k] = fields
                        end
                    else
                        report("problem: %i %s => %s",i,filename,s)
                    end
                end
            end
        end
    end
    return result
end
542
-- Turn the content of index.txt into a table that maps names to code points.
--
-- A line is expected to look like "A, B<tab>XXXX" or "A<tab>XXXX". The part
-- after the comma is moved to the front, so "A, B" is stored as "B A", with
-- whitespace collapsed. Lines that don't match are ignored.
local function splitindex(str)
    -- ok, quick and dirty ... could be a nice lpeg instead
    local names = { }
    local lines = splitlines(str)
    for i=1,#lines do
        local tail, head, slot = match(lines[i],"([^%,]+)%,?(.-)\t(.*)")
        if tail and head and slot then
            local name = strip(head .. " " .. tail)
            name = gsub(name,"%s+"," ")
            names[name] = tonumber(slot,16)
        end
    end
    return names
end
558
-- Load char-def.lua and its helper modules and parse the Unicode text files.
--
-- Returns true when char-def.lua could be read and compiled, false otherwise
-- (in which case 'preamble' is reset to nil). On success:
--
--   * char-def.lua is run, followed by char-ini.lua, char-utf.lua and
--     char-cjk.lua
--   * 'preamble' holds the part of char-def.lua that precedes the
--     "characters.data = {" assignment
--   * textfiles, textdata and texttables hold, per data set, the resolved
--     file name, the raw content and the parsed table; texttables is what
--     scripts.unicode.update reads
--
-- A text file that can't be located gets an empty name and empty content.
function scripts.unicode.load()
    local fullname = resolvers.findfile("char-def.lua")
    report("using: %s",fullname)
    local data = io.loaddata(fullname)
    if not data then
        preamble = nil
        return false
    end
    -- compile first, so that a broken file gives a message instead of an
    -- "attempt to call a nil value" error
    local chunk, message = loadstring(data)
    if not chunk then
        report("unable to compile %s: %s",fullname,tostring(message))
        preamble = nil
        return false
    end
    chunk()
    --
    local helpers = { "char-ini.lua", "char-utf.lua", "char-cjk.lua" }
    for i=1,#helpers do
        local helpername = resolvers.findfile(helpers[i])
        report("using: %s",helpername)
        dofile(helpername)
    end
    --
    preamble = gsub(data,"characters%.data%s*=%s*%{.*","")
    --
    -- The order matters: files are located, loaded and parsed in this order.
    local datasets = {
        "unicodedata",
        "bidimirroring",
        "linebreak",
        "eastasianwidth",
        "standardizedvariants",
        "arabicshaping",
        "casefolding",
        "index",
    }
    --
    local files = { }
    for i=1,#datasets do
        local name = datasets[i]
        files[name] = resolvers.findfile(name .. ".txt") or ""
    end
    textfiles = files
    --
    local contents = { }
    for i=1,#datasets do
        local name = datasets[i]
        local file = files[name]
        contents[name] = file ~= "" and io.loaddata(file) or ""
    end
    textdata = contents
    --
    local tables = { }
    for i=1,#datasets do
        local name = datasets[i]
        if name == "index" then
            tables[name] = splitindex(contents[name])
        else
            -- only the standardized variants are kept as a plain list, the
            -- rest is keyed by code point
            tables[name] = splitdefinition(files[name],contents[name],name ~= "standardizedvariants")
        end
    end
    texttables = tables
    --
    for k, v in table.sortedhash(textfiles) do
        report("using: %s",v)
    end
    return true
end
621
622-- local variants_emoji={
623--   [0xFE0E]="text style",
624--   [0xFE0F]="emoji style",
625-- }
626--
627-- local variants_forms={
628--    [0xFE00]="corner-justified form",
629--    [0xFE01]="centered form",
630-- }
631
632-- local variants_style={
633--    [0xFE00]="chancery style",
634--    [0xFE01]="roundhand style",
635-- }
636
637-- local variants_90={
638--    [0xFE00]="rotated 90 degrees",
639-- }
640--
641-- local variants_180={
642--    [0xFE01]="rotated 180 degrees",
643-- }
644--
645-- local variants_270={
646--    [0xFE02]="rotated 270 degrees",
647-- }
648--
649-- local variants_expanded={
650--    [0xFE00]="expanded",
651-- }
652--
653-- local variants_90_180={
654--    [0xFE00]="rotated 90 degrees",
655--    [0xFE01]="rotated 180 degrees",
656-- }
657--
658-- local variants_90_180_270={
659--    [0xFE00]="rotated 90 degrees",
660--    [0xFE01]="rotated 180 degrees",
661--    [0xFE02]="rotated 270 degrees",
662-- }
663--
664-- local variants_180_270={
665--    [0xFE01]="rotated 180 degrees",
666--    [0xFE02]="rotated 270 degrees",
667-- }
668--
669-- local variants_90_270={
670--    [0xFE00]="rotated 90 degrees",
671--    [0xFE02]="rotated 270 degrees",
672-- }
673
-- Serializes characters.data (with the hexify and noquotes options) and saves
-- it, prefixed by the preamble, in the given file. Nothing is written when
-- there is no preamble.
--
-- Variant subtables that occur over and over again in the serialized data are
-- replaced by a reference to a shared table (variants_emoji, variants_forms,
-- variants_90_180, ...); presumably the preamble provides those definitions.

function scripts.unicode.save(filename)
    if not preamble then
        return
    end
    -- { pattern, replacement } pairs, applied in this order
    local substitutions = {
        {
            "%{%s+%[0xFE0E%]=\"text style\",%s+%[0xFE0F%]=\"emoji style\",%s+%}",
            "variants_emoji",
        },
        {
            "%{%s+%[0xFE00%]=\"corner%-justified form\",%s+%[0xFE01%]=\"centered form\",%s+%}",
            "variants_forms",
        },
        {
            "%{%s+%[0xFE00%]=\"chancery style\",%s+%[0xFE01%]=\"roundhand style\",%s+%}",
            "variants_style",
        },
        {
            "%{%s+%[0xFE00%]=\"dotted form\",%s+%}",
            "variants_dotted",
        },
        {
            "%{%s+%[0xFE00%]=\"expanded\",%s+%}",
            "variants_expanded",
        },
        -- one, two or three rotations: the captured angles end up in the name
        {
            "%{%s+%[0xFE0%d%]=\"rotated (%d+) degrees\",%s+%}",
            "variants_%1",
        },
        {
            "%{%s+%[0xFE0%d%]=\"rotated (%d+) degrees\"," ..
              "%s*%[0xFE0%d%]=\"rotated (%d+) degrees\"," ..
              "%s+%}",
            "variants_%1_%2",
        },
        {
            "%{%s+%[0xFE0%d%]=\"rotated (%d+) degrees\"," ..
              "%s*%[0xFE0%d%]=\"rotated (%d+) degrees\"," ..
              "%s*%[0xFE0%d%]=\"rotated (%d+) degrees\"," ..
              "%s+%}",
            "variants_%1_%2_%3",
        },
    }
    local serialized = table.serialize(characters.data,"characters.data", {
        hexify   = true,
        noquotes = true,
    })
    for i=1,#substitutions do
        local substitution = substitutions[i]
        serialized = gsub(serialized,substitution[1],substitution[2])
    end
    io.savedata(filename,preamble .. serialized)
end
720
-- Adds information to the shared character tables that does not come from
-- the main data file. The function takes no arguments and returns nothing.
--
-- (1) When "blocks.txt" can be found its ranges are compared with
--     characters.blocks: unknown blocks are added and reported with a "+",
--     blocks with a different range or description are reported with a "?"
--     and blocks that we know but that the file does not list (any more) are
--     reported as obsolete.
--
-- (2) The lowercase names in texttables.index become synonyms of the entries
--     in characters.data that they point to, unless the name equals the
--     description of that entry.

function scripts.unicode.extras() -- old code
    --
    -- 0000..007F; Basic Latin
    -- 0080..00FF; Latin-1 Supplement
    -- 0100..017F; Latin Extended-A
    --
    local fullname = resolvers.findfile("blocks.txt") or ""
    if fullname ~= "" then
        local data   = io.loaddata(fullname) or ""
        local lines  = splitlines(data)
        local seen   = { } -- names of the blocks mentioned in the file
        local blocks = characters.blocks
        local result = { }
        for i=1,#lines do
            local line = gsub(lines[i]," *#.*$","") -- strip comments
            if line ~= "" then
                local specification = lpegmatch(split_colon_table,line) -- split(s,";")
                local range         = specification and specification[1]
                local description   = specification and specification[2]
                if range and description then
                    local start, stop = lpegmatch(split_range_two,range)
                    if start and stop then
                        local start = tonumber(start,16)
                        local stop  = tonumber(stop,16)
                        -- the key is the description reduced to its lowercase letters
                        local name  = gsub(lower(description),"[^a-z]+","")
                        if start and stop then
                            local b = blocks[name]
                            if not b then
                                result[#result+1] = formatters[ [[+ block: ["%s"] = { first = 0x%05X, last = 0x%05X, description = "%S" }]] ](name,start,stop,description)
                                blocks[name] = { first = start, last = stop, description = description }
                            elseif b.first ~= start or b.last ~= stop or b.description ~= description then
                                result[#result+1] = formatters[ [[? block: ["%s"] = { first = 0x%05X, last = 0x%05X, description = "%S" }]] ](name,start,stop,description)
                            end
                        end
                        seen[name] = true
                    end
                end
            end
        end
        sort(result)
        for i=1,#result do
            report(result[i])
        end
        -- Walk over the blocks that we know and not over the names that we just
        -- read: those are (nearly) always present in the table, so checking them
        -- would never reveal an obsolete block.
        for name in table.sortedhash(blocks) do
            if not seen[name] then
                report("obsolete block %a",name)
            end
        end
    end
    --
    local index = texttables.index
    local data  = characters.data
    -- only the all lowercase names are kept (removing existing keys while
    -- traversing with next is permitted)
    for k, v in next, index do
        if k ~= lower(k) then
            index[k] = nil
        end
    end
 -- for k, v in next, data do
 --     v.synonym  = nil
 --     v.synonyms = nil
 -- end
    for k, v in table.sortedhash(index) do
        local d = data[v]
        if d and d.description ~= upper(k) then
            local synonyms = d.synonyms
            if synonyms then
                -- append the name when it is not yet known and keep the list sorted
                local n = #synonyms
                local f = false
                for i=1,n do
                    if synonyms[i] == k then
                        f = true
                        break
                    end
                end
                if not f then
                    synonyms[n+1] = k
                end
             -- synonyms = table.unique(synonyms)
             -- d.synonyms = synonyms
                sort(synonyms)
            else
                d.synonyms = { k }
            end
        end
    end
end
810
do

    -- The patterns below pick the interesting lines out of "emoji-test.txt".
    -- Judging from the patterns such a line looks like:
    --
    --   <hex codes> ; fully-qualified # <emoji> <description>
    --
    -- Whatever does not match is skipped, one character at a time.

    local space       = P(" ")
    local spaces      = space^0
    local semicolon   = P(";")
    local hash        = P("#")
    local newline     = S("\n\r")

    -- an uppercase hexadecimal number, converted into a number
    local unicode     = Cs(R("09","AF")^1)/function(n) return tonumber(n,16) end
                      * spaces
    local components  = Ct (unicode^1)

 -- local rubish_a    = semicolon
 --                   * spaces
 --                   * P("Emoji_ZWJ_Sequence")
 --                   * spaces
 --                   * semicolon
 --                   * spaces
 -- local description = C((1 - (spaces * (hash+newline)))^1)
 -- local rubish_b    = (1-newline)^0
 --                   * newline^1
 --
 -- local pattern_1   = Ct ( (
 --     Cf ( Ct("") *
 --         Cg (Cc("components") * components)
 --       * rubish_a
 --       * Cg (Cc("description") * description )
 --       * rubish_b
 --     , rawset)
 --     + P(1) )^1 )

    local rubish_a    = semicolon
                      * spaces
                      * P("non-")^0 * P("fully-qualified")
                      * spaces
                      * hash
                      * spaces
    local textstring  = C((1 - space)^1)
                      * spaces
    -- the remainder of the line (without trailing spaces), lowercased
    local description = ((1 - (spaces * newline))^1) / string.lower
    local rubish_b    = (1-newline)^0
                      * newline^1

    -- collects a list of { components = , textstring = , description = } tables
    local pattern_2   = Ct ( (
        Cf ( Ct("") *
            Cg (Cc("components") * components)
          * rubish_a
          * Cg (Cc("textstring") * textstring)
          * Cg (Cc("description") * description )
          * rubish_b
        , rawset)
        + P(1) )^1 )

    -- Converts "emoji-test.txt" into a table that maps descriptions onto lists
    -- of code points and saves that table in the given file. When an existing
    -- "char-emj.lua" can be located, everything in front of its first "return "
    -- is kept and only the table is replaced. Nothing is saved when the test
    -- file cannot be located or has no content.

    function scripts.unicode.emoji(filename)

        local name = resolvers.findfile("emoji-test.txt") or ""
        if name == "" then
            return
        end
        local l = io.loaddata(name) or ""
        local t = lpegmatch(pattern_2,l)
        if not t then
            -- the pattern needs at least one character so this is an empty file
            report("no usable data in %a",name)
            return
        end

        local list = { } -- not called hash: that name is taken by a pattern

        -- an "e" followed by digits and periods and then spaces: presumably
        -- the (lowercased) version tag that comes with a description
        local crap = lpeg.P("e") * lpeg.R("09","..")^1 * lpeg.P(" ")^1

        local replace = lpeg.replacer {
            [crap] = "",
            ["#"]  = "hash",
            ["*"]  = "asterisk",
        }

        for i=1,#t do
            local v = t[i]
            local d = v.description
            local k = lpegmatch(replace,d) or d
            list[k] = v.components
        end
        local new     = table.serialize(list,"return", { hexify = true })
        local oldname = resolvers.findfile("char-emj.lua") or ""
        local old     = oldname ~= "" and io.loaddata(oldname) or ""
        if old ~= "" then
            -- A function is used as replacement because in a replacement string
            -- a percent sign that shows up in the serialized data would be
            -- interpreted as a reference to a capture.
            new = gsub(old,"^(.-)return .*$",function(before)
                return before .. new
            end)
        end
        io.savedata(filename,new)
    end

end
898
-- the action: either the help information gets exported or the data is
-- loaded, updated and saved in new files in the current directory

local filename   = environment.files[1]
local exporthelp = environment.arguments.exporthelp

if exporthelp then
    application.export(exporthelp,filename)
else
    local newdefinitions = "char-def-new.lua"
    local newemoji       = "char-emj-new.lua"
    report("start working on %a, input char-def.lua",lfs.currentdir())
    if not scripts.unicode.load() then
        report("nothing to do")
    else
        scripts.unicode.update()
        scripts.unicode.extras()
        scripts.unicode.save(newdefinitions)
        scripts.unicode.emoji(newemoji)
        report("saved file %a",newdefinitions)
        report("saved file %a (current 15.1, check for updates, see above!)",newemoji)
    end
    report("stop working on %a\n",lfs.currentdir())
end
918end
919