mtx-unicode.lua /size: 40 Kb    last modification: 2025-02-21 11:03
-- Make sure the global registry exists before we add our own record to it.

modules = modules or { }

-- Registration record of this script.

modules['mtx-unicode'] = {
    version   = 1.002,
    comment   = "companion to mtxrun.lua",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9-- This is very old code that I started writing in 2005 but occasionally
10-- extended. Don't use it yourself, it's just a sort of reference. The
11-- data that we use in ConTeXt is more extensive.
12--
13-- In my local tree I keep files in places like this:
14--
15--    e:/tex-context/tex/texmf-local/data/unicode/blocks.txt
16--
17-- curl -o arabicshaping.txt             http://www.unicode.org/Public/UNIDATA/ArabicShaping.txt
18-- curl -o bidibrackets.txt              http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
19-- curl -o bidicharactertest.txt         http://www.unicode.org/Public/UNIDATA/BidiCharacterTest.txt
20-- curl -o bidimirroring.txt             http://www.unicode.org/Public/UNIDATA/BidiMirroring.txt
21-- curl -o biditest.txt                  http://www.unicode.org/Public/UNIDATA/BidiTest.txt
22-- curl -o blocks.txt                    http://www.unicode.org/Public/UNIDATA/Blocks.txt
23-- curl -o cjkradicals.txt               http://www.unicode.org/Public/UNIDATA/CJKRadicals.txt
24-- curl -o casefolding.txt               http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
25-- curl -o compositionexclusions.txt     http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
26-- curl -o derivedage.txt                http://www.unicode.org/Public/UNIDATA/DerivedAge.txt
27-- curl -o derivedcoreproperties.txt     http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
28-- curl -o derivednormalizationprops.txt http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
29-- curl -o eastasianwidth.txt            http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
30-- curl -o emojisources.txt              http://www.unicode.org/Public/UNIDATA/EmojiSources.txt
31-- curl -o hangulsyllabletype.txt        http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt
32-- curl -o index.txt                     http://www.unicode.org/Public/UNIDATA/Index.txt
33-- curl -o indicpositionalcategory.txt   http://www.unicode.org/Public/UNIDATA/IndicPositionalCategory.txt
34-- curl -o indicsyllabiccategory.txt     http://www.unicode.org/Public/UNIDATA/IndicSyllabicCategory.txt
35-- curl -o jamo.txt                      http://www.unicode.org/Public/UNIDATA/Jamo.txt
36-- curl -o linebreak.txt                 http://www.unicode.org/Public/UNIDATA/LineBreak.txt
37-- curl -o namealiases.txt               http://www.unicode.org/Public/UNIDATA/NameAliases.txt
38-- curl -o namedsequences.txt            http://www.unicode.org/Public/UNIDATA/NamedSequences.txt
39-- curl -o namedsequencesprov.txt        http://www.unicode.org/Public/UNIDATA/NamedSequencesProv.txt
40-- curl -o nameslist.html                http://www.unicode.org/Public/UNIDATA/NamesList.html
41-- curl -o nameslist.txt                 http://www.unicode.org/Public/UNIDATA/NamesList.txt
42-- curl -o normalizationcorrections.txt  http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
43-- curl -o normalizationtest.txt         http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
44-- curl -o proplist.txt                  http://www.unicode.org/Public/UNIDATA/PropList.txt
45-- curl -o propertyaliases.txt           http://www.unicode.org/Public/UNIDATA/PropertyAliases.txt
46-- curl -o propertyvaluealiases.txt      http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
47-- curl -o readme.txt                    http://www.unicode.org/Public/UNIDATA/ReadMe.txt
48-- curl -o scriptextensions.txt          http://www.unicode.org/Public/UNIDATA/ScriptExtensions.txt
49-- curl -o scripts.txt                   http://www.unicode.org/Public/UNIDATA/Scripts.txt
50-- curl -o specialcasing.txt             http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt
51-- curl -o standardizedvariants.html     http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html
52-- curl -o standardizedvariants.txt      http://www.unicode.org/Public/UNIDATA/StandardizedVariants.txt
53-- curl -o tangutsources.txt             http://www.unicode.org/Public/UNIDATA/TangutSources.txt
54-- curl -o ucd.zip                       http://www.unicode.org/Public/UNIDATA/UCD.zip
55-- curl -o usourcedata.txt               http://www.unicode.org/Public/UNIDATA/USourceData.txt
56-- curl -o usourceglyphs.pdf             http://www.unicode.org/Public/UNIDATA/USourceGlyphs.pdf
57-- curl -o unicodedata.txt               http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
58-- curl -o unihan.zip                    http://www.unicode.org/Public/UNIDATA/Unihan.zip
59--
60-- curl -o emoji-data.txt                http://unicode.org/Public/emoji/12.0/emoji-data.txt
61-- curl -o emoji-sequences.txt           http://unicode.org/Public/emoji/12.0/emoji-sequences.txt
62-- curl -o emoji-variation-sequences.txt http://unicode.org/Public/emoji/12.0/emoji-variation-sequences.txt
63-- curl -o emoji-zwj-sequences.txt       http://unicode.org/Public/emoji/12.0/emoji-zwj-sequences.txt
64-- curl -o emoji-test.txt                http://unicode.org/Public/emoji/12.0/emoji-test.txt
65--
66-- todo:
67--
68--    specialcasing ?
69
-- The xml blob that is handed to logs.application as help information. The
-- detail entry names char-def.lua, the same file that the banner mentions and
-- that scripts.unicode.load reads.

local helpinfo = [[
<?xml version="1.0"?>
<application>
 <metadata>
  <entry name="name">mtx-unicode</entry>
  <entry name="detail">Checker for char-def.lua</entry>
  <entry name="version">1.02</entry>
 </metadata>
 <flags>
  <category name="basic">
   <subcategory>
    <flag name="whatever"><short>do whatever</short></flag>
   </subcategory>
  </category>
 </flags>
</application>
]]
87
-- The application object: we only use its report function in this file.

local application = logs.application({
    helpinfo = helpinfo,
    banner   = "Checker for char-def.lua 1.02",
    name     = "mtx-unicode",
})
93
94local gmatch, match, gsub, find, lower, upper, format = string.gmatch, string.match, string.gsub, string.find, string.lower, string.upper, string.format
95local concat, sort, sortedhash = table.concat, table.sort, table.sortedhash
96local split, splitlines, strip = string.split, string.splitlines, string.strip
97local are_equal = table.are_equal
98local tonumber, tostring, rawget = tonumber, tostring, rawget
99local lpegmatch = lpeg.match
100local P, C, S, R, Cs, Ct, Cg, Cf, Cc = lpeg.P, lpeg.C, lpeg.S, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cg, lpeg.Cf, lpeg.Cc
101local formatters = string.formatters
102local utfchar = utf.char
103
local report = application.report

-- Global namespaces: tables that already exist are reused, missing ones are
-- created.

if not scripts         then scripts         = { } end
if not scripts.unicode then scripts.unicode = { } end

if not characters      then characters      = { } end
if not characters.data then characters.data = { } end

if not fonts           then fonts           = { } end
if not fonts.encodings then fonts.encodings = { } end

-- These two get replaced by scripts.unicode.load.

local textfiles = { }
local textdata  = { }

-- When set, the direction "l" is not stored (see scripts.unicode.update).

local sparse = false

-- The splitters used when parsing the text files.

local optionalspaces    = P(" ")^0

local split_space_table = lpeg.tsplitat(" ")
local split_space_two   = lpeg.splitat (" ")
local split_range_two   = lpeg.splitat ("..")
local split_colon_table = lpeg.tsplitat(optionalspaces * P(";") * optionalspaces)

-- Code points that scripts.unicode.update leaves alone.

local skipped = { }

local skippedranges = {
    { 0x002C6, 0x002C7 }, -- MODIFIER LETTER CIRCUMFLEX ACCENT, CARON
    { 0x0FE00, 0x0FE0F }, -- variant selector
    { 0xE0100, 0xE01EF }, -- variant selector extension
}

for r=1,#skippedranges do
    local range = skippedranges[r]
    for unicode=range[1],range[2] do
        skipped[unicode] = true
    end
end
132
-- Helper for scripts.unicode.update: turns the sixth field of a unicodedata
-- record into a specials table. An empty field gives nil. When the first word
-- is a hexadecimal number the result is "char" followed by all the numbers,
-- otherwise the first word is used as tag (lowercased, angle brackets removed)
-- and the remaining words are converted to numbers.

local function parsespecials(str)
    if str == "" then
        return nil
    end
    local specials = lpegmatch(split_space_table,str) -- split(specials," ")
    if tonumber(specials[1],16) then
        -- move everything one slot up so that the tag fits in front
        for i=#specials,1,-1 do
            specials[i+1] = tonumber(specials[i],16)
        end
        specials[1] = "char"
    else
        specials[1] = lower(gsub(specials[1],"[<>]",""))
        for i=2,#specials do
            specials[i] = tonumber(specials[i],16)
        end
    end
    return specials
end

-- Helper for scripts.unicode.update: derives lccode and uccode from a
-- casefolding record. Both results are false when nothing applies. The
-- statuses "C" and "S" give a number, status "F" gives a table of numbers and
-- other statuses are ignored. A lowercase letter ("ll") gets the folding as
-- uccode and an uppercase letter ("lu") gets it as lccode.

local function foldedcases(cf,unicode,category)
    local lccode = false
    local uccode = false
    if cf and tonumber(cf[1],16) == unicode then
        local how = cf[2]
        if how == "C" or how == "S" then
            local fold = tonumber(cf[3],16)
            if fold == unicode then
                -- folds onto itself so there is nothing to set
            elseif category == "ll" then
                uccode = fold
            elseif category == "lu" then
                lccode = fold
            end
        elseif how == "F" then
            -- we can use the first
            local folding = { }
            for s in gmatch(cf[3],"%S+") do
                folding[#folding+1] = tonumber(s,16)
            end
            if category == "ll" then
                uccode = folding
            elseif category == "lu" then
                lccode = folding
            end
        else
            -- we skip these
        end
    end
    return lccode, uccode
end

-- Helper for scripts.unicode.update: brings char[key] ("lccode" or "uccode")
-- in sync with a new value that is either a number or a table of numbers. A
-- false or nil value leaves the entry untouched: we never reset a case code.

local function synccase(char,unicode,description,key,value)
    if not value then
        return
    end
    local old = char[key]
    if type(value) == "table" then
        if type(old) ~= "table" or not are_equal(value,old) then
            report("%U : setting " .. key .. " to % t, %a",unicode,value,description)
            char[key] = value
        end
    elseif old ~= value then
        report("%U : setting " .. key .. " to %a, %a",unicode,value,description)
        char[key] = value
    end
end

-- Helper for scripts.unicode.update: brings the simple property char[key] in
-- sync with value. A value that differs is set, a missing value removes the
-- property, and both actions are reported. The label is what shows up in the
-- report and defaults to the key.

local function syncfield(char,unicode,description,key,value,label)
    local old = char[key]
    label = label or key
    if value then
        if value ~= old then
            report("%U : setting " .. label .. " to %a, %a",unicode,value,description)
            char[key] = value
        end
    elseif old then
        report("%U : resetting " .. label .. " from %a, %a",unicode,old,description)
        char[key] = nil
    end
end

-- Helper for scripts.unicode.update: guesses the mathextensible property from
-- words in the description. Only descriptions with ARROW qualify and some
-- words rule a character out. The result is one of "m" (mixed), "v", "u", "d",
-- "h", "r", "l" or nil when there is no guess.

local function guessextensible(description)
    if find(description,"ABOVE")
    or find(description,"ARROWHEAD")
    or find(description,"HALFWIDTH")
    or find(description,"ANGLE")
    or find(description,"THROUGH") then
        return nil
    end
    if not find(description,"ARROW") then
        return nil
    end
    local u = find(description,"UP")
    local d = find(description,"DOWN")
    local l = find(description,"LEFT")
    local r = find(description,"RIGHT")
    if u or d then
        if l or r then
            return "m" -- mixed
        elseif u and d then
            return "v" -- vertical
        elseif u then
            return "u" -- up
        else
            return "d" -- down
        end
    elseif l and r then
        return "h" -- horizontal
    elseif r then
        return "r" -- right
    elseif l then
        return "l" -- left
    end
    return nil
end

-- Merges the parsed text tables (the global texttables that is set up by
-- scripts.unicode.load) into characters.data. Unknown characters are added
-- and known ones get their properties synchronized, which is reported. After
-- that entries in ranges that only carry the common properties are wiped,
-- missing "with" specials are derived from descriptions, standardized
-- variants are registered and variant only entries are removed. There are no
-- arguments and there is no return value: characters.data is changed in place.

function scripts.unicode.update()
    local unicodedata          = texttables.unicodedata
    local bidimirroring        = texttables.bidimirroring
    local linebreaks           = texttables.linebreak
    local eastasianwidth       = texttables.eastasianwidth
    local standardizedvariants = texttables.standardizedvariants
    local arabicshaping        = texttables.arabicshaping
    local casefolding          = texttables.casefolding
    local characterdata        = characters.data
    --
    local descriptions         = { } -- description to code point, used for the specials below
    --
    for unicode, ud in table.sortedpairs(unicodedata) do
        if not skipped[unicode] then
            local char = rawget(characterdata,unicode)
            local description = ud[2] or formatters["UNICODE ENTRY %U"](unicode)
            -- descriptions that start with < are not processed
            if not find(description,"^<") then
                local ld        = linebreaks[unicode]
                local bd        = bidimirroring[unicode]
                local ed        = eastasianwidth[unicode]
                local as        = arabicshaping[unicode] -- there are more than arabic
                local category  = lower(ud[3] or "?")
                local combining = tonumber(ud[4])
                local direction = lower(ud[5] or "l") -- we could omit 'l' being the default
                local linebreak = ld and lower(ld[2] or "xx")
                local specials  = parsespecials(ud[6] or "")
                local cjkwd     = ed and lower(ed[2] or "n")
                local mirror    = bd and tonumber(bd[2],16)
                local arabic    = as and lower(as[3]) or nil
                local comment   = nil
                descriptions[description] = unicode
                -- defaults are not stored
                if sparse and direction == "l" then
                    direction = nil
                end
                if linebreak == "xx" then
                    linebreak = nil
                end
                if cjkwd == "n" then
                    cjkwd = nil
                end
                if not combining or combining == 0 then
                    combining = nil
                end
                if find(description,"MATHEMATICAL") then
                    comment = "check math properties"
                end
                --
                local lccode, uccode = foldedcases(casefolding[unicode],unicode,category)
                --
                if not char then
                    report("%U : adding entry %a",unicode,description)
                    char = {
                     -- adobename   = ,
                        category    = category,
                        comment     = comment,
                        cjkwd       = cjkwd,
                        description = description,
                        direction   = direction,
                        mirror      = mirror,
                        linebreak   = linebreak,
                        unicodeslot = unicode,
                        specials    = specials,
                        arabic      = arabic,
                        combining   = combining,
                        uccode      = uccode or nil,
                        lccode      = lccode or nil,
                    }
                    characterdata[unicode] = char
                else
                    -- we have more case mapping (e.g. cherokee)
                    synccase (char,unicode,description,"lccode",lccode)
                    synccase (char,unicode,description,"uccode",uccode)
                    syncfield(char,unicode,description,"direction",direction)
                    syncfield(char,unicode,description,"mirror",mirror)
                    syncfield(char,unicode,description,"linebreak",linebreak)
                    syncfield(char,unicode,description,"cjkwd",cjkwd,"cjkwd of") -- wording of the report kept as it was
                    syncfield(char,unicode,description,"arabic",arabic)
                    syncfield(char,unicode,description,"combining",combining)
                    if specials then
                        if not char.specials or not are_equal(specials,char.specials) then
                            local t = { specials[1] } for i=2,#specials do t[i] = formatters["%U"](specials[i]) end
                            report("%U : setting specials to % + t, %a",unicode,t,description)
                            char.specials = specials
                        end
                    elseif char.specials then
                        -- we don't reset existing specials but flag them for inspection
                        local comment = char.comment
                        if not comment then
                            char.comment = "check special"
                        elseif not find(comment,"check special") then
                            char.comment = comment .. ", check special"
                        end
                    end
                end
                --
                local visual = char.visual
                if not visual and find(description,"MATH") then
                    if find(description,"BOLD ITALIC") then
                        visual = "bi"
                    elseif find(description,"ITALIC") then
                        visual = "it"
                    elseif find(description,"BOLD") then
                        visual = "bf"
                    end
                    if visual then
                        report("%U : setting visual to %a, %a",unicode,visual,description)
                        char.visual = visual
                    end
                end
                -- mathextensible
                if category == "sm" or (category == "so" and char.mathclass) then
                    if not char.mathextensible then
                        local mathextensible = guessextensible(description)
                        if mathextensible then
                            report("%U : setting mathextensible to %a, %a",unicode,mathextensible,description)
                            char.mathextensible = mathextensible
                        end
                    end
                end
            end
        end
    end
    -- entries in a range that agree with the common properties of that range
    -- are wiped (the description is not compared); table values only match
    -- when they are the same table so in doubt we keep the entry
    for i=1,#characters.ranges do
        local data   = characters.ranges[i]
        local common = rawget(data,"common")
        if common then
            for unicode=data.first,data.last do
                local chardata = rawget(characterdata,unicode)
                if chardata then
                    local same = true
                    for k, v in next, common do
                        if k == "description" then
                            -- skip
                        elseif chardata[k] == v then
                            -- okay
                        else
                            same = false
                            break
                        end
                    end
                    if same then
                        report("%U : wipe %s",unicode,chardata.description)
                        rawset(characterdata,unicode,nil)
                    else
                        report("%U : keep %s",unicode,chardata.description)
                    end
                end
            end
        end
    end
    -- we need the hash .. add missing specials
    for unicode, data in sortedhash(characterdata) do
        if not data.specials or data.comment and find(data.comment,"check special") then
            local description = data.description
            if description then
                local b, m = match(description,"^(.+) WITH (.+)$")
                if b and m and (find(b,"^LATIN") or find (b,"^CYRILLIC")) then
                    local base = descriptions[b]
                    local mark = descriptions[m]
                    if not mark and m == "STROKE" then
                        mark = descriptions["SOLIDUS"] -- SLASH
                    end
                    if base and mark then
                     -- report("adding extra char special for %a",description)
                        data.specials = { "with", base, mark }
                        data.comment  = nil
                    end
                end
            end
        end
    end
    -- the first field of a standardized variant record has two code points:
    -- the character and the selector
    for i=1,#standardizedvariants do
        local si = standardizedvariants[i]
        local pair, addendum = si[1], strip(si[2] or "")
        local first, second = lpegmatch(split_space_two,pair) -- string.splitup(pair," ")
        first  = first  and tonumber(first,16)
        second = second and tonumber(second,16)
        if first and second then
            local d = characterdata[first]
            if d then
             -- local v = d.variants
                local v = rawget(d,"variants")
                if not v then
                    v = { }
                    d.variants = v
                end
                if not v[second] then
                    report("%U : adding variant %U as %s, %a",first,second,addendum,d.description)
                    v[second] = addendum
                end
            end
        end
    end
    -- entries that have variants but no category of their own are dropped
    for unicode, ud in table.sortedpairs(characterdata) do
        if not rawget(ud,"category") and rawget(ud,"variants") then
         -- report("stripping %U (variant, takes from metacharacter)",unicode)
            characterdata[unicode] = nil
        end
    end
end
526
527local preamble
528
-- Splits the content of a semicolon separated text file into records. The
-- part of a line that starts at a # (and spaces before it) is dropped and
-- lines that end up empty are ignored. Each record is a table with the fields
-- of a line, where spaces around the semicolons are removed.
--
-- filename : only used in problem reports
-- str      : the content of the file
-- index    : when true the result is a hash that maps code points to records,
--            where a first field like "XXXX..YYYY" stores the same record for
--            every code point in that range; otherwise the result is a list
--            of records in file order
--
-- Lines with a first field that is neither a hexadecimal number nor a range
-- with hexadecimal bounds are reported and skipped.

local function splitdefinition(filename,str,index)
    local lines  = splitlines(str)
    local result = { }
    local count  = 0
    for i=1,#lines do
        local s = gsub(lines[i]," *#.*$","")
        if s ~= "" then
            local d = lpegmatch(split_colon_table,s) -- split(s,";")
            if not index then
                count = count + 1
                result[count] = d
            else
                local o = d[1]
                local u = tonumber(o,16)
                if u then
                    result[u] = d
                else
                    local b, e = lpegmatch(split_range_two,o)
                    -- both bounds have to be valid numbers before we can loop
                    b = b and tonumber(b,16)
                    e = e and tonumber(e,16)
                    if b and e then
                        for k=b,e do
                            result[k] = d
                        end
                    else
                        report("problem: %i %s => %s",i,filename,s)
                    end
                end
            end
        end
    end
    return result
end
568
-- Converts the content of the index file into a hash that maps names to
-- numbers. A line has a name, a tab and a hexadecimal number. When the name
-- contains a comma, the part after the first comma is put in front of the
-- part before it. Lines that don't match are ignored.

local function splitindex(str)
    -- ok, quick and dirty ... could be a nice lpeg instead
    local names = { }
    local lines = splitlines(str)
    for i=1,#lines do
        local tail, head, code = match(lines[i],"([^%,]+)%,?(.-)\t(.*)")
        if tail and head and code then
            -- swap the two parts and collapse the whitespace
            local name = strip(head .. " " .. tail)
            name = gsub(name,"%s+"," ")
            names[name] = tonumber(code,16)
        end
    end
    return names
end
584
-- Loads everything that scripts.unicode.update needs: char-def.lua is located,
-- loaded and run, then the companion character files are run, and finally the
-- text files are located, loaded and split into tables.
--
-- On success the file level variables preamble (the part of char-def.lua in
-- front of the "characters.data = {" assignment), textfiles and textdata are
-- set, as well as the global texttables, and true is returned. When
-- char-def.lua can't be loaded or compiled, preamble is reset and false is
-- returned. A text file that can't be found ends up as empty data.

function scripts.unicode.load()
    local fullname = resolvers.findfile("char-def.lua")
    report("using: %s",fullname)
    local data = io.loaddata(fullname)
    if not data then
        preamble = nil
        return false
    end
    -- a broken file should not result in calling a nil value
    local definitions, message = loadstring(data)
    if not definitions then
        report("unable to compile %s: %s",fullname,message)
        preamble = nil
        return false
    end
    definitions()
    -- these are run in this order
    local companions = { "char-ini.lua", "char-utf.lua", "char-cjk.lua", "char-ran.lua" }
    for i=1,#companions do
        local fullname = resolvers.findfile(companions[i])
        report("using: %s",fullname)
        dofile(fullname)
    end
    --
    preamble = gsub(data,"characters%.data%s*=%s*%{.*","")
    -- name of the table (and file), followed by the index argument that is
    -- passed to splitdefinition; the index file has its own splitter
    local sources = {
        { "unicodedata",          true  },
        { "bidimirroring",        true  },
        { "linebreak",            true  },
        { "eastasianwidth",       true  },
        { "standardizedvariants", false },
        { "arabicshaping",        true  },
        { "casefolding",          true  },
        { "index"                       },
    }
    local files    = { }
    local contents = { }
    local tables   = { }
    for i=1,#sources do
        local source   = sources[i]
        local name     = source[1]
        local filename = resolvers.findfile(name .. ".txt") or ""
        local content  = filename ~= "" and io.loaddata(filename) or ""
        files   [name] = filename
        contents[name] = content
        if name == "index" then
            tables[name] = splitindex(content)
        else
            tables[name] = splitdefinition(filename,content,source[2])
        end
    end
    textfiles  = files
    textdata   = contents
    texttables = tables -- a global, scripts.unicode.update picks it up as such
    --
    for k, v in sortedhash(textfiles) do
        report("using: %s",v)
    end
    return true
end
651
652-- local variants_emoji={
653--   [0xFE0E]="text style",
654--   [0xFE0F]="emoji style",
655-- }
656--
657-- local variants_forms={
658--    [0xFE00]="corner-justified form",
659--    [0xFE01]="centered form",
660-- }
661
662-- local variants_style={
663--    [0xFE00]="chancery style",
664--    [0xFE01]="roundhand style",
665-- }
666
667-- local variants_90={
668--    [0xFE00]="rotated 90 degrees",
669-- }
670--
671-- local variants_180={
672--    [0xFE01]="rotated 180 degrees",
673-- }
674--
675-- local variants_270={
676--    [0xFE02]="rotated 270 degrees",
677-- }
678--
679-- local variants_expanded={
680--    [0xFE00]="expanded",
681-- }
682--
683-- local variants_90_180={
684--    [0xFE00]="rotated 90 degrees",
685--    [0xFE01]="rotated 180 degrees",
686-- }
687--
688-- local variants_90_180_270={
689--    [0xFE00]="rotated 90 degrees",
690--    [0xFE01]="rotated 180 degrees",
691--    [0xFE02]="rotated 270 degrees",
692-- }
693--
694-- local variants_180_270={
695--    [0xFE01]="rotated 180 degrees",
696--    [0xFE02]="rotated 270 degrees",
697-- }
698--
699-- local variants_90_270={
700--    [0xFE00]="rotated 90 degrees",
701--    [0xFE02]="rotated 270 degrees",
702-- }
703
-- Serialize characters.data and write it, prefixed by the preamble, to the
-- given file. Nothing happens when no preamble is available.
--
-- filename : name of the file that gets written

function scripts.unicode.save(filename)
    if not preamble then
        return
    end
    --
 -- for k, v in next, characters.data do
 --     v.adobename = nil
 -- end
    --
    -- Three entries get an explicit multi character uccode; for 0x00DF that
    -- is 0x53 0x53 ("SS").
    --
    local chardata = characters.data
    chardata[0x1FE3].uccode = { 0x3C5, 0x308, 0x301 }
    chardata[0x1FD3].uccode = { 0x3B9, 0x308, 0x301 }
    chardata[0x00DF].uccode = { 0x53, 0x53 }
    --
    local serialized = table.serialize(characters.data,"characters.data", {
        hexify   = true,
        noquotes = true,
    })
    --
    -- Recurring variant subtables are replaced by a symbolic name (these
    -- names are presumably defined in the preamble, to be confirmed). The
    -- order matters and is the same as the order of the original calls.
    --
    local substitutions = {
        {
            "%{%s+%[0xFE0E%]=\"text style\",%s+%[0xFE0F%]=\"emoji style\",%s+%}",
            "variants_emoji"
        },
        {
            "%{%s+%[0xFE00%]=\"corner%-justified form\",%s+%[0xFE01%]=\"centered form\",%s+%}",
            "variants_forms"
        },
        {
            "%{%s+%[0xFE00%]=\"chancery style\",%s+%[0xFE01%]=\"roundhand style\",%s+%}",
            "variants_style"
        },
        {
            "%{%s+%[0xFE00%]=\"dotted form\",%s+%}",
            "variants_dotted"
        },
        {
            "%{%s+%[0xFE00%]=\"expanded\",%s+%}",
            "variants_expanded"
        },
        {
            -- one rotation
            "%{%s+%[0xFE0%d%]=\"rotated (%d+) degrees\",%s+%}",
            "variants_%1"
        },
        {
            -- two rotations
            "%{%s+%[0xFE0%d%]=\"rotated (%d+) degrees\"," ..
              "%s*%[0xFE0%d%]=\"rotated (%d+) degrees\"," ..
              "%s+%}",
            "variants_%1_%2"
        },
        {
            -- three rotations
            "%{%s+%[0xFE0%d%]=\"rotated (%d+) degrees\"," ..
              "%s*%[0xFE0%d%]=\"rotated (%d+) degrees\"," ..
              "%s*%[0xFE0%d%]=\"rotated (%d+) degrees\"," ..
              "%s+%}",
            "variants_%1_%2_%3"
        },
    }
    for i=1,#substitutions do
        local substitution = substitutions[i]
        serialized = gsub(serialized,substitution[1],substitution[2])
    end
    --
    io.savedata(filename,preamble .. serialized)
end
758
-- This one does two things: it compares blocks.txt with characters.blocks
-- (and reports what differs) and it adds the lowercase names found in the
-- index as synonyms to the character data.

function scripts.unicode.extras() -- old code
    --
    -- The lines in blocks.txt look like this:
    --
    -- 0000..007F; Basic Latin
    -- 0080..00FF; Latin-1 Supplement
    -- 0100..017F; Latin Extended-A
    --
    local blockfile = resolvers.findfile("blocks.txt") or ""
    if blockfile ~= "" then
        local content  = io.loaddata(blockfile)
        local entries  = splitlines(content)
        local seen     = { } -- names of all blocks that we came across
        local known    = characters.blocks
        local messages = { } -- collected first so that we can sort them
        for i=1,#entries do
            local entry = gsub(entries[i]," *#.*$","") -- strip the comment
            if entry ~= "" then
                local fields      = lpegmatch(split_colon_table,entry) -- split(s,";")
                local range       = fields[1]
                local description = fields[2]
                if range and description then
                    local from, upto = lpegmatch(split_range_two,range)
                    if from and upto then
                        local first = tonumber(from,16)
                        local last  = tonumber(upto,16)
                        -- the key is the lowercased description with only a-z kept
                        local name  = gsub(lower(description),"[^a-z]+","")
                        if first and last then
                            local block = known[name]
                            if not block then
                                -- a new block: report it and register it
                                messages[#messages+1] = formatters[ [[+ block: ["%s"] = { first = 0x%05X, last = 0x%05X, description = "%S" }]] ](name,first,last,description)
                                known[name] = { first = first, last = last, description = description }
                            elseif not (block.first == first and block.last == last and block.description == description) then
                                -- a known block that differs: only report it
                                messages[#messages+1] = formatters[ [[? block: ["%s"] = { first = 0x%05X, last = 0x%05X, description = "%S" }]] ](name,first,last,description)
                            end
                        end
                        seen[#seen+1] = name
                    end
                end
            end
        end
        sort(messages)
        for i=1,#messages do
            report(messages[i])
        end
        -- Every name with a valid range is known by now, so this only
        -- triggers for names that were neither known nor registered above.
        sort(seen)
        for i=1,#seen do
            local name = seen[i]
            if not known[name] then
                report("obsolete block %a",name)
            end
        end
    end
    --
    local index  = texttables.index
    local blocks = characters.blocks -- only of use for the disabled check below
    local data   = characters.data
    -- we only keep the keys that are completely lowercase
    for key in next, index do
        if lower(key) ~= key then
            index[key] = nil
        end
    end
 -- for k, v in next, data do
 --     v.synonym  = nil
 --     v.synonyms = nil
 -- end
    -- names that differ from the description become (sorted) synonyms
    for name, unicode in sortedhash(index) do
        local entry = data[unicode]
        if entry and entry.description ~= upper(name) then
            local synonyms = entry.synonyms
            if not synonyms then
                entry.synonyms = { name }
            else
                local count = #synonyms
                local found = false
                for i=1,count do
                    if synonyms[i] == name then
                        found = true
                        break
                    end
                end
                if not found then
                    synonyms[count+1] = name
                end
             -- synonyms = table.unique(synonyms)
             -- entry.synonyms = synonyms
                sort(synonyms)
            end
        end
    end
    --
    -- The next check on math blocks is disabled (and kept as it was):
    --
 -- for name, block in sortedhash(characters.blocks) do
 --     if block.math then
 --         local isalphabet = find(name,"lowercase") or find(name,"uppercase") or find(name,"letterlike")
 --         for unicode=block.first,block.last do
 --             local c = data[unicode]
 --             if c.mathspec and #c.mathspec == 0 then
 --                 c.mathspec = { c.mathspec }
 --             end
 --             if c.mathclass then
 --                 if isalphabet and c.mathclass ~= "variable" then
 --                     report("CHECK %C : %s",unicode,c.description)
 --                 end
 --             elseif c.mathspec then
 --                 -- skip
 --             else
 --                 report("%s : %C : %s",name,unicode,c.description)
 --                 if isalphabet then
 --                     c.mathclass = "variable"
 --                 end
 --             end
 --         end
 --         local gaps = block.gaps
 --         if gaps then
 --             for gap, unicode in sortedhash(gaps) do
 --                 local c = data[u]
 --                 if c.mathspec and #c.mathspec == 0 then
 --                     c.mathspec = { c.mathspec }
 --                 end
 --                 if c.mathclass then
 --                     if isalphabet and c.mathclass ~= "variable" then
 --                         report("CHECK %C : %s",gap,c.description)
 --                     end
 --                     -- skip
 --                 elseif c.mathspec then
 --                     -- skip
 --                 else
 --                     report("%s : %U -> %C : %s",name,gap,unicode,c.description)
 --                     if isalphabet then
 --                         c.mathclass = "variable"
 --                     end
 --                 end
 --             end
 --         end
 --     end
 -- end

end
895
do

    -- The patterns below pick up lines from emoji-test.txt that have this
    -- shape (other lines are skipped):
    --
    --   <hex codepoints> ; fully-qualified # <emoji> <description>

    local space       = P(" ")
    local spaces      = space^0
    local semicolon   = P(";")
    local hash        = P("#")
    local newline     = S("\n\r")

    -- a run of uppercase hex digits, converted to a number, plus optional spaces
    local unicode     = Cs(R("09","AF")^1)/function(n) return tonumber(n,16) end
                      * spaces
    -- one or more of these collected in a table
    local components  = Ct (unicode^1)

 -- local rubish_a    = semicolon
 --                   * spaces
 --                   * P("Emoji_ZWJ_Sequence")
 --                   * spaces
 --                   * semicolon
 --                   * spaces
 -- local description = C((1 - (spaces * (hash+newline)))^1)
 -- local rubish_b    = (1-newline)^0
 --                   * newline^1
 --
 -- local pattern_1   = Ct ( (
 --     Cf ( Ct("") *
 --         Cg (Cc("components") * components)
 --       * rubish_a
 --       * Cg (Cc("description") * description )
 --       * rubish_b
 --     , rawset)
 --     + P(1) )^1 )

    -- the "; fully-qualified # " (or "; non-fully-qualified # ") part
    local rubish_a    = semicolon
                      * spaces
                      * P("non-")^0 * P("fully-qualified")
                      * spaces
                      * hash
                      * spaces
    -- the first run of non spaces after the hash
    local textstring  = C((1 - space)^1)
                      * spaces
    -- the rest of the line (without trailing spaces), lowercased
    local description = ((1 - (spaces * newline))^1) / string.lower
    local rubish_b    = (1-newline)^0
                      * newline^1

    -- Each matching line becomes a table with the fields components,
    -- textstring and description; when there is no match at the current
    -- position one character is skipped.
    local pattern_2   = Ct ( (
        Cf ( Ct("") *
            Cg (Cc("components") * components)
          * rubish_a
          * Cg (Cc("textstring") * textstring)
          * Cg (Cc("description") * description )
          * rubish_b
        , rawset)
        + P(1) )^1 )

    -- Convert emoji-test.txt into a table that maps descriptions onto the
    -- list of code points and save it in the given file. When char-emj.lua
    -- can be found, everything before its "return" is kept in front of the
    -- new table.
    --
    -- filename : name of the file that gets written
    --
    -- Nothing is written when emoji-test.txt is missing or has no content.

    function scripts.unicode.emoji(filename)

        local name = resolvers.findfile("emoji-test.txt") or ""
        if name == "" then
            return
        end
        local l = io.loaddata(name)
        if not l or l == "" then
            -- without this check the match below fails on a missing string
            report("no data in %a",name)
            return
        end
        local t = lpegmatch(pattern_2,l)
        if not t then
            report("no data in %a",name)
            return
        end

        -- not called hash because that name is already used for a pattern
        local mapping = { }

        -- the descriptions are lowercased so this matches an "e" followed by
        -- digits and periods and spaces, like "e15.1 "
        local crap = lpeg.P("e") * lpeg.R("09","..","09")^1 * lpeg.P(" ")^1

        local replace = lpeg.replacer {
            [crap] = "",
            ["#"]  = "hash",
            ["*"]  = "asterisk",
        }

        for i=1,#t do
            local v = t[i]
            local d = v.description
            local k = lpegmatch(replace,d) or d
            mapping[k] = v.components
        end
        local new     = table.serialize(mapping,"return", { hexify = true })
        local oldname = resolvers.findfile("char-emj.lua") or ""
        local old     = oldname ~= "" and io.loaddata(oldname) or ""
        if old ~= "" then
            -- We use a function because the serialized data is no replacement
            -- template: a percent sign in it would be seen as a reference to
            -- a capture (or trigger an error).
            local merged, count = gsub(old,"^(.-)return .*$",function(before)
                return before .. new
            end)
            -- When the old file has no "return" we stick to the new data
            -- instead of saving the old file again.
            if count > 0 then
                new = merged
            end
        end
        io.savedata(filename,new)
    end

end
983
-- the action: we either export the help information or we load the data,
-- update it and save the new files in the current path

local filename = environment.files[1]

if not environment.arguments.exporthelp then
    local chardef = "char-def-new.lua"
    local charemj = "char-emj-new.lua"
    report("start working on %a, input char-def.lua",lfs.currentdir())
    if not scripts.unicode.load() then
        report("nothing to do")
    else
        -- the order matters: update and extras change the data that gets saved
        scripts.unicode.update()
        scripts.unicode.extras()
        scripts.unicode.save(chardef)
        scripts.unicode.emoji(charemj)
        report("saved file %a",chardef)
        report("saved file %a (current 15.1, check for updates, see above!)",charemj)
    end
    report("stop working on %a\n",lfs.currentdir())
else
    application.export(environment.arguments.exporthelp,filename)
end
1004