-- Register this script in the global module registry, creating the registry
-- when this is the first module that gets loaded.
modules = modules or { }

modules['mtx-unicode'] = {
    version   = 1.002,
    comment   = "companion to mtxrun.lua",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
-- Help information in the XML format that logs.application expects. The
-- detail line names the file that is actually processed (char-def.lua), in
-- agreement with the banner below.
local helpinfo = [[
<?xml version="1.0"?>
<application>
 <metadata>
 <entry name="name">mtx-unicode</entry>
 <entry name="detail">Checker for char-def.lua</entry>
 <entry name="version">1.02</entry>
 </metadata>
 <flags>
 <category name="basic">
 <subcategory>
 <flag name="whatever"><short>do whatever</short></flag>
 </subcategory>
 </category>
 </flags>
</application>
]]

-- The application object provides the reporter and the help exporter that
-- are used further down in this file.
local application = logs.application {
    name     = "mtx-unicode",
    banner   = "Checker for char-def.lua 1.02",
    helpinfo = helpinfo,
}
93
94local gmatch, match, gsub, find, lower, upper, format = string.gmatch, string.match, string.gsub, string.find, string.lower, string.upper, string.format
95local concat, sort = table.concat, table.sort
96local split, splitlines, strip = string.split, string.splitlines, string.strip
97local are_equal = table.are_equal
98local tonumber, tostring, rawget = tonumber, tostring, rawget
99local lpegmatch = lpeg.match
100local P, C, S, R, Cs, Ct, Cg, Cf, Cc = lpeg.P, lpeg.C, lpeg.S, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cg, lpeg.Cf, lpeg.Cc
101local formatters = string.formatters
102local utfchar = utf.char
103
local report = application.report

-- Global namespaces that this script and the char-*.lua files it loads
-- share; existing tables are reused.
scripts         = scripts         or { }
scripts.unicode = scripts.unicode or { }

characters      = characters      or { }
characters.data = characters.data or { }

fonts           = fonts           or { }
fonts.encodings = fonts.encodings or { }

-- State that scripts.unicode.load fills in: the resolved file names, the raw
-- file contents and the parsed tables. All three are declared here so that
-- the assignments in load bind to these locals instead of creating globals.
local textfiles  = { }
local textdata   = { }
local texttables = { }

-- When true, the (default) direction "l" is not stored.
local sparse = false

local split_space_table = lpeg.tsplitat(" ")
local split_space_two   = lpeg.splitat (" ")
local split_range_two   = lpeg.splitat ("..")
local split_colon_table = lpeg.tsplitat(P(" ")^0 * P(";") * P(" ")^0)

-- Code points that scripts.unicode.update never touches.
local skipped = {
    [0x002C6] = true,
    [0x002C7] = true,
}

for i=0x0FE00,0x0FE0F do skipped[i] = true end
for i=0xE0100,0xE01EF do skipped[i] = true end
132
133
134
135
136
137
138
-- Converts the decomposition field of a unicodedata record into a specials
-- list. A field that starts with a hex number becomes { "char", n1, n2, ... },
-- a field that starts with a tag becomes { "tag", n1, n2, ... } where the tag
-- is lowercased and stripped from its angle brackets. An empty field gives nil.
local function parsespecials(str)
    if str == "" then
        return nil
    end
    local specials = lpegmatch(split_space_table,str)
    if tonumber(specials[1],16) then
        -- shift everything one slot up to make room for the "char" tag
        for i=#specials,1,-1 do
            specials[i+1] = tonumber(specials[i],16)
        end
        specials[1] = "char"
    else
        specials[1] = lower((gsub(specials[1],"[<>]","")))
        for i=2,#specials do
            specials[i] = tonumber(specials[i],16)
        end
    end
    return specials
end

-- Derives lccode and uccode from a casefolding record. Each result is false
-- (nothing known), a number (status C or S) or a list of numbers (status F).
-- Records with another status are ignored.
local function casecodes(cf,unicode,category)
    local lccode, uccode = false, false
    if cf and tonumber(cf[1],16) == unicode then
        local how = cf[2]
        if how == "C" or how == "S" then
            local fold = tonumber(cf[3],16)
            if fold == unicode then
                -- folds to itself, nothing to store
            elseif category == "ll" then
                uccode = fold
            elseif category == "lu" then
                lccode = fold
            end
        elseif how == "F" then
            local folding = { }
            for s in gmatch(cf[3],"%S+") do
                folding[#folding+1] = tonumber(s,16)
            end
            if category == "ll" then
                uccode = folding
            elseif category == "lu" then -- was "ul", which never matches a category
                lccode = folding
            end
        end
    end
    return lccode, uccode
end

-- Brings char[key] (lccode or uccode) in sync with value; a false value
-- means that nothing is known and leaves the entry untouched.
local function synccase(char,unicode,description,key,value)
    if not value then
        return
    end
    local current = char[key]
    if type(value) == "table" then
        if type(current) ~= "table" or not are_equal(value,current) then
            report("%U : setting " .. key .. " to % t, %a",unicode,value,description)
            char[key] = value
        end
    elseif current ~= value then
        report("%U : setting " .. key .. " to %a, %a",unicode,value,description)
        char[key] = value
    end
end

-- Brings char[key] in sync with value: a differing value is stored, a
-- missing value removes an existing field. The label is what shows up in
-- the report.
local function syncfield(char,unicode,description,key,label,value)
    local current = char[key]
    if value then
        if value ~= current then
            report("%U : setting " .. label .. " to %a, %a",unicode,value,description)
            char[key] = value
        end
    elseif current then
        report("%U : resetting " .. label .. " from %a, %a",unicode,current,description)
        char[key] = nil
    end
end

-- Guesses the mathextensible code of an arrow from its description:
-- "m" (vertical mixed with horizontal), "v", "u", "d", "h", "r" or "l".
-- Returns nil for non arrows and for the kinds of arrows that are excluded.
local function arrowextensible(description)
    if not find(description,"ARROW") then
        return nil
    end
    if find(description,"ABOVE") or find(description,"ARROWHEAD") or find(description,"HALFWIDTH")
    or find(description,"ANGLE") or find(description,"THROUGH") then
        return nil
    end
    local u = find(description,"UP")
    local d = find(description,"DOWN")
    local l = find(description,"LEFT")
    local r = find(description,"RIGHT")
    if u or d then
        if l or r then
            return "m"
        elseif u and d then
            return "v"
        elseif u then
            return "u"
        else
            return "d"
        end
    elseif l and r then
        return "h"
    elseif r then
        return "r"
    elseif l then
        return "l"
    end
    return nil
end

-- Merges the parsed Unicode text files (texttables, filled by
-- scripts.unicode.load) into characters.data. Unknown characters are added,
-- known ones get their derived properties refreshed, and every change is
-- reported. After that "X WITH Y" latin and cyrillic characters without
-- usable specials get a { "with", base, mark } special and the standardized
-- variants are registered.
function scripts.unicode.update()
    local unicodedata          = texttables.unicodedata
    local bidimirroring        = texttables.bidimirroring
    local linebreaks           = texttables.linebreak
    local eastasianwidth       = texttables.eastasianwidth
    local standardizedvariants = texttables.standardizedvariants
    local arabicshaping        = texttables.arabicshaping
    local casefolding          = texttables.casefolding
    local characterdata        = characters.data

    -- maps a description onto its code point, used for the WITH pass below
    local descriptions = { }

    for unicode, ud in table.sortedpairs(unicodedata) do
        if not skipped[unicode] then
            -- rawget: only entries that are really present count as known
            local char        = rawget(characterdata,unicode)
            local description = ud[2] or formatters["UNICODE ENTRY %U"](unicode)
            -- descriptions like <control> and range markers are not handled
            if not find(description,"^<") then
                local ld        = linebreaks[unicode]
                local bd        = bidimirroring[unicode]
                local ed        = eastasianwidth[unicode]
                local as        = arabicshaping[unicode]
                local category  = lower(ud[3] or "?")
                local combining = tonumber(ud[4])
                local direction = lower(ud[5] or "l")
                local linebreak = ld and lower(ld[2] or "xx")
                local specials  = parsespecials(ud[6] or "")
                local cjkwd     = ed and lower(ed[2] or "n")
                local mirror    = bd and tonumber(bd[2],16)
                local arabic    = as and lower(as[3])
                local comment   = nil

                descriptions[description] = unicode

                -- values that equal the default are not stored
                if sparse and direction == "l" then
                    direction = nil
                end
                if linebreak == "xx" then
                    linebreak = nil
                end
                if cjkwd == "n" then
                    cjkwd = nil
                end
                if not combining or combining == 0 then
                    combining = nil
                end
                if find(description,"MATHEMATICAL") then
                    comment = "check math properties"
                end

                local lccode, uccode = casecodes(casefolding[unicode],unicode,category)

                if not char then
                    report("%U : adding entry %a",unicode,description)
                    char = {
                        category    = category,
                        comment     = comment,
                        cjkwd       = cjkwd,
                        description = description,
                        direction   = direction,
                        mirror      = mirror,
                        linebreak   = linebreak,
                        unicodeslot = unicode,
                        specials    = specials,
                        arabic      = arabic,
                        combining   = combining,
                        -- an unknown case code is left out instead of stored as false
                        uccode      = uccode or nil,
                        lccode      = lccode or nil,
                    }
                    characterdata[unicode] = char
                else
                    synccase (char,unicode,description,"lccode",lccode)
                    synccase (char,unicode,description,"uccode",uccode)
                    syncfield(char,unicode,description,"direction","direction",direction)
                    syncfield(char,unicode,description,"mirror",   "mirror",   mirror)
                    syncfield(char,unicode,description,"linebreak","linebreak",linebreak)
                    syncfield(char,unicode,description,"cjkwd",    "cjkwd of", cjkwd)
                    syncfield(char,unicode,description,"arabic",   "arabic",   arabic)
                    syncfield(char,unicode,description,"combining","combining",combining)
                    if specials then
                        if not char.specials or not are_equal(specials,char.specials) then
                            local t = { specials[1] }
                            for i=2,#specials do
                                t[i] = formatters["%U"](specials[i])
                            end
                            report("%U : setting specials to % + t, %a",unicode,t,description)
                            char.specials = specials
                        end
                    elseif char.specials then
                        -- specials that unicodedata does not know about are
                        -- kept but flagged for a manual check
                        local current = char.comment
                        if not current then
                            char.comment = "check special"
                        elseif not find(current,"check special") then
                            char.comment = current .. ", check special"
                        end
                    end
                end

                -- guess the visual style of math characters that lack one
                local visual = char.visual
                if not visual and find(description,"MATH") then
                    if find(description,"BOLD ITALIC") then
                        visual = "bi"
                    elseif find(description,"ITALIC") then
                        visual = "it"
                    elseif find(description,"BOLD") then
                        visual = "bf"
                    end
                    if visual then
                        report("%U : setting visual to %a, %a",unicode,visual,description)
                        char.visual = visual
                    end
                end

                -- guess the extensible direction of math arrows that lack one
                if category == "sm" or (category == "so" and char.mathclass) then
                    if not char.mathextensible then
                        local mathextensible = arrowextensible(description)
                        if mathextensible then
                            report("%U : setting mathextensible to %a, %a",unicode,mathextensible,description)
                            char.mathextensible = mathextensible
                        end
                    end
                end
            end
        end
    end

    -- characters named "BASE WITH MARK" that have no specials (or flagged ones)
    for unicode, data in table.sortedhash(characterdata) do
        if not data.specials or data.comment and find(data.comment,"check special") then
            local description = data.description
            local b, m
            if description then -- an entry without description cannot be matched
                b, m = match(description,"^(.+) WITH (.+)$")
            end
            if b and m and (find(b,"^LATIN") or find(b,"^CYRILLIC")) then
                local base = descriptions[b]
                local mark = descriptions[m]
                if not mark and m == "STROKE" then
                    mark = descriptions["SOLIDUS"]
                end
                if base and mark then
                    data.specials = { "with", base, mark }
                    data.comment  = nil
                end
            end
        end
    end

    for i=1,#standardizedvariants do
        local si             = standardizedvariants[i]
        local pair, addendum = si[1], strip(si[2])
        local first, second  = lpegmatch(split_space_two,pair)
        first  = first  and tonumber(first,16)
        second = second and tonumber(second,16)
        if first and second then
            local d = characterdata[first]
            if d then
                local v = rawget(d,"variants")
                if not v then
                    v = { }
                    d.variants = v
                end
                if not v[second] then
                    report("%U : adding variant %U as %s, %a",first,second,addendum,d.description)
                    v[second] = addendum
                end
            end
        end
    end

    -- drop entries that carry variants but no category of their own
    for unicode, ud in table.sortedpairs(characterdata) do
        if not rawget(ud,"category") and rawget(ud,"variants") then
            characterdata[unicode] = nil
        end
    end
end
496
-- Everything in char-def.lua that precedes the characters.data table; set by
-- scripts.unicode.load and used by scripts.unicode.save.
local preamble

-- Parses a semicolon separated Unicode data file. Comments (from # to the end
-- of the line) and empty lines are dropped and each remaining line is split
-- into a list of fields.
--
-- str   : the content of the file
-- index : when true the result is keyed by the code point in the first field
--         (a range "XXXX..YYYY" registers the same record for each code point
--         in the range), otherwise the result is a list in file order
--
-- Lines whose first field is neither a hex number nor a valid range are
-- reported and skipped.
local function splitdefinition(str,index)
    local lines  = splitlines(str or "")
    local result = { }
    local count  = 0
    for i=1,#lines do
        local line = gsub(lines[i]," *#.*$","")
        if line ~= "" then
            local fields = lpegmatch(split_colon_table,line)
            if not index then
                count = count + 1
                result[count] = fields
            else
                local first = fields[1]
                local code  = tonumber(first,16)
                if code then
                    result[code] = fields
                else
                    local b, e = lpegmatch(split_range_two,first)
                    -- both bounds must be valid hex numbers, otherwise the
                    -- numeric loop below would raise an error
                    b = b and tonumber(b,16)
                    e = e and tonumber(e,16)
                    if b and e then
                        for k=b,e do
                            result[k] = fields
                        end
                    else
                        report("problem: %s",line)
                    end
                end
            end
        end
    end
    return result
end
538
-- Parses the content of index.txt. Each usable line has the shape
-- "name[,qualifier]<tab>hexcode"; the qualifier is moved in front of the name
-- and runs of whitespace are collapsed. The result maps each name onto its
-- code point; lines that don't match are ignored.
local function splitindex(str)
    local lines = splitlines(str)
    local names = { }
    for i=1,#lines do
        local main, qualifier, code = match(lines[i],"([^%,]+)%,?(.-)\t(.*)")
        -- the three captures come together or not at all
        if main then
            local name = gsub(strip(qualifier .. " " .. main),"%s+"," ")
            names[name] = tonumber(code,16)
        end
    end
    return names
end
554
-- Loads char-def.lua plus its helper files and parses the Unicode text files.
--
-- On success the preamble of char-def.lua is remembered, textfiles, textdata
-- and texttables are filled and true is returned. When char-def.lua cannot be
-- read or compiled the preamble is reset and false is returned. A text file
-- that cannot be found yields an empty table.
function scripts.unicode.load()
    local fullname = resolvers.findfile("char-def.lua")
    report("using: %s",fullname)
    local data = io.loaddata(fullname)
    if not data then
        preamble = nil
        return false
    end

    -- compile first, so that a syntax error gets reported instead of ending
    -- up as an attempt to call nil
    local chunk, message = loadstring(data)
    if not chunk then
        report("unable to compile %a: %s",fullname,tostring(message))
        preamble = nil
        return false
    end
    chunk()

    local helpers = { "char-ini.lua", "char-utf.lua", "char-cjk.lua" }
    for i=1,#helpers do
        local helper = resolvers.findfile(helpers[i])
        report("using: %s",helper)
        dofile(helper)
    end

    -- keep what comes before the data table, it is needed when saving
    preamble = gsub(data,"characters%.data%s*=%s*%{.*","")

    -- the text files, in the order in which they get parsed
    local names = {
        "unicodedata",
        "bidimirroring",
        "linebreak",
        "eastasianwidth",
        "standardizedvariants",
        "arabicshaping",
        "casefolding",
        "index",
    }

    local files, contents, tables = { }, { }, { }
    for i=1,#names do
        local name    = names[i]
        local found   = resolvers.findfile(name .. ".txt") or ""
        local content = found ~= "" and io.loaddata(found) or ""
        files[name]    = found
        contents[name] = content
        if name == "index" then
            tables[name] = splitindex(content)
        else
            -- the variants are kept as a list, the rest is keyed by code point
            tables[name] = splitdefinition(content,name ~= "standardizedvariants")
        end
    end

    textfiles  = files
    textdata   = contents
    texttables = tables

    for k, v in table.sortedhash(textfiles) do
        report("using: %s",v)
    end
    return true
end
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
-- Serializes characters.data and writes it, preceded by the preamble that
-- scripts.unicode.load remembered, to the given file. Some frequently
-- occurring variant tables are replaced by the name of a shared variable.
-- Nothing is written when there is no preamble.
function scripts.unicode.save(filename)
    if not preamble then
        return
    end
    -- applied in this order: { pattern, replacement }
    local shortcuts = {
        { "%{%s+%[0xFE0E%]=\"text style\",%s+%[0xFE0F%]=\"emoji style\",%s+%}",            "variants_emoji"  },
        { "%{%s+%[0xFE00%]=\"corner%-justified form\",%s+%[0xFE01%]=\"centered form\",%s+%}", "variants_forms"  },
        { "%{%s+%[0xFE00%]=\"chancery style\",%s+%[0xFE01%]=\"roundhand style\",%s+%}",    "variants_style"  },
        { "%{%s+%[0xFE00%]=\"dotted form\",%s+%}",                                        "variants_dotted" },
    }
    local serialized = table.serialize(characters.data,"characters.data", { hexify = true, noquotes = true })
    for i=1,#shortcuts do
        local shortcut = shortcuts[i]
        serialized = gsub(serialized,shortcut[1],shortcut[2])
    end
    io.savedata(filename,preamble .. serialized)
end
643
-- Compares the blocks in the given blocks.txt file with characters.blocks.
-- Unknown blocks are added and reported with a "+" prefix, blocks whose range
-- or description differs are reported with a "?" prefix.
local function checkblocks(fullname)
    local lines  = splitlines(io.loaddata(fullname))
    local blocks = characters.blocks
    local seen   = { }
    local issues = { }
    for i=1,#lines do
        local line = gsub(lines[i]," *#.*$","")
        if line ~= "" then
            local specification = lpegmatch(split_colon_table,line)
            local range         = specification[1]
            local description   = specification[2]
            if range and description then
                local first, last = lpegmatch(split_range_two,range)
                if first and last then
                    local start = tonumber(first,16)
                    local stop  = tonumber(last,16)
                    -- the key is the description reduced to its lowercase letters
                    local name  = gsub(lower(description),"[^a-z]+","")
                    if start and stop then
                        local known = blocks[name]
                        if not known then
                            issues[#issues+1] = formatters[ [[+ block: ["%s"] = { first = 0x%05X, last = 0x%05X, description = "%S" }]] ](name,start,stop,description)
                            blocks[name] = { first = start, last = stop, description = description }
                        elseif known.first ~= start or known.last ~= stop or known.description ~= description then
                            issues[#issues+1] = formatters[ [[? block: ["%s"] = { first = 0x%05X, last = 0x%05X, description = "%S" }]] ](name,start,stop,description)
                        end
                    end
                    seen[#seen+1] = name
                end
            end
        end
    end
    sort(issues)
    for i=1,#issues do
        report(issues[i])
    end
    -- NOTE(review): missing blocks were added above, so this can only fire
    -- for a name whose range could not be converted; confirm that this is
    -- what is meant by "obsolete".
    sort(seen)
    for i=1,#seen do
        local name = seen[i]
        if not blocks[name] then
            report("obsolete block %a",name)
        end
    end
end

-- Registers the lowercase names from the parsed index as synonyms of the
-- characters they refer to, unless the name equals the description. Index
-- entries that are not all lowercase are removed from the index first.
local function addsynonyms()
    local index = texttables.index
    local data  = characters.data
    for name in next, index do
        if name ~= lower(name) then
            index[name] = nil
        end
    end
    for name, unicode in table.sortedhash(index) do
        local entry = data[unicode]
        if entry and entry.description ~= upper(name) then
            local synonyms = entry.synonyms
            if not synonyms then
                entry.synonyms = { name }
            else
                local size  = #synonyms
                local known = false
                for i=1,size do
                    if synonyms[i] == name then
                        known = true
                        break
                    end
                end
                if not known then
                    synonyms[size+1] = name
                end
                sort(synonyms)
            end
        end
    end
end

-- Runs the additional checks: the block definitions (only when blocks.txt can
-- be found) and the synonyms taken from the index.
function scripts.unicode.extras()
    local fullname = resolvers.findfile("blocks.txt") or ""
    if fullname ~= "" then
        checkblocks(fullname)
    end
    addsynonyms()
end
733
do

    local space     = P(" ")
    local spaces    = space^0
    local semicolon = P(";")
    local hash      = P("#")
    local newline   = S("\n\r")

    -- one or more hex code points, each converted to a number
    local unicode     = Cs(R("09","AF")^1) / function(n) return tonumber(n,16) end
                      * spaces
    local components  = Ct (unicode^1)

    -- the status part of a line: "; [non-]fully-qualified # "
    local rubish_a    = semicolon
                      * spaces
                      * P("non-")^0 * P("fully-qualified")
                      * spaces
                      * hash
                      * spaces
    -- the emoji itself, up to the next space
    local textstring  = C((1 - space)^1)
                      * spaces
    -- the rest of the line, lowercased
    local description = ((1 - (spaces * newline))^1) / string.lower
    local rubish_b    = (1-newline)^0
                      * newline^1

    -- Collects a { components, textstring, description } table for each line
    -- that matches; anything else is skipped one character at a time.
    local pattern_2 = Ct ( (
        Cf ( Ct("") *
            Cg (Cc("components") * components)
          * rubish_a
          * Cg (Cc("textstring") * textstring)
          * Cg (Cc("description") * description )
          * rubish_b
        , rawset)
      + P(1) )^1 )

    -- a version tag like "e13.0 ": an "e", digits and periods, trailing spaces
    local crap = P("e") * (R("09") + P("."))^1 * P(" ")^1

    -- strips version tags and spells out the characters # and *
    local replace = lpeg.replacer {
        [crap] = "",
        ["#"]  = "hash",
        ["*"]  = "asterisk",
    }

    -- Turns emoji-test.txt into a table that maps each (cleaned up, lowercase)
    -- description onto its list of code points and saves the serialized table
    -- in the given file. When an existing char-emj.lua is found, everything
    -- in front of its return statement is kept. Nothing is written when
    -- emoji-test.txt is missing, empty or without usable content.
    function scripts.unicode.emoji(filename)

        local name = resolvers.findfile("emoji-test.txt") or ""
        if name == "" then
            return
        end
        local l = io.loaddata(name)
        local t = l and lpegmatch(pattern_2,l)
        if not t then
            report("no usable data in %a",name)
            return
        end

        local hash = { }

        for i=1,#t do
            local v = t[i]
            local d = v.description
            local k = lpegmatch(replace,d) or d
            hash[k] = v.components
        end
        local new     = table.serialize(hash,"return", { hexify = true })
        local oldname = resolvers.findfile("char-emj.lua") or ""
        local old     = oldname ~= "" and io.loaddata(oldname)
        if old and old ~= "" then
            -- a function is used as replacement because in a replacement
            -- string any % in the new data would be treated as an escape
            new = gsub(old,"^(.-)return .*$",function(before)
                return before .. new
            end)
        end
        io.savedata(filename,new)
    end

end
821
822
823
-- Entry point: either export the help information or run the whole
-- load, update, extras, save cycle from the current directory.
local filename   = environment.files[1]
local exporthelp = environment.arguments.exporthelp

if not exporthelp then
    local unicode = scripts.unicode
    local deffile = "char-def-new.lua"
    local emjfile = "char-emj-new.lua"
    report("start working on %a, input char-def.lua",lfs.currentdir())
    if not unicode.load() then
        report("nothing to do")
    else
        unicode.update()
        unicode.extras()
        unicode.save(deffile)
        unicode.emoji(emjfile)
        report("saved file %a",deffile)
        report("saved file %a (current 14.0, check for updates, see above!)",emjfile)
    end
    report("stop working on %a\n",lfs.currentdir())
else
    application.export(exporthelp,filename)
end
842 |