mtx-unicode.lua /size: 35 Kb    last modification: 2021-10-28 13:50
1
-- ConTeXt module registration: make sure the global 'modules' table
-- exists and announce this script in it (standard mtx-* preamble).
if not modules then modules = { } end modules ['mtx-unicode'] = {
    version   = 1.002,
    comment   = "companion to mtxrun.lua",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8 9
-- This is very old code that I started writing in 2005 but occasionally
10
-- extended. Don't use it yourself, it's just a sort of reference. The
11
-- data that we use in ConTeXt is more extensive.
12
--
13
-- In my local tree I keep files in places like this:
14
--
15
-- e:/tex-context/tex/texmf-local/data/unicode/blocks.txt
16
--
17
-- curl -o arabicshaping.txt http://www.unicode.org/Public/UNIDATA/ArabicShaping.txt
18
-- curl -o bidibrackets.txt http://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
19
-- curl -o bidicharactertest.txt http://www.unicode.org/Public/UNIDATA/BidiCharacterTest.txt
20
-- curl -o bidimirroring.txt http://www.unicode.org/Public/UNIDATA/BidiMirroring.txt
21
-- curl -o biditest.txt http://www.unicode.org/Public/UNIDATA/BidiTest.txt
22
-- curl -o blocks.txt http://www.unicode.org/Public/UNIDATA/Blocks.txt
23
-- curl -o cjkradicals.txt http://www.unicode.org/Public/UNIDATA/CJKRadicals.txt
24
-- curl -o casefolding.txt http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
25
-- curl -o compositionexclusions.txt http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
26
-- curl -o derivedage.txt http://www.unicode.org/Public/UNIDATA/DerivedAge.txt
27
-- curl -o derivedcoreproperties.txt http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
28
-- curl -o derivednormalizationprops.txt http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
29
-- curl -o eastasianwidth.txt http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
30
-- curl -o emojisources.txt http://www.unicode.org/Public/UNIDATA/EmojiSources.txt
31
-- curl -o hangulsyllabletype.txt http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt
32
-- curl -o index.txt http://www.unicode.org/Public/UNIDATA/Index.txt
33
-- curl -o indicpositionalcategory.txt http://www.unicode.org/Public/UNIDATA/IndicPositionalCategory.txt
34
-- curl -o indicsyllabiccategory.txt http://www.unicode.org/Public/UNIDATA/IndicSyllabicCategory.txt
35
-- curl -o jamo.txt http://www.unicode.org/Public/UNIDATA/Jamo.txt
36
-- curl -o linebreak.txt http://www.unicode.org/Public/UNIDATA/LineBreak.txt
37
-- curl -o namealiases.txt http://www.unicode.org/Public/UNIDATA/NameAliases.txt
38
-- curl -o namedsequences.txt http://www.unicode.org/Public/UNIDATA/NamedSequences.txt
39
-- curl -o namedsequencesprov.txt http://www.unicode.org/Public/UNIDATA/NamedSequencesProv.txt
40
-- curl -o nameslist.html http://www.unicode.org/Public/UNIDATA/NamesList.html
41
-- curl -o nameslist.txt http://www.unicode.org/Public/UNIDATA/NamesList.txt
42
-- curl -o normalizationcorrections.txt http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
43
-- curl -o normalizationtest.txt http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
44
-- curl -o proplist.txt http://www.unicode.org/Public/UNIDATA/PropList.txt
45
-- curl -o propertyaliases.txt http://www.unicode.org/Public/UNIDATA/PropertyAliases.txt
46
-- curl -o propertyvaluealiases.txt http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
47
-- curl -o readme.txt http://www.unicode.org/Public/UNIDATA/ReadMe.txt
48
-- curl -o scriptextensions.txt http://www.unicode.org/Public/UNIDATA/ScriptExtensions.txt
49
-- curl -o scripts.txt http://www.unicode.org/Public/UNIDATA/Scripts.txt
50
-- curl -o specialcasing.txt http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt
51
-- curl -o standardizedvariants.html http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html
52
-- curl -o standardizedvariants.txt http://www.unicode.org/Public/UNIDATA/StandardizedVariants.txt
53
-- curl -o tangutsources.txt http://www.unicode.org/Public/UNIDATA/TangutSources.txt
54
-- curl -o ucd.zip http://www.unicode.org/Public/UNIDATA/UCD.zip
55
-- curl -o usourcedata.txt http://www.unicode.org/Public/UNIDATA/USourceData.txt
56
-- curl -o usourceglyphs.pdf http://www.unicode.org/Public/UNIDATA/USourceGlyphs.pdf
57
-- curl -o unicodedata.txt http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
58
-- curl -o unihan.zip http://www.unicode.org/Public/UNIDATA/Unihan.zip
59
--
60
-- curl -o emoji-data.txt http://unicode.org/Public/emoji/12.0/emoji-data.txt
61
-- curl -o emoji-sequences.txt http://unicode.org/Public/emoji/12.0/emoji-sequences.txt
62
-- curl -o emoji-variation-sequences.txt http://unicode.org/Public/emoji/12.0/emoji-variation-sequences.txt
63
-- curl -o emoji-zwj-sequences.txt http://unicode.org/Public/emoji/12.0/emoji-zwj-sequences.txt
64
-- curl -o emoji-test.txt http://unicode.org/Public/emoji/12.0/emoji-test.txt
65
--
66
-- todo:
67
--
68
-- specialcasing ?
69 70
-- XML help blob handed to logs.application below; mtxrun renders it for
-- --help / --exporthelp. FIX: the detail entry used to read
-- "char-dat.lua" — the script checks char-def.lua (see the banner and
-- scripts.unicode.load), so the help text now says so too.
local helpinfo = [[
<?xml version="1.0"?>
<application>
 <metadata>
  <entry name="name">mtx-unicode</entry>
  <entry name="detail">Checker for char-def.lua</entry>
  <entry name="version">1.02</entry>
 </metadata>
 <flags>
  <category name="basic">
   <subcategory>
    <flag name="whatever"><short>do whatever</short></flag>
   </subcategory>
  </category>
 </flags>
</application>
]]
87 88
-- Wrap this script in the standard mtxrun application shell: gives us
-- option handling, help output (from helpinfo above) and a reporter.
local application = logs.application {
    name     = "mtx-unicode",
    banner   = "Checker for char-def.lua 1.02",
    helpinfo = helpinfo,
}
93 94
-- Localize library functions (faster lookup, shorter code). Note that
-- string.split, string.splitlines, string.strip, string.formatters,
-- table.are_equal and utf.char are ConTeXt extensions, not stock Lua.
local gmatch, match, gsub, find, lower, upper, format = string.gmatch, string.match, string.gsub, string.find, string.lower, string.upper, string.format
local concat, sort = table.concat, table.sort
local split, splitlines, strip = string.split, string.splitlines, string.strip
local are_equal = table.are_equal
local tonumber, tostring, rawget = tonumber, tostring, rawget
local lpegmatch = lpeg.match
local P, C, S, R, Cs, Ct, Cg, Cf, Cc = lpeg.P, lpeg.C, lpeg.S, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cg, lpeg.Cf, lpeg.Cc
local formatters = string.formatters
local utfchar = utf.char

-- channel for all progress / difference messages
local report = application.report
105 106
-- Global namespaces this script populates; only created when the
-- hosting run has not provided them already.
scripts         = scripts         or { }
scripts.unicode = scripts.unicode or { }

characters      = characters      or { }
characters.data = characters.data or { }

fonts           = fonts           or { }
fonts.encodings = fonts.encodings or { }

-- filled by scripts.unicode.load(): resolved filenames and raw file
-- contents of the UNIDATA text files
local textfiles = { }
local textdata  = { }

-- when true, the default direction 'l' is omitted from entries
local sparse = false
119 120
-- Small lpeg splitters for the UNIDATA file formats: space separated
-- lists (as table resp. two values), "A..B" ranges, and semicolon
-- separated fields with optional surrounding spaces.
local split_space_table = lpeg.tsplitat(" ")
local split_space_two   = lpeg.splitat(" ")
local split_range_two   = lpeg.splitat("..")
local split_colon_table = lpeg.tsplitat(P(" ")^0 * P(";") * P(" ")^0)
124 125
-- Codepoints that must not be (re)added to char-def.lua by update().
local skipped = {
    [0x002C6] = true, -- MODIFIER LETTER CIRCUMFLEX ACCENT
    [0x002C7] = true, -- CARON
}

for i=0x0FE00,0x0FE0F do skipped[i] = true end -- variant selector
for i=0xE0100,0xE01EF do skipped[i] = true end -- variant selector extension

-- This can be done:
--
-- for i=0x1B170,0x1B2FF do skipped[i] = true end -- nushu
--
-- but then also adapt char-cjk.lua bottom part!
138 139
-- Merge the parsed UNIDATA tables (built by scripts.unicode.load into
-- the global 'texttables') into characters.data: add missing entries,
-- update changed fields, derive specials/variants, and report every
-- difference. Mutates characters.data in place; returns nothing.
function scripts.unicode.update()
    local unicodedata          = texttables.unicodedata
    local bidimirroring        = texttables.bidimirroring
    local linebreak            = texttables.linebreak
    local eastasianwidth       = texttables.eastasianwidth
    local standardizedvariants = texttables.standardizedvariants
    local arabicshaping        = texttables.arabicshaping
    local casefolding          = texttables.casefolding
    local characterdata        = characters.data
    --
    -- maps "LATIN SMALL LETTER A" -> 0x61 etc; used below to resolve
    -- "X WITH Y" descriptions into { "with", base, mark } specials
    local descriptions = { }
    --
    for unicode, ud in table.sortedpairs(unicodedata) do
        if not skipped[unicode] then
            local char        = rawget(characterdata,unicode) -- bypass any metatable
            local description = ud[2] or formatters["UNICODE ENTRY %U"](unicode)
            -- skip pseudo names like "<control>" and range markers
            if not find(description,"^<") then
                local ld        = linebreak[unicode]
                local bd        = bidimirroring[unicode]
                local ed        = eastasianwidth[unicode]
                local category  = lower(ud[3] or "?")
                local combining = tonumber(ud[4])
                local direction = lower(ud[5] or "l") -- we could omit 'l' being the default
                local linebreak = ld and lower(ld[2] or "xx")
                local specials  = ud[6] or ""
                local cjkwd     = ed and lower(ed[2] or "n")
                local mirror    = bd and tonumber(bd[2],16)
                local arabic    = nil
                local lccode    = false
                local uccode    = false
                descriptions[description] = unicode
                if sparse and direction == "l" then
                    direction = nil
                end
                if linebreak == "xx" then
                    linebreak = nil
                end
                if specials == "" then
                    specials = nil
                else
                    specials = lpegmatch(split_space_table,specials) -- split(specials," ")
                    if tonumber(specials[1],16) then
                        -- no <tag>: plain decomposition; shift right, tag as "char"
                        for i=#specials,1,-1 do
                            specials[i+1] = tonumber(specials[i],16)
                        end
                        specials[1] = "char"
                    else
                        -- first field is a <tag>: strip the angle brackets
                        specials[1] = lower(gsub(specials[1],"[<>]",""))
                        for i=2,#specials do
                            specials[i] = tonumber(specials[i],16)
                        end
                    end
                end
                if cjkwd == "n" then
                    cjkwd = nil
                end
                local comment
                if find(description,"MATHEMATICAL") then
                    comment = "check math properties"
                end
                -- there are more than arabic
                local as = arabicshaping[unicode]
                if as then
                    arabic = lower(as[3])
                end
                --
                if not combining or combining == 0 then
                    combining = nil
                end
                --
                -- case folding: C/S entries map to one codepoint, F to a sequence
                local cf = casefolding[unicode]
                if cf and tonumber(cf[1],16) == unicode then
                    local how = cf[2]
                    if how == "C" or how == "S" then
                        local fold = tonumber(cf[3],16)
                        if fold == unicode then
                            -- print("SKIPPING",description)
                        elseif category == "ll" then
                            uccode = fold
                        elseif category == "lu" then
                            lccode = fold
                        end
                    elseif how == "F" then
                        -- we can use the first
                        local folding = { }
                        for s in gmatch(cf[3],"%S+") do
                            folding[#folding+1] = tonumber(s,16)
                        end
                        if category == "ll" then
                            uccode = folding
                        elseif category == "lu" then
                            -- FIX: was 'category == "ul"' — "ul" is not a valid
                            -- (lowercased) general category, so uppercase letters
                            -- with full foldings never got an lccode
                            lccode = folding
                        end
                    else
                        -- we skip these (status "T", turkic)
                        -- print(description)
                        -- inspect(cf)
                    end
                end
                --
                if not char then
                    report("%U : adding entry %a",unicode,description)
                    char = {
                        -- adobename = ,
                        category    = category,
                        comment     = comment,
                        cjkwd       = cjkwd,
                        description = description,
                        direction   = direction,
                        mirror      = mirror,
                        linebreak   = linebreak,
                        unicodeslot = unicode,
                        specials    = specials,
                        arabic      = arabic,
                        combining   = combining,
                        uccode      = uccode,
                        lccode      = lccode,
                    }
                    characterdata[unicode] = char
                else
                    -- we have more case mapping (e.g. cherokee)
                    if lccode then
                        if type(lccode) == "table" then
                            if type(char.lccode) ~= "table" or not are_equal(lccode,char.lccode) then
                                report("%U : setting lccode to % t, %a",unicode,lccode,description)
                                char.lccode = lccode
                            end
                        elseif char.lccode ~= lccode then
                            -- FIX: format string had three %a for two values
                            -- (now matches the uccode branch below)
                            report("%U : setting lccode to %a, %a",unicode,lccode,description)
                            char.lccode = lccode
                        end
                    end
                    if uccode then
                        if type(uccode) == "table" then
                            if type(char.uccode) ~= "table" or not are_equal(uccode,char.uccode) then
                                report("%U : setting uccode to % t, %a",unicode,uccode,description)
                                char.uccode = uccode
                            end
                        elseif char.uccode ~= uccode then
                            report("%U : setting uccode to %a, %a",unicode,uccode,description)
                            char.uccode = uccode
                        end
                    end
                    if direction then
                        if char.direction ~= direction then
                            report("%U : setting direction to %a, %a",unicode,direction,description)
                            char.direction = direction
                        end
                    else
                        if char.direction then
                            report("%U : resetting direction from %a, %a",unicode,char.direction,description)
                            char.direction = nil
                        end
                    end
                    if mirror then
                        if mirror ~= char.mirror then
                            report("%U : setting mirror to %a, %a",unicode,mirror,description)
                            char.mirror = mirror
                        end
                    else
                        if char.mirror then
                            report("%U : resetting mirror from %a, %a",unicode,char.mirror,description)
                            char.mirror = nil
                        end
                    end
                    if linebreak then
                        if linebreak ~= char.linebreak then
                            report("%U : setting linebreak to %a, %a",unicode,linebreak,description)
                            char.linebreak = linebreak
                        end
                    else
                        if char.linebreak then
                            report("%U : resetting linebreak from %a, %a",unicode,char.linebreak,description)
                            char.linebreak = nil
                        end
                    end
                    if cjkwd then
                        if cjkwd ~= char.cjkwd then
                            report("%U : setting cjkwd of to %a, %a",unicode,cjkwd,description)
                            char.cjkwd = cjkwd
                        end
                    else
                        if char.cjkwd then
                            report("%U : resetting cjkwd of from %a, %a",unicode,char.cjkwd,description)
                            char.cjkwd = nil
                        end
                    end
                    if arabic then
                        if arabic ~= char.arabic then
                            report("%U : setting arabic to %a, %a",unicode,arabic,description)
                            char.arabic = arabic
                        end
                    else
                        if char.arabic then
                            report("%U : resetting arabic from %a, %a",unicode,char.arabic,description)
                            char.arabic = nil
                        end
                    end
                    if combining then
                        if combining ~= char.combining then
                            report("%U : setting combining to %a, %a",unicode,combining,description)
                            char.combining = combining
                        end
                    else
                        if char.combining then
                            -- NOTE(review): unlike the other fields this only
                            -- reports; combining is never actually reset here —
                            -- confirm that is intended before "fixing" it
                            report("%U : resetting combining from %a, %a",unicode,char.combining,description)
                        end
                    end
                    if specials then
                        if not char.specials or not are_equal(specials,char.specials) then
                            local t = { specials[1] } for i=2,#specials do t[i] = formatters["%U"](specials[i]) end
                            report("%U : setting specials to % + t, %a",unicode,t,description)
                            char.specials = specials
                        end
                    else
                        local specials = char.specials
                        if specials then
                            local t = { } for i=2,#specials do t[i] = formatters["%U"](specials[i]) end
                            if false then
                                char.comment = nil
                                report("%U : resetting specials from % + t, %a",unicode,t,description)
                            else
                                -- keep existing specials but flag them for a manual check
                                local comment = char.comment
                                if not comment then
                                    char.comment = "check special"
                                elseif not find(comment,"check special") then
                                    char.comment = comment .. ", check special"
                                end
                            end
                            -- report("%U : check specials % + t, %a",unicode,t,description)
                        end
                    end
                end
                --
                -- derive a math visual style (bold/italic) from the name
                local visual = char.visual
                if not visual and find(description,"MATH") then
                    if find(description,"BOLD ITALIC") then
                        visual = "bi"
                    elseif find(description,"ITALIC") then
                        visual = "it"
                    elseif find(description,"BOLD") then
                        visual = "bf"
                    end
                    if visual then
                        report("%U : setting visual to %a, %a",unicode,visual,description)
                        char.visual = visual
                    end
                end
                -- mathextensible
                if category == "sm" or (category == "so" and char.mathclass) then
                    local mathextensible = char.mathextensible
                    if mathextensible then
                        -- already done
                    elseif find(description,"ABOVE") then
                        -- skip
                    elseif find(description,"ARROWHEAD") then
                        -- skip
                    elseif find(description,"HALFWIDTH") then
                        -- skip
                    elseif find(description,"ANGLE") then
                        -- skip
                    elseif find(description,"THROUGH") then
                        -- skip
                    elseif find(description,"ARROW") then
                        -- classify the arrow by the direction words in its name
                        local u = find(description,"UP")
                        local d = find(description,"DOWN")
                        local l = find(description,"LEFT")
                        local r = find(description,"RIGHT")
                        if find(description,"ARROWHEAD") then
                            -- skip
                        elseif find(description,"HALFWIDTH") then
                            -- skip
                        elseif u and d then
                            if l or r then
                                mathextensible = 'm' -- mixed
                            else
                                mathextensible = 'v' -- vertical
                            end
                        elseif u then
                            if l or r then
                                mathextensible = 'm' -- mixed
                            else
                                mathextensible = "u" -- up
                            end
                        elseif d then
                            if l or r then
                                mathextensible = 'm' -- mixed
                            else
                                mathextensible = "d" -- down
                            end
                        elseif l and r then
                            mathextensible = "h" -- horizontal
                        elseif r then
                            mathextensible = "r" -- right
                        elseif l then
                            mathextensible = "l" -- left
                        end
                        if mathextensible then
                            report("%U : setting mathextensible to %a, %a",unicode,mathextensible,description)
                            char.mathextensible = mathextensible
                        end
                    end
                end
            end
        end
    end
    -- we need the hash .. add missing specials
    for unicode, data in table.sortedhash(characterdata) do
        if not data.specials or data.comment and find(data.comment,"check special") then
            local description = data.description
            local b, m = match(description,"^(.+) WITH (.+)$")
            if b and m and (find(b,"^LATIN") or find(b,"^CYRILLIC")) then
                local base = descriptions[b]
                local mark = descriptions[m]
                if not mark and m == "STROKE" then
                    mark = descriptions["SOLIDUS"] -- SLASH
                end
                if base and mark then
                    -- report("adding extra char special for %a",description)
                    data.specials = { "with", base, mark }
                    data.comment  = nil
                end
            end
        end
    end
    --
    -- standardized variants: "XXXX YYYY; addendum" records
    for i=1,#standardizedvariants do
        local si = standardizedvariants[i]
        local pair, addendum = si[1], strip(si[2])
        local first, second = lpegmatch(split_space_two,pair) -- string.splitup(pair," ")
        first  = tonumber(first,16)
        second = tonumber(second,16)
        if first then
            local d = characterdata[first]
            if d then
                local v = d.variants
                local v = rawget(d,"variants") -- NOTE(review): shadows the line above; the rawget is what counts
                if not v then
                    v = { }
                    d.variants = v
                end
                if not v[second] then
                    report("%U : adding variant %U as %s, %a",first,second,addendum,d.description)
                    v[second] = addendum
                end
            end
        end
    end
    -- entries that only carry variants inherit the rest from a metacharacter
    for unicode, ud in table.sortedpairs(characterdata) do
        if not rawget(ud,"category") and rawget(ud,"variants") then
            -- report("stripping %U (variant, takes from metacharacter)",unicode)
            characterdata[unicode] = nil
        end
    end
end
496 497
-- everything in char-def.lua before the characters.data table; captured
-- by scripts.unicode.load() and written back in front of the new data
-- by scripts.unicode.save()
local preamble
498 499
-- Parse one UNIDATA style text file into a table. With index == true the
-- result maps codepoints (single "XXXX" fields or "XXXX..YYYY" ranges)
-- to their field lists; otherwise the records are collected in file
-- order. Comment-only and empty lines are ignored.
local function splitdefinition(str,index)
    local lines  = splitlines(str)
    local result = { }
    local count  = 0
    for i=1,#lines do
        -- strip trailing "# ..." comments
        local entry = gsub(lines[i]," *#.*$","")
        if entry ~= "" then
            local fields = lpegmatch(split_colon_table,entry) -- split(entry,";")
            if not index then
                -- positional mode
                count = count + 1
                result[count] = fields
            else
                local first = fields[1]
                local slot  = tonumber(first,16)
                if slot then
                    result[slot] = fields
                else
                    -- local b, e = match(first,"^([^%.]+)%.%.([^%.]+)$")
                    local from, upto = lpegmatch(split_range_two,first)
                    if from and upto then
                        for slot=tonumber(from,16),tonumber(upto,16) do
                            result[slot] = fields
                        end
                    else
                        report("problem: %s",entry)
                    end
                end
            end
        end
    end
    return result
end
538 539
-- Turn Index.txt lines of the form "name, qualifier<TAB>HEX" into a
-- hash mapping the normalized name ("qualifier name", single spaced,
-- stripped) to its codepoint.
local function splitindex(str)
    -- ok, quick and dirty ... could be a nice lpeg instead
    local names = { }
    local lines = splitlines(str)
    for i=1,#lines do
        local head, rest, code = match(lines[i],"([^%,]+)%,?(.-)\t(.*)")
        if head and rest and code then
            -- "a, with acute" becomes "with acute a"
            local key = gsub(strip(rest .. " " .. head),"%s+"," ")
            names[key] = tonumber(code,16)
        end
    end
    return names
end
554 555
-- Load the current char-def.lua (plus char-ini/utf/cjk for the helpers
-- they bring along), remember its preamble, then locate, read and parse
-- the UNIDATA text files into the global 'texttables'. Returns true on
-- success, false when char-def.lua could not be loaded.
function scripts.unicode.load()
    local fullname = resolvers.findfile("char-def.lua")
    report("using: %s",fullname)
    local data = io.loaddata(fullname)
    if data then
        loadstring(data)() -- defines characters.data
        --
        local fullname = resolvers.findfile("char-ini.lua")
        report("using: %s",fullname)
        dofile(fullname)
        --
        local fullname = resolvers.findfile("char-utf.lua")
        report("using: %s",fullname)
        dofile(fullname)
        --
        local fullname = resolvers.findfile("char-cjk.lua")
        report("using: %s",fullname)
        dofile(fullname)
        --
        -- everything before the data table itself, reused by save()
        preamble = gsub(data,"characters%.data%s*=%s*%{.*","")
        --
        -- resolved paths of the UNIDATA inputs ("" when not found)
        textfiles = {
            unicodedata          = resolvers.findfile("unicodedata.txt") or "",
            bidimirroring        = resolvers.findfile("bidimirroring.txt") or "",
            linebreak            = resolvers.findfile("linebreak.txt") or "",
            eastasianwidth       = resolvers.findfile("eastasianwidth.txt") or "",
            standardizedvariants = resolvers.findfile("standardizedvariants.txt") or "",
            arabicshaping        = resolvers.findfile("arabicshaping.txt") or "",
            casefolding          = resolvers.findfile("casefolding.txt") or "",
            index                = resolvers.findfile("index.txt") or "",
        }
        --
        -- raw file contents ("" for missing files)
        textdata = {
            unicodedata          = textfiles.unicodedata          ~= "" and io.loaddata(textfiles.unicodedata)          or "",
            bidimirroring        = textfiles.bidimirroring        ~= "" and io.loaddata(textfiles.bidimirroring)        or "",
            linebreak            = textfiles.linebreak            ~= "" and io.loaddata(textfiles.linebreak)            or "",
            eastasianwidth       = textfiles.eastasianwidth       ~= "" and io.loaddata(textfiles.eastasianwidth)       or "",
            standardizedvariants = textfiles.standardizedvariants ~= "" and io.loaddata(textfiles.standardizedvariants) or "",
            arabicshaping        = textfiles.arabicshaping        ~= "" and io.loaddata(textfiles.arabicshaping)        or "",
            casefolding          = textfiles.casefolding          ~= "" and io.loaddata(textfiles.casefolding)          or "",
            index                = textfiles.index                ~= "" and io.loaddata(textfiles.index)                or "",
        }
        -- parsed tables; standardizedvariants is positional, the others
        -- are indexed by codepoint (global on purpose: update()/extras()
        -- read it)
        texttables = {
            unicodedata          = splitdefinition(textdata.unicodedata,true),
            bidimirroring        = splitdefinition(textdata.bidimirroring,true),
            linebreak            = splitdefinition(textdata.linebreak,true),
            eastasianwidth       = splitdefinition(textdata.eastasianwidth,true),
            standardizedvariants = splitdefinition(textdata.standardizedvariants,false),
            arabicshaping        = splitdefinition(textdata.arabicshaping,true),
            casefolding          = splitdefinition(textdata.casefolding,true),
            index                = splitindex(textdata.index),
        }
        --
        for k, v in table.sortedhash(textfiles) do
            report("using: %s",v)
        end
        return true
    else
        preamble = nil
        return false
    end
end
617 618
-- local variants_emoji={
619
-- [0xFE0E]="text style",
620
-- [0xFE0F]="emoji style",
621
-- }
622
--
623
-- local variants_forms={
624
-- [0xFE00]="corner-justified form",
625
-- [0xFE01]="centered form",
626
-- }
627 628
-- local variants_style={
629
-- [0xFE00]="chancery style",
630
-- [0xFE01]="roundhand style",
631
-- }
632 633
-- Serialize the updated characters.data into 'filename', prefixed with
-- the preamble captured by load(). Recurring variant subtables are
-- collapsed into references to the shared variants_* tables (see the
-- commented definitions above); no-op when load() did not succeed.
function scripts.unicode.save(filename)
    if preamble then
        local data = table.serialize(characters.data,"characters.data", { hexify = true, noquotes = true })
        data = gsub(data,"%{%s+%[0xFE0E%]=\"text style\",%s+%[0xFE0F%]=\"emoji style\",%s+%}","variants_emoji")
        data = gsub(data,"%{%s+%[0xFE00%]=\"corner%-justified form\",%s+%[0xFE01%]=\"centered form\",%s+%}","variants_forms")
        data = gsub(data,"%{%s+%[0xFE00%]=\"chancery style\",%s+%[0xFE01%]=\"roundhand style\",%s+%}","variants_style")
        data = gsub(data,"%{%s+%[0xFE00%]=\"dotted form\",%s+%}","variants_dotted")
        io.savedata(filename,preamble .. data)
    end
end
643 644
-- Compare characters.blocks against Blocks.txt (reporting added and
-- changed blocks, and names that did not end up in the hash) and extend
-- the per-character synonyms lists with the names from Index.txt.
function scripts.unicode.extras() -- old code
    --
    -- 0000..007F; Basic Latin
    -- 0080..00FF; Latin-1 Supplement
    -- 0100..017F; Latin Extended-A
    --
    local fullname = resolvers.findfile("blocks.txt") or ""
    if fullname ~= "" then
        local data   = io.loaddata(fullname)
        local lines  = splitlines(data)
        local map    = { }
        local blocks = characters.blocks
        local result = { }
        for i=1,#lines do
            local line = gsub(lines[i]," *#.*$","") -- strip comments
            if line ~= "" then
                local specification = lpegmatch(split_colon_table,line) -- split(s,";")
                local range         = specification[1]
                local description   = specification[2]
                if range and description then
                    local start, stop = lpegmatch(split_range_two,range)
                    if start and stop then
                        local start = tonumber(start,16)
                        local stop  = tonumber(stop,16)
                        -- normalized key: lowercase letters only
                        local name  = gsub(lower(description),"[^a-z]+","")
                        if start and stop then
                            local b = blocks[name]
                            if not b then
                                result[#result+1] = formatters[ [[+ block: ["%s"] = { first = 0x%05X, last = 0x%05X, description = "%S" }]] ](name,start,stop,description)
                                blocks[name] = { first = start, last = stop, description = description }
                            elseif b.first ~= start or b.last ~= stop or b.description ~= description then
                                result[#result+1] = formatters[ [[? block: ["%s"] = { first = 0x%05X, last = 0x%05X, description = "%S" }]] ](name,start,stop,description)
                            end
                        end
                        map[#map+1] = name
                    end
                end
            end
        end
        sort(result)
        for i=1,#result do
            report(result[i])
        end
        -- names seen in blocks.txt that are not in characters.blocks
        sort(map)
        for i=1,#map do
            local m = map[i]
            if not blocks[m] then
                report("obsolete block %a",m)
            end
        end
    end
    --
    local index  = texttables.index
    local blocks = characters.blocks
    local data   = characters.data
    -- keep only the all-lowercase keys of the name index
    for k, v in next, index do
        if k ~= lower(k) then
            index[k] = nil
        end
    end
    -- for k, v in next, data do
    --     v.synonym = nil
    --     v.synonyms = nil
    -- end
    -- register index names that differ from the description as synonyms
    for k, v in table.sortedhash(index) do
        local d = data[v]
        if d and d.description ~= upper(k) then
            local synonyms = d.synonyms
            if synonyms then
                local n = #synonyms
                local f = false
                for i=1,n do
                    if synonyms[i] == k then
                        f = true
                        break
                    end
                end
                if not f then
                    synonyms[n+1] = k
                end
                -- synonyms = table.unique(synonyms)
                -- d.synonyms = synonyms
                sort(synonyms)
            else
                d.synonyms = { k }
            end
        end
    end
end
733 734
do

    -- lpeg mini grammar for emoji-test.txt: every "fully-qualified" (or
    -- "non-fully-qualified") line yields a record with the component
    -- codepoints and the lowercased description taken from behind the
    -- "#" marker.

    local space     = P(" ")
    local spaces    = space^0
    local semicolon = P(";")
    local hash      = P("#")
    local newline   = S("\n\r")

    -- one hexadecimal codepoint (captured as number) plus spaces
    local unicode    = Cs(R("09","AF")^1) / function(n) return tonumber(n,16) end
                     * spaces
    local components = Ct(unicode^1)

    -- local rubish_a = semicolon
    -- * spaces
    -- * P("Emoji_ZWJ_Sequence")
    -- * spaces
    -- * semicolon
    -- * spaces
    -- local description = C((1 - (spaces * (hash+newline)))^1)
    -- local rubish_b = (1-newline)^0
    -- * newline^1
    --
    -- local pattern_1 = Ct ( (
    -- Cf ( Ct("") *
    -- Cg (Cc("components") * components)
    -- * rubish_a
    -- * Cg (Cc("description") * description )
    -- * rubish_b
    -- , rawset)
    -- + P(1) )^1 )

    local rubish_a    = semicolon
                      * spaces
                      * P("non-")^0 * P("fully-qualified")
                      * spaces
                      * hash
                      * spaces
    local textstring  = C((1-space)^1)
                      * spaces
    local description = ((1-(spaces*newline))^1) / string.lower
    local rubish_b    = (1-newline)^0
                      * newline^1

    -- fold each matching line into { components = ..., textstring = ...,
    -- description = ... }; everything else is skipped byte by byte
    local pattern_2 = Ct ( (
        Cf ( Ct("") *
             Cg (Cc("components") * components)
           * rubish_a
           * Cg (Cc("textstring") * textstring)
           * Cg (Cc("description") * description)
           * rubish_b
        , rawset)
      + P(1) )^1 )

    -- Build 'filename' (char-emj-new.lua): parse emoji-test.txt into a
    -- hash that maps a normalized description to its codepoint sequence
    -- and graft that onto the non-table part of the current char-emj.lua.
    -- Silently returns when emoji-test.txt cannot be found.
    function scripts.unicode.emoji(filename)

        local name = resolvers.findfile("emoji-test.txt") or ""
        if name == "" then
            return
        end
        local l = io.loaddata(name)
        local t = lpegmatch(pattern_2,l)

        local hash = { }

        -- presumably strips version prefixes like "e1.0 " from the
        -- (lowercased) descriptions -- TODO confirm against the data
        local crap = lpeg.P("e") * lpeg.R("09","..","09")^1 * lpeg.P(" ")^1

        local replace = lpeg.replacer {
            [crap] = "",
            ["#"]  = "hash",
            ["*"]  = "asterisk",
        }

        for i=1,#t do
            local v = t[i]
            local d = v.description
            local k = lpegmatch(replace,d) or d
            hash[k] = v.components
        end
        local new = table.serialize(hash,"return", { hexify = true })
        local old = io.loaddata(resolvers.findfile("char-emj.lua"))
        if old and old ~= "" then
            -- keep the old preamble, replace the returned table
            new = gsub(old,"^(.-)return .*$","%1" .. new)
        end
        io.savedata(filename,new)
    end

end
821 822
-- the action
823 824
-- When --exporthelp is given only export the help info; otherwise load
-- char-def.lua cum suis, merge the unicode data files into it and write
-- char-def-new.lua / char-emj-new.lua (to be inspected and renamed by
-- hand).

local filename = environment.files[1]

if environment.arguments.exporthelp then
    application.export(environment.arguments.exporthelp,filename)
else
    report("start working on %a, input char-def.lua",lfs.currentdir())
    if scripts.unicode.load() then
        scripts.unicode.update()
        scripts.unicode.extras()
        scripts.unicode.save("char-def-new.lua")
        scripts.unicode.emoji("char-emj-new.lua")
        report("saved file %a","char-def-new.lua")
        report("saved file %a (current 14.0, check for updates, see above!)","char-emj-new.lua")
    else
        report("nothing to do")
    end
    report("stop working on %a\n",lfs.currentdir())
end
842