char-ini.lua /size: 91 Kb    last modification: 2020-07-01 14:35
1
-- Module registration: create the global registry on first load and record
-- this module's metadata under its name.
if not modules then
    modules = { }
end

modules['char-ini'] = {
    version   = 1.001,
    comment   = "companion to char-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}
8 9
-- todo: make two files, one for format generation, one for format use
10
-- todo: move some to char-utf
11 12
-- we can remove the tag range starting at 0xE0000 (special applications)
13 14
-- Localize library functions for speed (locals are register accesses,
-- globals are table lookups).
local utfchar, utfbyte, utfvalues, ustring, utotable = utf.char, utf.byte, utf.values, utf.ustring, utf.totable
local concat, unpack, tohash, insert = table.concat, table.unpack, table.tohash, table.insert
local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset
local format, lower, gsub, find = string.format, string.lower, string.gsub, string.find
local P, R, S, C, Cs, Ct, Cc, V = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cs, lpeg.Ct, lpeg.Cc, lpeg.V
local formatters = string.formatters

-- Make sure the big character table is available before we build on it.
if not characters then require("char-def") end

local lpegpatterns          = lpeg.patterns
local lpegmatch             = lpeg.match
local utf8byte              = lpegpatterns.utf8byte
local utf8character         = lpegpatterns.utf8character

local utfchartabletopattern = lpeg.utfchartabletopattern

local allocate              = utilities.storage.allocate
local mark                  = utilities.storage.mark

local setmetatableindex     = table.setmetatableindex
34 35
-- Tracker toggle for verbose reporting while defining characters.
local trace_defining = false

trackers.register("characters.defining", function(v)
    -- was: characters_defining = v — an accidental global that left the
    -- local 'trace_defining' permanently false, so enabling the tracker
    -- had no effect
    trace_defining = v
end)
36 37
-- Channel for messages from this module.
local report_defining = logs.reporter("characters")
38 39
--[[ldx-- 40<p>This module implements some methods and creates additional data structures 41from the big character table that we use for all kinds of purposes: 42<type>char-def.lua</type>.</p> 43 44<p>We assume that at this point <type>characters.data</type> is already 45loaded!</p> 46--ldx]]
--
47 48
-- todo: in 'char-def.lua' assume defaults:
49
--
50
-- directions = l
51
-- cjkwd = a
52
-- linebreak = al
53 54
-- Ensure the namespace exists, then localize it and its data table.
characters       = characters or { }
local characters = characters
local data       = characters.data
57 58
-- Without 'char-def.lua' loaded there is nothing we can do: bail out hard.
if data then
    mark(data) -- why does this fail
else
    report_defining("fatal error: 'char-def.lua' is not loaded")
    os.exit()
end
64 65
--[[ldx-- 66Extending the table. 67--ldx]]
--
68 69
-- In ConTeXt we extend the main table with entries for private slots.
if context and not characters.private then

    require("char-prv")

    for unicode, chardata in next, characters.private do
        data[unicode] = chardata
    end

end
78 79
--[[ldx-- 80<p>This converts a string (if given) into a number.</p> 81--ldx]]
--
82 83
-- Matches a "0x" or "U+" prefixed hex string and yields its numeric value.
-- Only uppercase A-F is accepted here; callers fall back to utfbyte for
-- anything that does not match.
local pattern = (P("0x") + P("U+"))
              * ((R("09", "AF")^1 * P(-1)) / function(s) return tonumber(s, 16) end)

lpegpatterns.chartonumber = pattern
86 87
-- Convert a character specification to a number.
-- Non-strings pass through (nil becomes 0); "0x.."/"U+.." strings are
-- parsed as hex; other strings are taken as utf characters.
local function chartonumber(k)
    if type(k) ~= "string" then
        return k or 0
    end
    local n = lpegmatch(pattern, k)
    if n then
        return utfbyte(n)
    end
    return utfbyte(k) or 0
end
99 100
-- Convert a codepoint specification to a utf character.
-- Numbers map straight to utfchar; "0x.."/"U+.." strings are parsed first;
-- anything unresolvable is handed back unchanged.
local function charfromnumber(k)
    if type(k) == "number" then
        return utfchar(k) or ""
    end
    local n = lpegmatch(pattern, k)
    if n then
        return utfchar(n)
    end
    return k
end
112 113
--~ print(chartonumber(97), chartonumber("a"), chartonumber("0x61"), chartonumber("U+61"))
114 115
-- Public conversion helpers.
characters.tonumber   = chartonumber
characters.fromnumber = charfromnumber
117 118
-- Stub entry returned for slots we know nothing about.
local private = {
    description = "PRIVATE SLOT",
}

-- Registered codepoint ranges whose extenders can synthesize entries on
-- demand (consulted by the __index handler installed on 'data').
local ranges = allocate()
characters.ranges = ranges
124 125
-- Fallback lookup on the character data table:
-- * string keys are normalized to a codepoint ("0x..", "U+.." or a utf
--   character) and retried,
-- * unknown numeric slots below 0xF0000 are offered to the registered
--   ranges, whose extender may synthesize (and cache) an entry,
-- * everything else resolves to the shared 'private' stub.
setmetatableindex(data, function(t, k)
    local kind = type(k)
    if kind == "string" then
        k = lpegmatch(pattern, k) or utfbyte(k)
        if not k then
            return private
        end
        local known = rawget(t, k)
        if known then
            return known
        end
        kind = "number" -- retry the resolved codepoint against the ranges
    end
    if kind == "number" and k < 0xF0000 then
        for i = 1, #ranges do
            local range = ranges[i]
            if k >= range.first and k <= range.last then
                local extender = range.extender
                if extender then
                    local v = extender(k)
                    t[k] = v -- cache so the metamethod fires once per slot
                    return v
                end
            end
        end
    end
    return private -- handy for when we loop over characters in fonts and check for a property
end)
155 156
-- Shared properties for all synthesized variant-selector entries.
local variant_selector_metatable = {
    category  = "mn",
    cjkwd     = "a",
    direction = "nsm",
    linebreak = "cm",
}
162 163
-- This saves a bit of memory and also serves as example.

-- Use the 'formatters' local already set up at the top of this file
-- (consistent with the file's aliasing convention) instead of reaching
-- for string.formatters again; the two are the same table.
local f_variant = formatters["VARIATION SELECTOR-0x%04X"]
166 167
-- Variant selectors VS1-VS16: synthesize entries on demand instead of
-- storing sixteen nearly identical tables.
insert(characters.ranges, {
    first    = 0xFE00,
    last     = 0xFE0F,
    name     = "variant selector",
    extender = function(k)
        return setmetatable({
            description = f_variant(k - 0xFE00 + 0x0001), -- VS1 .. VS16
            unicodeslot = k,
        }, variant_selector_metatable)
    end,
})
180 181
-- Variant selector supplement VS17-VS256 (plane 14), same on-demand scheme.
insert(characters.ranges, {
    first    = 0xE0100,
    last     = 0xE01EF,
    name     = "variant selector extension",
    extender = function(k)
        return setmetatable({
            description = f_variant(k - 0xE0100 + 0x0011), -- VS17 and up
            unicodeslot = k,
        }, variant_selector_metatable)
    end,
})
194 195
local
blocks
=
allocate
{
196
[
"
adlam
"
]
=
{
first
=
0x1E900
,
last
=
0x1E95F
,
description
=
"
Adlam
"
}
,
197
[
"
aegeannumbers
"
]
=
{
first
=
0x10100
,
last
=
0x1013F
,
description
=
"
Aegean Numbers
"
}
,
198
[
"
ahom
"
]
=
{
first
=
0x11700
,
last
=
0x1173F
,
description
=
"
Ahom
"
}
,
199
[
"
alchemicalsymbols
"
]
=
{
first
=
0x1F700
,
last
=
0x1F77F
,
description
=
"
Alchemical Symbols
"
}
,
200
[
"
alphabeticpresentationforms
"
]
=
{
first
=
0x0FB00
,
last
=
0x0FB4F
,
otf
=
"
latn
"
,
description
=
"
Alphabetic Presentation Forms
"
}
,
201
[
"
anatolianhieroglyphs
"
]
=
{
first
=
0x14400
,
last
=
0x1467F
,
description
=
"
Anatolian Hieroglyphs
"
}
,
202
[
"
ancientgreekmusicalnotation
"
]
=
{
first
=
0x1D200
,
last
=
0x1D24F
,
otf
=
"
grek
"
,
description
=
"
Ancient Greek Musical Notation
"
}
,
203
[
"
ancientgreeknumbers
"
]
=
{
first
=
0x10140
,
last
=
0x1018F
,
otf
=
"
grek
"
,
description
=
"
Ancient Greek Numbers
"
}
,
204
[
"
ancientsymbols
"
]
=
{
first
=
0x10190
,
last
=
0x101CF
,
otf
=
"
grek
"
,
description
=
"
Ancient Symbols
"
}
,
205
[
"
arabic
"
]
=
{
first
=
0x00600
,
last
=
0x006FF
,
otf
=
"
arab
"
,
description
=
"
Arabic
"
}
,
206
[
"
arabicextendeda
"
]
=
{
first
=
0x008A0
,
last
=
0x008FF
,
description
=
"
Arabic Extended-A
"
}
,
207
[
"
arabicmathematicalalphabeticsymbols
"
]
=
{
first
=
0x1EE00
,
last
=
0x1EEFF
,
description
=
"
Arabic Mathematical Alphabetic Symbols
"
}
,
208
[
"
arabicpresentationformsa
"
]
=
{
first
=
0x0FB50
,
last
=
0x0FDFF
,
otf
=
"
arab
"
,
description
=
"
Arabic Presentation Forms-A
"
}
,
209
[
"
arabicpresentationformsb
"
]
=
{
first
=
0x0FE70
,
last
=
0x0FEFF
,
otf
=
"
arab
"
,
description
=
"
Arabic Presentation Forms-B
"
}
,
210
[
"
arabicsupplement
"
]
=
{
first
=
0x00750
,
last
=
0x0077F
,
otf
=
"
arab
"
,
description
=
"
Arabic Supplement
"
}
,
211
[
"
armenian
"
]
=
{
first
=
0x00530
,
last
=
0x0058F
,
otf
=
"
armn
"
,
description
=
"
Armenian
"
}
,
212
[
"
arrows
"
]
=
{
first
=
0x02190
,
last
=
0x021FF
,
description
=
"
Arrows
"
}
,
213
[
"
avestan
"
]
=
{
first
=
0x10B00
,
last
=
0x10B3F
,
description
=
"
Avestan
"
}
,
214
[
"
balinese
"
]
=
{
first
=
0x01B00
,
last
=
0x01B7F
,
otf
=
"
bali
"
,
description
=
"
Balinese
"
}
,
215
[
"
bamum
"
]
=
{
first
=
0x0A6A0
,
last
=
0x0A6FF
,
description
=
"
Bamum
"
}
,
216
[
"
bamumsupplement
"
]
=
{
first
=
0x16800
,
last
=
0x16A3F
,
description
=
"
Bamum Supplement
"
}
,
217
[
"
basiclatin
"
]
=
{
first
=
0x00000
,
last
=
0x0007F
,
otf
=
"
latn
"
,
description
=
"
Basic Latin
"
}
,
218
[
"
bassavah
"
]
=
{
first
=
0x16AD0
,
last
=
0x16AFF
,
description
=
"
Bassa Vah
"
}
,
219
[
"
batak
"
]
=
{
first
=
0x01BC0
,
last
=
0x01BFF
,
description
=
"
Batak
"
}
,
220
[
"
bengali
"
]
=
{
first
=
0x00980
,
last
=
0x009FF
,
otf
=
"
beng
"
,
description
=
"
Bengali
"
}
,
221
[
"
bhaiksuki
"
]
=
{
first
=
0x11C00
,
last
=
0x11C6F
,
description
=
"
Bhaiksuki
"
}
,
222
[
"
blockelements
"
]
=
{
first
=
0x02580
,
last
=
0x0259F
,
otf
=
"
bopo
"
,
description
=
"
Block Elements
"
}
,
223
[
"
bopomofo
"
]
=
{
first
=
0x03100
,
last
=
0x0312F
,
otf
=
"
bopo
"
,
description
=
"
Bopomofo
"
}
,
224
[
"
bopomofoextended
"
]
=
{
first
=
0x031A0
,
last
=
0x031BF
,
otf
=
"
bopo
"
,
description
=
"
Bopomofo Extended
"
}
,
225
[
"
boxdrawing
"
]
=
{
first
=
0x02500
,
last
=
0x0257F
,
description
=
"
Box Drawing
"
}
,
226
[
"
brahmi
"
]
=
{
first
=
0x11000
,
last
=
0x1107F
,
description
=
"
Brahmi
"
}
,
227
[
"
braillepatterns
"
]
=
{
first
=
0x02800
,
last
=
0x028FF
,
otf
=
"
brai
"
,
description
=
"
Braille Patterns
"
}
,
228
[
"
buginese
"
]
=
{
first
=
0x01A00
,
last
=
0x01A1F
,
otf
=
"
bugi
"
,
description
=
"
Buginese
"
}
,
229
[
"
buhid
"
]
=
{
first
=
0x01740
,
last
=
0x0175F
,
otf
=
"
buhd
"
,
description
=
"
Buhid
"
}
,
230
[
"
byzantinemusicalsymbols
"
]
=
{
first
=
0x1D000
,
last
=
0x1D0FF
,
otf
=
"
byzm
"
,
description
=
"
Byzantine Musical Symbols
"
}
,
231
[
"
carian
"
]
=
{
first
=
0x102A0
,
last
=
0x102DF
,
description
=
"
Carian
"
}
,
232
[
"
caucasianalbanian
"
]
=
{
first
=
0x10530
,
last
=
0x1056F
,
description
=
"
Caucasian Albanian
"
}
,
233
[
"
chakma
"
]
=
{
first
=
0x11100
,
last
=
0x1114F
,
description
=
"
Chakma
"
}
,
234
[
"
cham
"
]
=
{
first
=
0x0AA00
,
last
=
0x0AA5F
,
description
=
"
Cham
"
}
,
235
[
"
cherokee
"
]
=
{
first
=
0x013A0
,
last
=
0x013FF
,
otf
=
"
cher
"
,
description
=
"
Cherokee
"
}
,
236
[
"
cherokeesupplement
"
]
=
{
first
=
0x0AB70
,
last
=
0x0ABBF
,
description
=
"
Cherokee Supplement
"
}
,
237
[
"
chesssymbols
"
]
=
{
first
=
0x1FA00
,
last
=
0x1FA6F
,
description
=
"
Chess Symbols
"
}
,
238
[
"
chorasmian
"
]
=
{
first
=
0x10FB0
,
last
=
0x10FDF
,
description
=
"
Chorasmian
"
}
,
239
[
"
cjkcompatibility
"
]
=
{
first
=
0x03300
,
last
=
0x033FF
,
otf
=
"
hang
"
,
description
=
"
CJK Compatibility
"
}
,
240
[
"
cjkcompatibilityforms
"
]
=
{
first
=
0x0FE30
,
last
=
0x0FE4F
,
otf
=
"
hang
"
,
description
=
"
CJK Compatibility Forms
"
}
,
241
[
"
cjkcompatibilityideographs
"
]
=
{
first
=
0x0F900
,
last
=
0x0FAFF
,
otf
=
"
hang
"
,
description
=
"
CJK Compatibility Ideographs
"
}
,
242
[
"
cjkcompatibilityideographssupplement
"
]
=
{
first
=
0x2F800
,
last
=
0x2FA1F
,
otf
=
"
hang
"
,
description
=
"
CJK Compatibility Ideographs Supplement
"
}
,
243
[
"
cjkradicalssupplement
"
]
=
{
first
=
0x02E80
,
last
=
0x02EFF
,
otf
=
"
hang
"
,
description
=
"
CJK Radicals Supplement
"
}
,
244
[
"
cjkstrokes
"
]
=
{
first
=
0x031C0
,
last
=
0x031EF
,
otf
=
"
hang
"
,
description
=
"
CJK Strokes
"
}
,
245
[
"
cjksymbolsandpunctuation
"
]
=
{
first
=
0x03000
,
last
=
0x0303F
,
otf
=
"
hang
"
,
description
=
"
CJK Symbols and Punctuation
"
}
,
246
[
"
cjkunifiedideographs
"
]
=
{
first
=
0x04E00
,
last
=
0x09FFF
,
otf
=
"
hang
"
,
description
=
"
CJK Unified Ideographs
"
,
catcode
=
"
letter
"
}
,
247
[
"
cjkunifiedideographsextensiona
"
]
=
{
first
=
0x03400
,
last
=
0x04DBF
,
otf
=
"
hang
"
,
description
=
"
CJK Unified Ideographs Extension A
"
}
,
248
[
"
cjkunifiedideographsextensionb
"
]
=
{
first
=
0x20000
,
last
=
0x2A6DF
,
otf
=
"
hang
"
,
description
=
"
CJK Unified Ideographs Extension B
"
}
,
249
[
"
cjkunifiedideographsextensionc
"
]
=
{
first
=
0x2A700
,
last
=
0x2B73F
,
description
=
"
CJK Unified Ideographs Extension C
"
}
,
250
[
"
cjkunifiedideographsextensiond
"
]
=
{
first
=
0x2B740
,
last
=
0x2B81F
,
description
=
"
CJK Unified Ideographs Extension D
"
}
,
251
[
"
cjkunifiedideographsextensione
"
]
=
{
first
=
0x2B820
,
last
=
0x2CEAF
,
description
=
"
CJK Unified Ideographs Extension E
"
}
,
252
[
"
cjkunifiedideographsextensionf
"
]
=
{
first
=
0x2CEB0
,
last
=
0x2EBEF
,
description
=
"
CJK Unified Ideographs Extension F
"
}
,
253
[
"
cjkunifiedideographsextensiong
"
]
=
{
first
=
0x30000
,
last
=
0x3134F
,
description
=
"
CJK Unified Ideographs Extension G
"
}
,
254
[
"
combiningdiacriticalmarks
"
]
=
{
first
=
0x00300
,
last
=
0x0036F
,
description
=
"
Combining Diacritical Marks
"
}
,
255
[
"
combiningdiacriticalmarksextended
"
]
=
{
first
=
0x01AB0
,
last
=
0x01AFF
,
description
=
"
Combining Diacritical Marks Extended
"
}
,
256
[
"
combiningdiacriticalmarksforsymbols
"
]
=
{
first
=
0x020D0
,
last
=
0x020FF
,
description
=
"
Combining Diacritical Marks for Symbols
"
}
,
257
[
"
combiningdiacriticalmarkssupplement
"
]
=
{
first
=
0x01DC0
,
last
=
0x01DFF
,
description
=
"
Combining Diacritical Marks Supplement
"
}
,
258
[
"
combininghalfmarks
"
]
=
{
first
=
0x0FE20
,
last
=
0x0FE2F
,
description
=
"
Combining Half Marks
"
}
,
259
[
"
commonindicnumberforms
"
]
=
{
first
=
0x0A830
,
last
=
0x0A83F
,
description
=
"
Common Indic Number Forms
"
}
,
260
[
"
controlpictures
"
]
=
{
first
=
0x02400
,
last
=
0x0243F
,
description
=
"
Control Pictures
"
}
,
261
[
"
coptic
"
]
=
{
first
=
0x02C80
,
last
=
0x02CFF
,
otf
=
"
copt
"
,
description
=
"
Coptic
"
}
,
262
[
"
copticepactnumbers
"
]
=
{
first
=
0x102E0
,
last
=
0x102FF
,
description
=
"
Coptic Epact Numbers
"
}
,
263
[
"
countingrodnumerals
"
]
=
{
first
=
0x1D360
,
last
=
0x1D37F
,
description
=
"
Counting Rod Numerals
"
}
,
264
[
"
cuneiform
"
]
=
{
first
=
0x12000
,
last
=
0x123FF
,
otf
=
"
xsux
"
,
description
=
"
Cuneiform
"
}
,
265
[
"
cuneiformnumbersandpunctuation
"
]
=
{
first
=
0x12400
,
last
=
0x1247F
,
otf
=
"
xsux
"
,
description
=
"
Cuneiform Numbers and Punctuation
"
}
,
266
[
"
currencysymbols
"
]
=
{
first
=
0x020A0
,
last
=
0x020CF
,
description
=
"
Currency Symbols
"
}
,
267
[
"
cypriotsyllabary
"
]
=
{
first
=
0x10800
,
last
=
0x1083F
,
otf
=
"
cprt
"
,
description
=
"
Cypriot Syllabary
"
}
,
268
[
"
cyrillic
"
]
=
{
first
=
0x00400
,
last
=
0x004FF
,
otf
=
"
cyrl
"
,
description
=
"
Cyrillic
"
}
,
269
[
"
cyrillicextendeda
"
]
=
{
first
=
0x02DE0
,
last
=
0x02DFF
,
otf
=
"
cyrl
"
,
description
=
"
Cyrillic Extended-A
"
}
,
270
[
"
cyrillicextendedb
"
]
=
{
first
=
0x0A640
,
last
=
0x0A69F
,
otf
=
"
cyrl
"
,
description
=
"
Cyrillic Extended-B
"
}
,
271
[
"
cyrillicextendedc
"
]
=
{
first
=
0x01C80
,
last
=
0x01C8F
,
description
=
"
Cyrillic Extended-C
"
}
,
272
[
"
cyrillicsupplement
"
]
=
{
first
=
0x00500
,
last
=
0x0052F
,
otf
=
"
cyrl
"
,
description
=
"
Cyrillic Supplement
"
}
,
273
[
"
deseret
"
]
=
{
first
=
0x10400
,
last
=
0x1044F
,
otf
=
"
dsrt
"
,
description
=
"
Deseret
"
}
,
274
[
"
devanagari
"
]
=
{
first
=
0x00900
,
last
=
0x0097F
,
otf
=
"
deva
"
,
description
=
"
Devanagari
"
}
,
275
[
"
devanagariextended
"
]
=
{
first
=
0x0A8E0
,
last
=
0x0A8FF
,
description
=
"
Devanagari Extended
"
}
,
276
[
"
digitsarabicindic
"
]
=
{
first
=
0x00660
,
last
=
0x00669
,
math
=
true
}
,
277
-- ["digitsbengali"] = { first = 0x009E6, last = 0x009EF, math = true },
278
[
"
digitsbold
"
]
=
{
first
=
0x1D7CE
,
last
=
0x1D7D8
,
math
=
true
}
,
279
-- ["digitsdevanagari"] = { first = 0x00966, last = 0x0096F, math = true },
280
[
"
digitsdoublestruck
"
]
=
{
first
=
0x1D7D8
,
last
=
0x1D7E2
,
math
=
true
}
,
281
-- ["digitsethiopic"] = { first = 0x01369, last = 0x01371, math = true },
282
[
"
digitsextendedarabicindic
"
]
=
{
first
=
0x006F0
,
last
=
0x006F9
,
math
=
true
}
,
283
-- ["digitsgujarati"] = { first = 0x00AE6, last = 0x00AEF, math = true },
284
-- ["digitsgurmukhi"] = { first = 0x00A66, last = 0x00A6F, math = true },
285
-- ["digitskannada"] = { first = 0x00CE6, last = 0x00CEF, math = true },
286
-- ["digitskhmer"] = { first = 0x017E0, last = 0x017E9, math = true },
287
-- ["digitslao"] = { first = 0x00ED0, last = 0x00ED9, math = true },
288
[
"
digitslatin
"
]
=
{
first
=
0x00030
,
last
=
0x00039
,
math
=
true
}
,
289
-- ["digitsmalayalam"] = { first = 0x00D66, last = 0x00D6F, math = true },
290
-- ["digitsmongolian"] = { first = 0x01810, last = 0x01809, math = true },
291
[
"
digitsmonospace
"
]
=
{
first
=
0x1D7F6
,
last
=
0x1D80F
,
math
=
true
}
,
292
-- ["digitsmyanmar"] = { first = 0x01040, last = 0x01049, math = true },
293
[
"
digitsnormal
"
]
=
{
first
=
0x00030
,
last
=
0x00039
,
math
=
true
}
,
294
-- ["digitsoriya"] = { first = 0x00B66, last = 0x00B6F, math = true },
295
[
"
digitssansserifbold
"
]
=
{
first
=
0x1D7EC
,
last
=
0x1D805
,
math
=
true
}
,
296
[
"
digitssansserifnormal
"
]
=
{
first
=
0x1D7E2
,
last
=
0x1D7EC
,
math
=
true
}
,
297
-- ["digitstamil"] = { first = 0x00030, last = 0x00039, math = true }, -- no zero
298
-- ["digitstelugu"] = { first = 0x00C66, last = 0x00C6F, math = true },
299
-- ["digitsthai"] = { first = 0x00E50, last = 0x00E59, math = true },
300
-- ["digitstibetan"] = { first = 0x00F20, last = 0x00F29, math = true },
301
[
"
dingbats
"
]
=
{
first
=
0x02700
,
last
=
0x027BF
,
description
=
"
Dingbats
"
}
,
302
[
"
divesakuru
"
]
=
{
first
=
0x11900
,
last
=
0x1195F
,
description
=
"
Dives Akuru
"
}
,
303
[
"
dogra
"
]
=
{
first
=
0x11800
,
last
=
0x1184F
,
description
=
"
Dogra
"
}
,
304
[
"
dominotiles
"
]
=
{
first
=
0x1F030
,
last
=
0x1F09F
,
description
=
"
Domino Tiles
"
}
,
305
[
"
duployan
"
]
=
{
first
=
0x1BC00
,
last
=
0x1BC9F
,
description
=
"
Duployan
"
}
,
306
[
"
earlydynasticcuneiform
"
]
=
{
first
=
0x12480
,
last
=
0x1254F
,
description
=
"
Early Dynastic Cuneiform
"
}
,
307
[
"
egyptianhieroglyphformatcontrols
"
]
=
{
first
=
0x13430
,
last
=
0x1343F
,
description
=
"
Egyptian Hieroglyph Format Controls
"
}
,
308
[
"
egyptianhieroglyphs
"
]
=
{
first
=
0x13000
,
last
=
0x1342F
,
description
=
"
Egyptian Hieroglyphs
"
}
,
309
[
"
elbasan
"
]
=
{
first
=
0x10500
,
last
=
0x1052F
,
description
=
"
Elbasan
"
}
,
310
[
"
elymaic
"
]
=
{
first
=
0x10FE0
,
last
=
0x10FFF
,
description
=
"
Elymaic
"
}
,
311
[
"
emoticons
"
]
=
{
first
=
0x1F600
,
last
=
0x1F64F
,
description
=
"
Emoticons
"
}
,
312
[
"
enclosedalphanumerics
"
]
=
{
first
=
0x02460
,
last
=
0x024FF
,
description
=
"
Enclosed Alphanumerics
"
}
,
313
[
"
enclosedalphanumericsupplement
"
]
=
{
first
=
0x1F100
,
last
=
0x1F1FF
,
description
=
"
Enclosed Alphanumeric Supplement
"
}
,
314
[
"
enclosedcjklettersandmonths
"
]
=
{
first
=
0x03200
,
last
=
0x032FF
,
description
=
"
Enclosed CJK Letters and Months
"
}
,
315
[
"
enclosedideographicsupplement
"
]
=
{
first
=
0x1F200
,
last
=
0x1F2FF
,
description
=
"
Enclosed Ideographic Supplement
"
}
,
316
[
"
ethiopic
"
]
=
{
first
=
0x01200
,
last
=
0x0137F
,
otf
=
"
ethi
"
,
description
=
"
Ethiopic
"
}
,
317
[
"
ethiopicextended
"
]
=
{
first
=
0x02D80
,
last
=
0x02DDF
,
otf
=
"
ethi
"
,
description
=
"
Ethiopic Extended
"
}
,
318
[
"
ethiopicextendeda
"
]
=
{
first
=
0x0AB00
,
last
=
0x0AB2F
,
description
=
"
Ethiopic Extended-A
"
}
,
319
[
"
ethiopicsupplement
"
]
=
{
first
=
0x01380
,
last
=
0x0139F
,
otf
=
"
ethi
"
,
description
=
"
Ethiopic Supplement
"
}
,
320
[
"
generalpunctuation
"
]
=
{
first
=
0x02000
,
last
=
0x0206F
,
description
=
"
General Punctuation
"
}
,
321
[
"
geometricshapes
"
]
=
{
first
=
0x025A0
,
last
=
0x025FF
,
math
=
true
,
description
=
"
Geometric Shapes
"
}
,
322
[
"
geometricshapesextended
"
]
=
{
first
=
0x1F780
,
last
=
0x1F7FF
,
description
=
"
Geometric Shapes Extended
"
}
,
323
[
"
georgian
"
]
=
{
first
=
0x010A0
,
last
=
0x010FF
,
otf
=
"
geor
"
,
description
=
"
Georgian
"
}
,
324
[
"
georgianextended
"
]
=
{
first
=
0x01C90
,
last
=
0x01CBF
,
description
=
"
Georgian Extended
"
}
,
325
[
"
georgiansupplement
"
]
=
{
first
=
0x02D00
,
last
=
0x02D2F
,
otf
=
"
geor
"
,
description
=
"
Georgian Supplement
"
}
,
326
[
"
glagolitic
"
]
=
{
first
=
0x02C00
,
last
=
0x02C5F
,
otf
=
"
glag
"
,
description
=
"
Glagolitic
"
}
,
327
[
"
glagoliticsupplement
"
]
=
{
first
=
0x1E000
,
last
=
0x1E02F
,
description
=
"
Glagolitic Supplement
"
}
,
328
[
"
gothic
"
]
=
{
first
=
0x10330
,
last
=
0x1034F
,
otf
=
"
goth
"
,
description
=
"
Gothic
"
}
,
329
[
"
grantha
"
]
=
{
first
=
0x11300
,
last
=
0x1137F
,
description
=
"
Grantha
"
}
,
330
[
"
greekandcoptic
"
]
=
{
first
=
0x00370
,
last
=
0x003FF
,
otf
=
"
grek
"
,
description
=
"
Greek and Coptic
"
}
,
331
[
"
greekextended
"
]
=
{
first
=
0x01F00
,
last
=
0x01FFF
,
otf
=
"
grek
"
,
description
=
"
Greek Extended
"
}
,
332
[
"
gujarati
"
]
=
{
first
=
0x00A80
,
last
=
0x00AFF
,
otf
=
"
gujr
"
,
description
=
"
Gujarati
"
}
,
333
[
"
gunjalagondi
"
]
=
{
first
=
0x11D60
,
last
=
0x11DAF
,
description
=
"
Gunjala Gondi
"
}
,
334
[
"
gurmukhi
"
]
=
{
first
=
0x00A00
,
last
=
0x00A7F
,
otf
=
"
guru
"
,
description
=
"
Gurmukhi
"
}
,
335
[
"
halfwidthandfullwidthforms
"
]
=
{
first
=
0x0FF00
,
last
=
0x0FFEF
,
description
=
"
Halfwidth and Fullwidth Forms
"
}
,
336
[
"
hangulcompatibilityjamo
"
]
=
{
first
=
0x03130
,
last
=
0x0318F
,
otf
=
"
jamo
"
,
description
=
"
Hangul Compatibility Jamo
"
}
,
337
[
"
hanguljamo
"
]
=
{
first
=
0x01100
,
last
=
0x011FF
,
otf
=
"
jamo
"
,
description
=
"
Hangul Jamo
"
}
,
338
[
"
hanguljamoextendeda
"
]
=
{
first
=
0x0A960
,
last
=
0x0A97F
,
description
=
"
Hangul Jamo Extended-A
"
}
,
339
[
"
hanguljamoextendedb
"
]
=
{
first
=
0x0D7B0
,
last
=
0x0D7FF
,
description
=
"
Hangul Jamo Extended-B
"
}
,
340
[
"
hangulsyllables
"
]
=
{
first
=
0x0AC00
,
last
=
0x0D7AF
,
otf
=
"
hang
"
,
description
=
"
Hangul Syllables
"
}
,
341
[
"
hanifirohingya
"
]
=
{
first
=
0x10D00
,
last
=
0x10D3F
,
description
=
"
Hanifi Rohingya
"
}
,
342
[
"
hanunoo
"
]
=
{
first
=
0x01720
,
last
=
0x0173F
,
otf
=
"
hano
"
,
description
=
"
Hanunoo
"
}
,
343
[
"
hatran
"
]
=
{
first
=
0x108E0
,
last
=
0x108FF
,
description
=
"
Hatran
"
}
,
344
[
"
hebrew
"
]
=
{
first
=
0x00590
,
last
=
0x005FF
,
otf
=
"
hebr
"
,
description
=
"
Hebrew
"
}
,
345
[
"
highprivateusesurrogates
"
]
=
{
first
=
0x0DB80
,
last
=
0x0DBFF
,
description
=
"
High Private Use Surrogates
"
}
,
346
[
"
highsurrogates
"
]
=
{
first
=
0x0D800
,
last
=
0x0DB7F
,
description
=
"
High Surrogates
"
}
,
347
[
"
hiragana
"
]
=
{
first
=
0x03040
,
last
=
0x0309F
,
otf
=
"
kana
"
,
description
=
"
Hiragana
"
}
,
348
[
"
ideographicdescriptioncharacters
"
]
=
{
first
=
0x02FF0
,
last
=
0x02FFF
,
description
=
"
Ideographic Description Characters
"
}
,
349
[
"
ideographicsymbolsandpunctuation
"
]
=
{
first
=
0x16FE0
,
last
=
0x16FFF
,
description
=
"
Ideographic Symbols and Punctuation
"
}
,
350
[
"
imperialaramaic
"
]
=
{
first
=
0x10840
,
last
=
0x1085F
,
description
=
"
Imperial Aramaic
"
}
,
351
[
"
indicsiyaqnumbers
"
]
=
{
first
=
0x1EC70
,
last
=
0x1ECBF
,
description
=
"
Indic Siyaq Numbers
"
}
,
352
[
"
inscriptionalpahlavi
"
]
=
{
first
=
0x10B60
,
last
=
0x10B7F
,
description
=
"
Inscriptional Pahlavi
"
}
,
353
[
"
inscriptionalparthian
"
]
=
{
first
=
0x10B40
,
last
=
0x10B5F
,
description
=
"
Inscriptional Parthian
"
}
,
354
[
"
ipaextensions
"
]
=
{
first
=
0x00250
,
last
=
0x002AF
,
description
=
"
IPA Extensions
"
}
,
355
[
"
javanese
"
]
=
{
first
=
0x0A980
,
last
=
0x0A9DF
,
description
=
"
Javanese
"
}
,
356
[
"
kaithi
"
]
=
{
first
=
0x11080
,
last
=
0x110CF
,
description
=
"
Kaithi
"
}
,
357
[
"
kanaextendeda
"
]
=
{
first
=
0x1B100
,
last
=
0x1B12F
,
description
=
"
Kana Extended-A
"
}
,
358
[
"
kanasupplement
"
]
=
{
first
=
0x1B000
,
last
=
0x1B0FF
,
description
=
"
Kana Supplement
"
}
,
359
[
"
kanbun
"
]
=
{
first
=
0x03190
,
last
=
0x0319F
,
description
=
"
Kanbun
"
}
,
360
[
"
kangxiradicals
"
]
=
{
first
=
0x02F00
,
last
=
0x02FDF
,
description
=
"
Kangxi Radicals
"
}
,
361
[
"
kannada
"
]
=
{
first
=
0x00C80
,
last
=
0x00CFF
,
otf
=
"
knda
"
,
description
=
"
Kannada
"
}
,
362
[
"
katakana
"
]
=
{
first
=
0x030A0
,
last
=
0x030FF
,
otf
=
"
kana
"
,
description
=
"
Katakana
"
}
,
363
[
"
katakanaphoneticextensions
"
]
=
{
first
=
0x031F0
,
last
=
0x031FF
,
otf
=
"
kana
"
,
description
=
"
Katakana Phonetic Extensions
"
}
,
364
[
"
kayahli
"
]
=
{
first
=
0x0A900
,
last
=
0x0A92F
,
description
=
"
Kayah Li
"
}
,
365
[
"
kharoshthi
"
]
=
{
first
=
0x10A00
,
last
=
0x10A5F
,
otf
=
"
khar
"
,
description
=
"
Kharoshthi
"
}
,
366
[
"
khitansmallscript
"
]
=
{
first
=
0x18B00
,
last
=
0x18CFF
,
description
=
"
Khitan Small Script
"
}
,
367
[
"
khmer
"
]
=
{
first
=
0x01780
,
last
=
0x017FF
,
otf
=
"
khmr
"
,
description
=
"
Khmer
"
}
,
368
[
"
khmersymbols
"
]
=
{
first
=
0x019E0
,
last
=
0x019FF
,
otf
=
"
khmr
"
,
description
=
"
Khmer Symbols
"
}
,
369
[
"
khojki
"
]
=
{
first
=
0x11200
,
last
=
0x1124F
,
description
=
"
Khojki
"
}
,
370
[
"
khudawadi
"
]
=
{
first
=
0x112B0
,
last
=
0x112FF
,
description
=
"
Khudawadi
"
}
,
371
[
"
lao
"
]
=
{
first
=
0x00E80
,
last
=
0x00EFF
,
otf
=
"
lao
"
,
description
=
"
Lao
"
}
,
372
[
"
latinextendeda
"
]
=
{
first
=
0x00100
,
last
=
0x0017F
,
otf
=
"
latn
"
,
description
=
"
Latin Extended-A
"
}
,
373
[
"
latinextendedadditional
"
]
=
{
first
=
0x01E00
,
last
=
0x01EFF
,
otf
=
"
latn
"
,
description
=
"
Latin Extended Additional
"
}
,
374
[
"
latinextendedb
"
]
=
{
first
=
0x00180
,
last
=
0x0024F
,
otf
=
"
latn
"
,
description
=
"
Latin Extended-B
"
}
,
375
[
"
latinextendedc
"
]
=
{
first
=
0x02C60
,
last
=
0x02C7F
,
otf
=
"
latn
"
,
description
=
"
Latin Extended-C
"
}
,
376
[
"
latinextendedd
"
]
=
{
first
=
0x0A720
,
last
=
0x0A7FF
,
otf
=
"
latn
"
,
description
=
"
Latin Extended-D
"
}
,
377
[
"
latinextendede
"
]
=
{
first
=
0x0AB30
,
last
=
0x0AB6F
,
description
=
"
Latin Extended-E
"
}
,
378
[
"
latinsupplement
"
]
=
{
first
=
0x00080
,
last
=
0x000FF
,
otf
=
"
latn
"
,
description
=
"
Latin-1 Supplement
"
}
,
379
[
"
lepcha
"
]
=
{
first
=
0x01C00
,
last
=
0x01C4F
,
description
=
"
Lepcha
"
}
,
380
[
"
letterlikesymbols
"
]
=
{
first
=
0x02100
,
last
=
0x0214F
,
math
=
true
,
description
=
"
Letterlike Symbols
"
}
,
381
[
"
limbu
"
]
=
{
first
=
0x01900
,
last
=
0x0194F
,
otf
=
"
limb
"
,
description
=
"
Limbu
"
}
,
382
[
"
lineara
"
]
=
{
first
=
0x10600
,
last
=
0x1077F
,
description
=
"
Linear A
"
}
,
383
[
"
linearbideograms
"
]
=
{
first
=
0x10080
,
last
=
0x100FF
,
otf
=
"
linb
"
,
description
=
"
Linear B Ideograms
"
}
,
384
[
"
linearbsyllabary
"
]
=
{
first
=
0x10000
,
last
=
0x1007F
,
otf
=
"
linb
"
,
description
=
"
Linear B Syllabary
"
}
,
385
[
"
lisu
"
]
=
{
first
=
0x0A4D0
,
last
=
0x0A4FF
,
description
=
"
Lisu
"
}
,
386
[
"
lisusupplement
"
]
=
{
first
=
0x11FB0
,
last
=
0x11FBF
,
description
=
"
Lisu Supplement
"
}
,
387
[
"
lowercasebold
"
]
=
{
first
=
0x1D41A
,
last
=
0x1D433
,
math
=
true
}
,
388
[
"
lowercaseboldfraktur
"
]
=
{
first
=
0x1D586
,
last
=
0x1D59F
,
math
=
true
}
,
389
[
"
lowercasebolditalic
"
]
=
{
first
=
0x1D482
,
last
=
0x1D49B
,
math
=
true
}
,
390
[
"
lowercaseboldscript
"
]
=
{
first
=
0x1D4EA
,
last
=
0x1D503
,
math
=
true
}
,
391
[
"
lowercasedoublestruck
"
]
=
{
first
=
0x1D552
,
last
=
0x1D56B
,
math
=
true
}
,
392
[
"
lowercasefraktur
"
]
=
{
first
=
0x1D51E
,
last
=
0x1D537
,
math
=
true
}
,
393
[
"
lowercasegreekbold
"
]
=
{
first
=
0x1D6C2
,
last
=
0x1D6DB
,
math
=
true
}
,
394
[
"
lowercasegreekbolditalic
"
]
=
{
first
=
0x1D736
,
last
=
0x1D74F
,
math
=
true
}
,
395
[
"
lowercasegreekitalic
"
]
=
{
first
=
0x1D6FC
,
last
=
0x1D715
,
math
=
true
}
,
396
[
"
lowercasegreeknormal
"
]
=
{
first
=
0x003B1
,
last
=
0x003CA
,
math
=
true
}
,
397
[
"
lowercasegreeksansserifbold
"
]
=
{
first
=
0x1D770
,
last
=
0x1D789
,
math
=
true
}
,
398
[
"
lowercasegreeksansserifbolditalic
"
]
=
{
first
=
0x1D7AA
,
last
=
0x1D7C3
,
math
=
true
}
,
399
[
"
lowercaseitalic
"
]
=
{
first
=
0x1D44E
,
last
=
0x1D467
,
math
=
true
}
,
400
[
"
lowercasemonospace
"
]
=
{
first
=
0x1D68A
,
last
=
0x1D6A3
,
math
=
true
}
,
401
[
"
lowercasenormal
"
]
=
{
first
=
0x00061
,
last
=
0x0007A
,
math
=
true
}
,
402
[
"
lowercasesansserifbold
"
]
=
{
first
=
0x1D5EE
,
last
=
0x1D607
,
math
=
true
}
,
403
[
"
lowercasesansserifbolditalic
"
]
=
{
first
=
0x1D656
,
last
=
0x1D66F
,
math
=
true
}
,
404
[
"
lowercasesansserifitalic
"
]
=
{
first
=
0x1D622
,
last
=
0x1D63B
,
math
=
true
}
,
405
[
"
lowercasesansserifnormal
"
]
=
{
first
=
0x1D5BA
,
last
=
0x1D5D3
,
math
=
true
}
,
406
[
"
lowercasescript
"
]
=
{
first
=
0x1D4B6
,
last
=
0x1D4CF
,
math
=
true
}
,
407
[
"
lowsurrogates
"
]
=
{
first
=
0x0DC00
,
last
=
0x0DFFF
,
description
=
"
Low Surrogates
"
}
,
408
[
"
lycian
"
]
=
{
first
=
0x10280
,
last
=
0x1029F
,
description
=
"
Lycian
"
}
,
409
[
"
lydian
"
]
=
{
first
=
0x10920
,
last
=
0x1093F
,
description
=
"
Lydian
"
}
,
410
[
"
mahajani
"
]
=
{
first
=
0x11150
,
last
=
0x1117F
,
description
=
"
Mahajani
"
}
,
411
[
"
mahjongtiles
"
]
=
{
first
=
0x1F000
,
last
=
0x1F02F
,
description
=
"
Mahjong Tiles
"
}
,
412
[
"
makasar
"
]
=
{
first
=
0x11EE0
,
last
=
0x11EFF
,
description
=
"
Makasar
"
}
,
413
[
"
malayalam
"
]
=
{
first
=
0x00D00
,
last
=
0x00D7F
,
otf
=
"
mlym
"
,
description
=
"
Malayalam
"
}
,
414
[
"
mandaic
"
]
=
{
first
=
0x00840
,
last
=
0x0085F
,
otf
=
"
mand
"
,
description
=
"
Mandaic
"
}
,
415
[
"
manichaean
"
]
=
{
first
=
0x10AC0
,
last
=
0x10AFF
,
description
=
"
Manichaean
"
}
,
416
[
"
marchen
"
]
=
{
first
=
0x11C70
,
last
=
0x11CBF
,
description
=
"
Marchen
"
}
,
417
[
"
masaramgondi
"
]
=
{
first
=
0x11D00
,
last
=
0x11D5F
,
description
=
"
Masaram Gondi
"
}
,
418
[
"
mathematicalalphanumericsymbols
"
]
=
{
first
=
0x1D400
,
last
=
0x1D7FF
,
math
=
true
,
description
=
"
Mathematical Alphanumeric Symbols
"
}
,
419
[
"
mathematicaloperators
"
]
=
{
first
=
0x02200
,
last
=
0x022FF
,
math
=
true
,
description
=
"
Mathematical Operators
"
}
,
420
[
"
mayannumerals
"
]
=
{
first
=
0x1D2E0
,
last
=
0x1D2FF
,
description
=
"
Mayan Numerals
"
}
,
421
[
"
medefaidrin
"
]
=
{
first
=
0x16E40
,
last
=
0x16E9F
,
description
=
"
Medefaidrin
"
}
,
422
[
"
meeteimayek
"
]
=
{
first
=
0x0ABC0
,
last
=
0x0ABFF
,
description
=
"
Meetei Mayek
"
}
,
423
[
"
meeteimayekextensions
"
]
=
{
first
=
0x0AAE0
,
last
=
0x0AAFF
,
description
=
"
Meetei Mayek Extensions
"
}
,
424
[
"
mendekikakui
"
]
=
{
first
=
0x1E800
,
last
=
0x1E8DF
,
description
=
"
Mende Kikakui
"
}
,
425
[
"
meroiticcursive
"
]
=
{
first
=
0x109A0
,
last
=
0x109FF
,
description
=
"
Meroitic Cursive
"
}
,
426
[
"
meroitichieroglyphs
"
]
=
{
first
=
0x10980
,
last
=
0x1099F
,
description
=
"
Meroitic Hieroglyphs
"
}
,
427
[
"
miao
"
]
=
{
first
=
0x16F00
,
last
=
0x16F9F
,
description
=
"
Miao
"
}
,
428
[
"
miscellaneousmathematicalsymbolsa
"
]
=
{
first
=
0x027C0
,
last
=
0x027EF
,
math
=
true
,
description
=
"
Miscellaneous Mathematical Symbols-A
"
}
,
429
[
"
miscellaneousmathematicalsymbolsb
"
]
=
{
first
=
0x02980
,
last
=
0x029FF
,
math
=
true
,
description
=
"
Miscellaneous Mathematical Symbols-B
"
}
,
430
[
"
miscellaneoussymbols
"
]
=
{
first
=
0x02600
,
last
=
0x026FF
,
math
=
true
,
description
=
"
Miscellaneous Symbols
"
}
,
431
[
"
miscellaneoussymbolsandarrows
"
]
=
{
first
=
0x02B00
,
last
=
0x02BFF
,
math
=
true
,
description
=
"
Miscellaneous Symbols and Arrows
"
}
,
432
[
"
miscellaneoussymbolsandpictographs
"
]
=
{
first
=
0x1F300
,
last
=
0x1F5FF
,
description
=
"
Miscellaneous Symbols and Pictographs
"
}
,
433
[
"
miscellaneoustechnical
"
]
=
{
first
=
0x02300
,
last
=
0x023FF
,
math
=
true
,
description
=
"
Miscellaneous Technical
"
}
,
434
[
"
modi
"
]
=
{
first
=
0x11600
,
last
=
0x1165F
,
description
=
"
Modi
"
}
,
435
[
"
modifiertoneletters
"
]
=
{
first
=
0x0A700
,
last
=
0x0A71F
,
description
=
"
Modifier Tone Letters
"
}
,
436
[
"
mongolian
"
]
=
{
first
=
0x01800
,
last
=
0x018AF
,
otf
=
"
mong
"
,
description
=
"
Mongolian
"
}
,
437
[
"
mongoliansupplement
"
]
=
{
first
=
0x11660
,
last
=
0x1167F
,
description
=
"
Mongolian Supplement
"
}
,
438
[
"
mro
"
]
=
{
first
=
0x16A40
,
last
=
0x16A6F
,
description
=
"
Mro
"
}
,
439
[
"
multani
"
]
=
{
first
=
0x11280
,
last
=
0x112AF
,
description
=
"
Multani
"
}
,
440
[
"
musicalsymbols
"
]
=
{
first
=
0x1D100
,
last
=
0x1D1FF
,
otf
=
"
musc
"
,
description
=
"
Musical Symbols
"
}
,
441
[
"
myanmar
"
]
=
{
first
=
0x01000
,
last
=
0x0109F
,
otf
=
"
mymr
"
,
description
=
"
Myanmar
"
}
,
442
[
"
myanmarextendeda
"
]
=
{
first
=
0x0AA60
,
last
=
0x0AA7F
,
description
=
"
Myanmar Extended-A
"
}
,
443
[
"
myanmarextendedb
"
]
=
{
first
=
0x0A9E0
,
last
=
0x0A9FF
,
description
=
"
Myanmar Extended-B
"
}
,
444
[
"
nabataean
"
]
=
{
first
=
0x10880
,
last
=
0x108AF
,
description
=
"
Nabataean
"
}
,
445
[
"
nandinagari
"
]
=
{
first
=
0x119A0
,
last
=
0x119FF
,
description
=
"
Nandinagari
"
}
,
446
[
"
newa
"
]
=
{
first
=
0x11400
,
last
=
0x1147F
,
description
=
"
Newa
"
}
,
447
[
"
newtailue
"
]
=
{
first
=
0x01980
,
last
=
0x019DF
,
description
=
"
New Tai Lue
"
}
,
448
[
"
nko
"
]
=
{
first
=
0x007C0
,
last
=
0x007FF
,
otf
=
"
nko
"
,
description
=
"
NKo
"
}
,
449
[
"
numberforms
"
]
=
{
first
=
0x02150
,
last
=
0x0218F
,
description
=
"
Number Forms
"
}
,
450
[
"
nushu
"
]
=
{
first
=
0x1B170
,
last
=
0x1B2FF
,
description
=
"
Nushu
"
}
,
451
[
"
nyiakengpuachuehmong
"
]
=
{
first
=
0x1E100
,
last
=
0x1E14F
,
description
=
"
Nyiakeng Puachue Hmong
"
}
,
452
[
"
ogham
"
]
=
{
first
=
0x01680
,
last
=
0x0169F
,
otf
=
"
ogam
"
,
description
=
"
Ogham
"
}
,
453
[
"
olchiki
"
]
=
{
first
=
0x01C50
,
last
=
0x01C7F
,
description
=
"
Ol Chiki
"
}
,
454
[
"
oldhungarian
"
]
=
{
first
=
0x10C80
,
last
=
0x10CFF
,
description
=
"
Old Hungarian
"
}
,
455
[
"
olditalic
"
]
=
{
first
=
0x10300
,
last
=
0x1032F
,
otf
=
"
ital
"
,
description
=
"
Old Italic
"
}
,
456
[
"
oldnortharabian
"
]
=
{
first
=
0x10A80
,
last
=
0x10A9F
,
description
=
"
Old North Arabian
"
}
,
457
[
"
oldpermic
"
]
=
{
first
=
0x10350
,
last
=
0x1037F
,
description
=
"
Old Permic
"
}
,
458
[
"
oldpersian
"
]
=
{
first
=
0x103A0
,
last
=
0x103DF
,
otf
=
"
xpeo
"
,
description
=
"
Old Persian
"
}
,
459
[
"
oldsogdian
"
]
=
{
first
=
0x10F00
,
last
=
0x10F2F
,
description
=
"
Old Sogdian
"
}
,
460
[
"
oldsoutharabian
"
]
=
{
first
=
0x10A60
,
last
=
0x10A7F
,
description
=
"
Old South Arabian
"
}
,
461
[
"
oldturkic
"
]
=
{
first
=
0x10C00
,
last
=
0x10C4F
,
description
=
"
Old Turkic
"
}
,
462
[
"
opticalcharacterrecognition
"
]
=
{
first
=
0x02440
,
last
=
0x0245F
,
description
=
"
Optical Character Recognition
"
}
,
463
[
"
oriya
"
]
=
{
first
=
0x00B00
,
last
=
0x00B7F
,
otf
=
"
orya
"
,
description
=
"
Oriya
"
}
,
464
[
"
ornamentaldingbats
"
]
=
{
first
=
0x1F650
,
last
=
0x1F67F
,
description
=
"
Ornamental Dingbats
"
}
,
465
[
"
osage
"
]
=
{
first
=
0x104B0
,
last
=
0x104FF
,
description
=
"
Osage
"
}
,
466
[
"
osmanya
"
]
=
{
first
=
0x10480
,
last
=
0x104AF
,
otf
=
"
osma
"
,
description
=
"
Osmanya
"
}
,
467
[
"
ottomansiyaqnumbers
"
]
=
{
first
=
0x1ED00
,
last
=
0x1ED4F
,
description
=
"
Ottoman Siyaq Numbers
"
}
,
468
[
"
pahawhhmong
"
]
=
{
first
=
0x16B00
,
last
=
0x16B8F
,
description
=
"
Pahawh Hmong
"
}
,
469
[
"
palmyrene
"
]
=
{
first
=
0x10860
,
last
=
0x1087F
,
description
=
"
Palmyrene
"
}
,
470
[
"
paucinhau
"
]
=
{
first
=
0x11AC0
,
last
=
0x11AFF
,
description
=
"
Pau Cin Hau
"
}
,
471
[
"
phagspa
"
]
=
{
first
=
0x0A840
,
last
=
0x0A87F
,
otf
=
"
phag
"
,
description
=
"
Phags-pa
"
}
,
472
[
"
phaistosdisc
"
]
=
{
first
=
0x101D0
,
last
=
0x101FF
,
description
=
"
Phaistos Disc
"
}
,
473
[
"
phoenician
"
]
=
{
first
=
0x10900
,
last
=
0x1091F
,
otf
=
"
phnx
"
,
description
=
"
Phoenician
"
}
,
474
[
"
phoneticextensions
"
]
=
{
first
=
0x01D00
,
last
=
0x01D7F
,
description
=
"
Phonetic Extensions
"
}
,
475
[
"
phoneticextensionssupplement
"
]
=
{
first
=
0x01D80
,
last
=
0x01DBF
,
description
=
"
Phonetic Extensions Supplement
"
}
,
476
[
"
playingcards
"
]
=
{
first
=
0x1F0A0
,
last
=
0x1F0FF
,
description
=
"
Playing Cards
"
}
,
477
[
"
privateusearea
"
]
=
{
first
=
0x0E000
,
last
=
0x0F8FF
,
description
=
"
Private Use Area
"
}
,
478
[
"
psalterpahlavi
"
]
=
{
first
=
0x10B80
,
last
=
0x10BAF
,
description
=
"
Psalter Pahlavi
"
}
,
479
[
"
rejang
"
]
=
{
first
=
0x0A930
,
last
=
0x0A95F
,
description
=
"
Rejang
"
}
,
480
[
"
ruminumeralsymbols
"
]
=
{
first
=
0x10E60
,
last
=
0x10E7F
,
description
=
"
Rumi Numeral Symbols
"
}
,
481
[
"
runic
"
]
=
{
first
=
0x016A0
,
last
=
0x016FF
,
otf
=
"
runr
"
,
description
=
"
Runic
"
}
,
482
[
"
samaritan
"
]
=
{
first
=
0x00800
,
last
=
0x0083F
,
description
=
"
Samaritan
"
}
,
483
[
"
saurashtra
"
]
=
{
first
=
0x0A880
,
last
=
0x0A8DF
,
description
=
"
Saurashtra
"
}
,
484
[
"
sharada
"
]
=
{
first
=
0x11180
,
last
=
0x111DF
,
description
=
"
Sharada
"
}
,
485
[
"
shavian
"
]
=
{
first
=
0x10450
,
last
=
0x1047F
,
otf
=
"
shaw
"
,
description
=
"
Shavian
"
}
,
486
[
"
shorthandformatcontrols
"
]
=
{
first
=
0x1BCA0
,
last
=
0x1BCAF
,
description
=
"
Shorthand Format Controls
"
}
,
487
[
"
siddham
"
]
=
{
first
=
0x11580
,
last
=
0x115FF
,
description
=
"
Siddham
"
}
,
488
[
"
sinhala
"
]
=
{
first
=
0x00D80
,
last
=
0x00DFF
,
otf
=
"
sinh
"
,
description
=
"
Sinhala
"
}
,
489
[
"
sinhalaarchaicnumbers
"
]
=
{
first
=
0x111E0
,
last
=
0x111FF
,
description
=
"
Sinhala Archaic Numbers
"
}
,
490
[
"
smallformvariants
"
]
=
{
first
=
0x0FE50
,
last
=
0x0FE6F
,
description
=
"
Small Form Variants
"
}
,
491
[
"
smallkanaextension
"
]
=
{
first
=
0x1B130
,
last
=
0x1B16F
,
description
=
"
Small Kana Extension
"
}
,
492
[
"
sogdian
"
]
=
{
first
=
0x10F30
,
last
=
0x10F6F
,
description
=
"
Sogdian
"
}
,
493
[
"
sorasompeng
"
]
=
{
first
=
0x110D0
,
last
=
0x110FF
,
description
=
"
Sora Sompeng
"
}
,
494
[
"
soyombo
"
]
=
{
first
=
0x11A50
,
last
=
0x11AAF
,
description
=
"
Soyombo
"
}
,
495
[
"
spacingmodifierletters
"
]
=
{
first
=
0x002B0
,
last
=
0x002FF
,
description
=
"
Spacing Modifier Letters
"
}
,
496
[
"
specials
"
]
=
{
first
=
0x0FFF0
,
last
=
0x0FFFF
,
description
=
"
Specials
"
}
,
497
[
"
sundanese
"
]
=
{
first
=
0x01B80
,
last
=
0x01BBF
,
description
=
"
Sundanese
"
}
,
498
[
"
sundanesesupplement
"
]
=
{
first
=
0x01CC0
,
last
=
0x01CCF
,
description
=
"
Sundanese Supplement
"
}
,
499
[
"
superscriptsandsubscripts
"
]
=
{
first
=
0x02070
,
last
=
0x0209F
,
description
=
"
Superscripts and Subscripts
"
}
,
500
[
"
supplementalarrowsa
"
]
=
{
first
=
0x027F0
,
last
=
0x027FF
,
math
=
true
,
description
=
"
Supplemental Arrows-A
"
}
,
501
[
"
supplementalarrowsb
"
]
=
{
first
=
0x02900
,
last
=
0x0297F
,
math
=
true
,
description
=
"
Supplemental Arrows-B
"
}
,
502
[
"
supplementalarrowsc
"
]
=
{
first
=
0x1F800
,
last
=
0x1F8FF
,
math
=
true
,
description
=
"
Supplemental Arrows-C
"
}
,
503
[
"
supplementalmathematicaloperators
"
]
=
{
first
=
0x02A00
,
last
=
0x02AFF
,
math
=
true
,
description
=
"
Supplemental Mathematical Operators
"
}
,
504
[
"
supplementalpunctuation
"
]
=
{
first
=
0x02E00
,
last
=
0x02E7F
,
description
=
"
Supplemental Punctuation
"
}
,
505
[
"
supplementalsymbolsandpictographs
"
]
=
{
first
=
0x1F900
,
last
=
0x1F9FF
,
description
=
"
Supplemental Symbols and Pictographs
"
}
,
506
[
"
supplementaryprivateuseareaa
"
]
=
{
first
=
0xF0000
,
last
=
0xFFFFF
,
description
=
"
Supplementary Private Use Area-A
"
}
,
507
[
"
supplementaryprivateuseareab
"
]
=
{
first
=
0x100000
,
last
=
0x10FFFF
,
description
=
"
Supplementary Private Use Area-B
"
}
,
508
[
"
suttonsignwriting
"
]
=
{
first
=
0x1D800
,
last
=
0x1DAAF
,
description
=
"
Sutton SignWriting
"
}
,
509
[
"
sylotinagri
"
]
=
{
first
=
0x0A800
,
last
=
0x0A82F
,
otf
=
"
sylo
"
,
description
=
"
Syloti Nagri
"
}
,
510
[
"
symbolsandpictographsextendeda
"
]
=
{
first
=
0x1FA70
,
last
=
0x1FAFF
,
description
=
"
Symbols and Pictographs Extended-A
"
}
,
511
[
"
symbolsforlegacycomputing
"
]
=
{
first
=
0x1FB00
,
last
=
0x1FBFF
,
description
=
"
Symbols for Legacy Computing
"
}
,
512
[
"
syriac
"
]
=
{
first
=
0x00700
,
last
=
0x0074F
,
otf
=
"
syrc
"
,
description
=
"
Syriac
"
}
,
513
[
"
syriacsupplement
"
]
=
{
first
=
0x00860
,
last
=
0x0086F
,
description
=
"
Syriac Supplement
"
}
,
514
[
"
tagalog
"
]
=
{
first
=
0x01700
,
last
=
0x0171F
,
otf
=
"
tglg
"
,
description
=
"
Tagalog
"
}
,
515
[
"
tagbanwa
"
]
=
{
first
=
0x01760
,
last
=
0x0177F
,
otf
=
"
tagb
"
,
description
=
"
Tagbanwa
"
}
,
516
[
"
tags
"
]
=
{
first
=
0xE0000
,
last
=
0xE007F
,
description
=
"
Tags
"
}
,
517
[
"
taile
"
]
=
{
first
=
0x01950
,
last
=
0x0197F
,
otf
=
"
tale
"
,
description
=
"
Tai Le
"
}
,
518
[
"
taitham
"
]
=
{
first
=
0x01A20
,
last
=
0x01AAF
,
description
=
"
Tai Tham
"
}
,
519
[
"
taiviet
"
]
=
{
first
=
0x0AA80
,
last
=
0x0AADF
,
description
=
"
Tai Viet
"
}
,
520
[
"
taixuanjingsymbols
"
]
=
{
first
=
0x1D300
,
last
=
0x1D35F
,
description
=
"
Tai Xuan Jing Symbols
"
}
,
521
[
"
takri
"
]
=
{
first
=
0x11680
,
last
=
0x116CF
,
description
=
"
Takri
"
}
,
522
[
"
tamil
"
]
=
{
first
=
0x00B80
,
last
=
0x00BFF
,
otf
=
"
taml
"
,
description
=
"
Tamil
"
}
,
523
[
"
tamilsupplement
"
]
=
{
first
=
0x11FC0
,
last
=
0x11FFF
,
description
=
"
Tamil Supplement
"
}
,
524
[
"
tangut
"
]
=
{
first
=
0x17000
,
last
=
0x187FF
,
description
=
"
Tangut
"
}
,
525
[
"
tangutsupplement
"
]
=
{
first
=
0x18D00
,
last
=
0x18D8F
,
description
=
"
Tangut Supplement
"
}
,
526
[
"
tangutcomponents
"
]
=
{
first
=
0x18800
,
last
=
0x18AFF
,
description
=
"
Tangut Components
"
}
,
527
[
"
telugu
"
]
=
{
first
=
0x00C00
,
last
=
0x00C7F
,
otf
=
"
telu
"
,
description
=
"
Telugu
"
}
,
528
[
"
thaana
"
]
=
{
first
=
0x00780
,
last
=
0x007BF
,
otf
=
"
thaa
"
,
description
=
"
Thaana
"
}
,
529
[
"
thai
"
]
=
{
first
=
0x00E00
,
last
=
0x00E7F
,
otf
=
"
thai
"
,
description
=
"
Thai
"
}
,
530
[
"
tibetan
"
]
=
{
first
=
0x00F00
,
last
=
0x00FFF
,
otf
=
"
tibt
"
,
description
=
"
Tibetan
"
}
,
531
[
"
tifinagh
"
]
=
{
first
=
0x02D30
,
last
=
0x02D7F
,
otf
=
"
tfng
"
,
description
=
"
Tifinagh
"
}
,
532
[
"
tirhuta
"
]
=
{
first
=
0x11480
,
last
=
0x114DF
,
description
=
"
Tirhuta
"
}
,
533
[
"
transportandmapsymbols
"
]
=
{
first
=
0x1F680
,
last
=
0x1F6FF
,
description
=
"
Transport and Map Symbols
"
}
,
534
[
"
ugaritic
"
]
=
{
first
=
0x10380
,
last
=
0x1039F
,
otf
=
"
ugar
"
,
description
=
"
Ugaritic
"
}
,
535
[
"
unifiedcanadianaboriginalsyllabics
"
]
=
{
first
=
0x01400
,
last
=
0x0167F
,
otf
=
"
cans
"
,
description
=
"
Unified Canadian Aboriginal Syllabics
"
}
,
536
[
"
unifiedcanadianaboriginalsyllabicsextended
"
]
=
{
first
=
0x018B0
,
last
=
0x018FF
,
description
=
"
Unified Canadian Aboriginal Syllabics Extended
"
}
,
537
[
"
uppercasebold
"
]
=
{
first
=
0x1D400
,
last
=
0x1D419
,
math
=
true
}
,
538
[
"
uppercaseboldfraktur
"
]
=
{
first
=
0x1D56C
,
last
=
0x1D585
,
math
=
true
}
,
539
[
"
uppercasebolditalic
"
]
=
{
first
=
0x1D468
,
last
=
0x1D481
,
math
=
true
}
,
540
[
"
uppercaseboldscript
"
]
=
{
first
=
0x1D4D0
,
last
=
0x1D4E9
,
math
=
true
}
,
541
[
"
uppercasedoublestruck
"
]
=
{
first
=
0x1D538
,
last
=
0x1D551
,
math
=
true
}
,
-- gaps are filled in elsewhere
542
[
"
uppercasefraktur
"
]
=
{
first
=
0x1D504
,
last
=
0x1D51D
,
math
=
true
}
,
543
[
"
uppercasegreekbold
"
]
=
{
first
=
0x1D6A8
,
last
=
0x1D6C1
,
math
=
true
}
,
544
[
"
uppercasegreekbolditalic
"
]
=
{
first
=
0x1D71C
,
last
=
0x1D735
,
math
=
true
}
,
545
[
"
uppercasegreekitalic
"
]
=
{
first
=
0x1D6E2
,
last
=
0x1D6FB
,
math
=
true
}
,
546
[
"
uppercasegreeknormal
"
]
=
{
first
=
0x00391
,
last
=
0x003AA
,
math
=
true
}
,
547
[
"
uppercasegreeksansserifbold
"
]
=
{
first
=
0x1D756
,
last
=
0x1D76F
,
math
=
true
}
,
548
[
"
uppercasegreeksansserifbolditalic
"
]
=
{
first
=
0x1D790
,
last
=
0x1D7A9
,
math
=
true
}
,
549
[
"
uppercaseitalic
"
]
=
{
first
=
0x1D434
,
last
=
0x1D44D
,
math
=
true
}
,
550
[
"
uppercasemonospace
"
]
=
{
first
=
0x1D670
,
last
=
0x1D689
,
math
=
true
}
,
551
[
"
uppercasenormal
"
]
=
{
first
=
0x00041
,
last
=
0x0005A
,
math
=
true
}
,
552
[
"
uppercasesansserifbold
"
]
=
{
first
=
0x1D5D4
,
last
=
0x1D5ED
,
math
=
true
}
,
553
[
"
uppercasesansserifbolditalic
"
]
=
{
first
=
0x1D63C
,
last
=
0x1D655
,
math
=
true
}
,
554
[
"
uppercasesansserifitalic
"
]
=
{
first
=
0x1D608
,
last
=
0x1D621
,
math
=
true
}
,
555
[
"
uppercasesansserifnormal
"
]
=
{
first
=
0x1D5A0
,
last
=
0x1D5B9
,
math
=
true
}
,
556
[
"
uppercasescript
"
]
=
{
first
=
0x1D49C
,
last
=
0x1D4B5
,
math
=
true
}
,
557
[
"
vai
"
]
=
{
first
=
0x0A500
,
last
=
0x0A63F
,
description
=
"
Vai
"
}
,
558
[
"
variationselectors
"
]
=
{
first
=
0x0FE00
,
last
=
0x0FE0F
,
description
=
"
Variation Selectors
"
}
,
559
[
"
variationselectorssupplement
"
]
=
{
first
=
0xE0100
,
last
=
0xE01EF
,
description
=
"
Variation Selectors Supplement
"
}
,
560
[
"
vedicextensions
"
]
=
{
first
=
0x01CD0
,
last
=
0x01CFF
,
description
=
"
Vedic Extensions
"
}
,
561
[
"
verticalforms
"
]
=
{
first
=
0x0FE10
,
last
=
0x0FE1F
,
description
=
"
Vertical Forms
"
}
,
562
[
"
wancho
"
]
=
{
first
=
0x1E2C0
,
last
=
0x1E2FF
,
description
=
"
Wancho
"
}
,
563
[
"
warangciti
"
]
=
{
first
=
0x118A0
,
last
=
0x118FF
,
description
=
"
Warang Citi
"
}
,
564
[
"
yezidi
"
]
=
{
first
=
0x10E80
,
last
=
0x10EBF
,
description
=
"
Yezidi
"
}
,
565
[
"
yijinghexagramsymbols
"
]
=
{
first
=
0x04DC0
,
last
=
0x04DFF
,
otf
=
"
yi
"
,
description
=
"
Yijing Hexagram Symbols
"
}
,
566
[
"
yiradicals
"
]
=
{
first
=
0x0A490
,
last
=
0x0A4CF
,
otf
=
"
yi
"
,
description
=
"
Yi Radicals
"
}
,
567
[
"
yisyllables
"
]
=
{
first
=
0x0A000
,
last
=
0x0A48F
,
otf
=
"
yi
"
,
description
=
"
Yi Syllables
"
}
,
568
[
"
zanabazarsquare
"
]
=
{
first
=
0x11A00
,
last
=
0x11A4F
,
description
=
"
Zanabazar Square
"
}
,
569
}
570 571
-- moved from math-act.lua to here:
572 573
-- operators : 0x02200
574
-- symbolsa : 0x02701
575
-- symbolsb : 0x02901
576
-- supplemental : 0x02A00
577 578
-- Gaps: the mathematical alphanumeric blocks (U+1D400..) are not contiguous
-- because some letters were already encoded in the letterlike symbols range;
-- each entry maps the would-be slot in the block to the real codepoint.

blocks.lowercaseitalic.gaps = {
    [0x1D455] = 0x0210E, -- ℎ h
}

blocks.uppercasescript.gaps = {
    [0x1D49D] = 0x0212C, -- ℬ script B
    [0x1D4A0] = 0x02130, -- ℰ script E
    [0x1D4A1] = 0x02131, -- ℱ script F
    [0x1D4A3] = 0x0210B, -- ℋ script H
    [0x1D4A4] = 0x02110, -- ℐ script I
    [0x1D4A7] = 0x02112, -- ℒ script L
    [0x1D4A8] = 0x02133, -- ℳ script M
    [0x1D4AD] = 0x0211B, -- ℛ script R
}

blocks.lowercasescript.gaps = {
    [0x1D4BA] = 0x0212F, -- ℯ script e
    [0x1D4BC] = 0x0210A, -- ℊ script g
    [0x1D4C4] = 0x02134, -- ℴ script o
}

blocks.uppercasefraktur.gaps = {
    [0x1D506] = 0x0212D, -- ℭ fraktur C
    [0x1D50B] = 0x0210C, -- ℌ fraktur H
    [0x1D50C] = 0x02111, -- ℑ fraktur I
    [0x1D515] = 0x0211C, -- ℜ fraktur R
    [0x1D51D] = 0x02128, -- ℨ fraktur Z
}

blocks.uppercasedoublestruck.gaps = {
    [0x1D53A] = 0x02102, -- ℂ bb C
    [0x1D53F] = 0x0210D, -- ℍ bb H
    [0x1D545] = 0x02115, -- ℕ bb N
    [0x1D547] = 0x02119, -- ℙ bb P
    [0x1D548] = 0x0211A, -- ℚ bb Q
    [0x1D549] = 0x0211D, -- ℝ bb R
    [0x1D551] = 0x02124, -- ℤ bb Z
}

-- export the (file-local) block table
characters.blocks = blocks
618 619
-- Return the first and last codepoint of the named block; 0, 0 when the
-- block is unknown.
function characters.blockrange(name)
    local range = blocks[name]
    if not range then
        return 0, 0
    end
    return range.first, range.last
end
627 628
-- Lazy lookup on the block table: normalize the key ("Old Italic" ->
-- "olditalic") and retry without retriggering this metamethod.  We could use
-- an intermediate cache table if this gets called often.
setmetatableindex(blocks, function(t, k)
    if not k then
        return k
    end
    local normalized = lower(gsub(k, "[^a-zA-Z]", ""))
    return rawget(t, normalized)
end)
631 632
-- Map codepoint -> opentype script tag, filled lazily (see the metatable
-- below).  Use the file-local alias of utilities.storage.allocate for
-- consistency with the rest of this module (and to avoid two table lookups).
local otfscripts      = allocate()
characters.otfscripts = otfscripts
634 635
-- Lazy fill of the codepoint -> script map: find the block that contains the
-- codepoint and cache its tag for the whole range, so later lookups in that
-- range are plain table hits.
setmetatableindex(otfscripts, function(t, unicode)
    for name, block in next, blocks do
        local first, last = block.first, block.last
        if unicode >= first and unicode <= last then
            local script = block.otf or "dflt"
            for codepoint = first, last do
                t[codepoint] = script
            end
            return script
        end
    end
    -- no block matched; cache the default (pretty slow when we end up here)
    t[unicode] = "dflt"
    return "dflt"
end)
651 652
local splitter1 = lpeg.splitat(S(":-"))       -- "start:stop" / "start-stop" ranges
local splitter2 = lpeg.splitat(S("+-"), true) -- "name+offset"; true keeps the sign in the remainder
654 655
-- Resolve a block specification to a character range; used in font fallback
-- definitions.  Accepts a block name, a "name+offset" / "name-offset"
-- expression (when expression is true), a "start:stop" or "start-stop"
-- range, or a single number (hex or decimal; tex '"' hex notation allowed).
--
-- Returns first, last and, only for a named block, also the description and
-- the gaps table.  Returns nil, nil when nothing parses.
--
-- Fix: the shadowed, never-used outer "local start, stop" declaration was
-- removed (a second declaration below did the real work).
function characters.getrange(name, expression)
    -- direct lookup by normalized block name
    local range = rawget(blocks, lower(gsub(name, "[^a-zA-Z0-9]", "")))
    if range then
        return range.first, range.last, range.description, range.gaps
    end
    name = gsub(name, '"', "0x") -- goodie: tex hex notation
    if expression then
        local n = tonumber(name)
        if n then
            return n, n, nil
        else
            -- "blockname+123": shift a known block by a numeric offset
            local first, rest = lpegmatch(splitter2, name)
            local range = rawget(blocks, lower(gsub(first, "[^a-zA-Z0-9]", "")))
            if range then
                -- splitter2 keeps the sign, so "+123" becomes "return 0+123"
                local s = loadstring("return 0" .. rest)
                if type(s) == "function" then
                    local d = s()
                    if type(d) == "number" then
                        return range.first + d, range.last + d, nil
                    end
                end
            end
        end
    end
    -- "start:stop" or "start-stop", hex preferred over decimal
    local start, stop = lpegmatch(splitter1, name)
    if start and stop then
        start = tonumber(start, 16) or tonumber(start)
        stop  = tonumber(stop, 16)  or tonumber(stop)
        if start and stop then
            return start, stop, nil
        end
    end
    -- single slot
    local slot = tonumber(name, 16) or tonumber(name)
    return slot, slot, nil
end
691 692
-- print(characters.getrange("lowercaseitalic + 123",true))
693
-- print(characters.getrange("lowercaseitalic + 124",true))
694 695
-- Verbose names for the two-letter unicode general category codes.
local categorytags = allocate {
    lu = "Letter Uppercase",
    ll = "Letter Lowercase",
    lt = "Letter Titlecase",
    lm = "Letter Modifier",
    lo = "Letter Other",
    mn = "Mark Nonspacing",
    mc = "Mark Spacing Combining",
    me = "Mark Enclosing",
    nd = "Number Decimal Digit",
    nl = "Number Letter",
    no = "Number Other",
    pc = "Punctuation Connector",
    pd = "Punctuation Dash",
    ps = "Punctuation Open",
    pe = "Punctuation Close",
    pi = "Punctuation Initial Quote",
    pf = "Punctuation Final Quote",
    po = "Punctuation Other",
    sm = "Symbol Math",
    sc = "Symbol Currency",
    sk = "Symbol Modifier",
    so = "Symbol Other",
    zs = "Separator Space",
    zl = "Separator Line",
    zp = "Separator Paragraph",
    cc = "Other Control",
    cf = "Other Format",
    cs = "Other Surrogate",
    co = "Other Private Use",
    cn = "Other Not Assigned",
}
727 728
-- Verbose names for context-specific detail tags (e.g. kana classification).
local detailtags = allocate {
    sl = "small letter",
    bl = "big letter",
    im = "iteration mark",
    pm = "prolonged sound mark"
}

characters.categorytags = categorytags
characters.detailtags   = detailtags
737 738
-- sounds : voiced unvoiced semivoiced
739 740
--~ special : cf (softhyphen) zs (emspace)
741
--~ characters: ll lm lo lt lu mn nl no pc pd pe pf pi po ps sc sk sm so
742 743
-- Category-code membership sets.  tohash turns the lists into sets, so the
-- duplicated "nl", "no" entries that used to follow "mn" in is_character were
-- redundant and have been removed (no behavioral change).

local is_character = allocate(tohash {
    "lu", "ll", "lt", "lm", "lo",
    "nd", "nl", "no",
    "mn",
    "pc", "pd", "ps", "pe", "pi", "pf", "po",
    "sm", "sc", "sk", "so"
})

local is_letter = allocate(tohash {
    "ll", "lm", "lo", "lt", "lu"
})

local is_command = allocate(tohash {
    "cf", "zs"
})

local is_spacing = allocate(tohash {
    "zs", "zl", "zp",
})

local is_mark = allocate(tohash {
    "mn", "ms", -- NOTE(review): "ms" is not a unicode general category (mn/mc/me are) -- confirm intent
})

local is_punctuation = allocate(tohash {
    "pc", "pd", "ps", "pe", "pi", "pf", "po",
})

local is_symbol = allocate(tohash {
    "sm", "sc", "sk", "so",
})

-- to be redone: store checked characters

characters.is_character   = is_character
characters.is_letter      = is_letter
characters.is_command     = is_command
characters.is_spacing     = is_spacing
characters.is_mark        = is_mark
characters.is_punctuation = is_punctuation
characters.is_symbol      = is_symbol
785 786
-- Metatable index for the is_* sets: numeric keys (codepoints) are resolved
-- through their general category, so is_letter[0x41] works like
-- is_letter["lu"].  Non-numeric keys fall through, which avoids auto
-- conversion in data.characters lookups.
local function mti(t, k)
    if type(k) ~= "number" then
        return nil
    end
    local category = data[k].category
    if category then
        return rawget(t, category)
    end
end
794 795
-- Hook the category-based lookup into the membership sets (is_mark and
-- is_symbol were not hooked in the original either).
for _, set in ipairs {
    characters.is_character,
    characters.is_letter,
    characters.is_command,
    characters.is_spacing,
    characters.is_punctuation,
} do
    setmetatableindex(set, mti)
end
800 801
-- todo: also define callers for the above
802 803
-- linebreak: todo: hash
804
--
805
-- normative : BK CR LF CM SG GL CB SP ZW NL WJ JL JV JT H2 H3
806
-- informative : XX OP CL CP QU NS EX SY IS PR PO NU AL ID IN HY BB BA SA AI B2 HL CJ RI
807
--
808
-- comments taken from standard:
809 810
-- UAX #14 line breaking classes; descriptions paraphrase the standard.
characters.linebreaks = allocate {

    -- non-tailorable line breaking classes

    ["bk"]  = "mandatory break",         -- nl, ps : break after
    ["cr"]  = "carriage return",         -- break after, except between cr and lf
    ["lf"]  = "line feed",               -- break after
    ["cm"]  = "combining mark",          -- no break between mark and preceding character
    ["nl"]  = "next line",               -- nel : break after
    ["sg"]  = "surrogate",               -- do not occur in well-formed text
    ["wj"]  = "word joiner",             -- no break before or after
    ["zw"]  = "zero width space",        -- provides a break opportunity
    ["gl"]  = "non-breaking (glue)",     -- cgj, nbsp, zwnbsp : no break before or after
    ["sp"]  = "space",                   -- enables indirect breaks
    ["zwj"] = "zero width joiner",       -- no break within joiner sequences

    -- break opportunities

    ["b2"] = "break opportunity before and after", -- em dash
    ["ba"] = "break after",                        -- spaces, hyphens
    ["bb"] = "break before",                       -- dictionary punctuation
    ["hy"] = "hyphen",                             -- hyphen-minus; break after except in numbers
    ["cb"] = "contingent break opportunity",       -- inline objects

    -- characters prohibiting certain breaks

    ["cl"] = "close punctuation",          -- no break before
    ["cp"] = "close parenthesis",          -- ")", "]" : no break before
    ["ex"] = "exclamation/interrogation",  -- "!", "?" : no break before
    ["in"] = "inseparable",                -- leaders : only indirect breaks between pairs
    ["ns"] = "nonstarter",                 -- only indirect breaks before
    ["op"] = "open punctuation",           -- "(", "[", "{" : no break after
    ["qu"] = "quotation",                  -- acts as both opening and closing

    -- numeric context

    ["is"] = "infix numeric separator",        -- ".", "," : no break after / before numeric
    ["nu"] = "numeric",                        -- digits
    ["po"] = "postfix numeric",                -- %, ¢ : no break after a numeric expression
    ["pr"] = "prefix numeric",                 -- $, £, ¥ : no break before a numeric expression
    ["sy"] = "symbols allowing break after",   -- "/" : no break before, break allowed after

    -- other characters

    ["ai"] = "ambiguous (alphabetic or ideographic)",        -- acts as al or id depending on resolved width
    ["al"] = "alphabetic",                                   -- letters and regular symbols
    ["cj"] = "conditional japanese starter",                 -- small kana : ns or id (strict vs normal)
    ["eb"] = "emoji base",                                   -- no break from following emoji modifier
    ["em"] = "emoji modifier",                               -- no break from preceding emoji base
    ["h2"] = "hangul lv syllable",                           -- forms korean syllable blocks
    ["h3"] = "hangul lvt syllable",                          -- forms korean syllable blocks
    ["hl"] = "hebrew letter",                                -- no break around a following hyphen
    ["id"] = "ideographic",                                  -- break before or after, except numeric context
    ["jl"] = "hangul l jamo",                                -- forms korean syllable blocks
    ["jv"] = "hangul v jamo",                                -- forms korean syllable blocks
    ["jt"] = "hangul t jamo",                                -- forms korean syllable blocks
    ["ri"] = "regional indicator",                           -- pairs kept together
    ["sa"] = "complex context dependent (south east asian)", -- thai, lao, khmer : language-specific analysis
    ["xx"] = "unknown",                                      -- unassigned / private-use

}
871 872
-- east asian width:
873
--
874
-- N A H W F Na
875 876
-- UAX #9 bidirectional character types.
characters.bidi = allocate {
    l   = "Left-to-Right",
    lre = "Left-to-Right Embedding",
    lro = "Left-to-Right Override",
    r   = "Right-to-Left",
    al  = "Right-to-Left Arabic",
    rle = "Right-to-Left Embedding",
    rlo = "Right-to-Left Override",
    pdf = "Pop Directional Format",
    en  = "European Number",
    es  = "European Number Separator",
    et  = "European Number Terminator",
    an  = "Arabic Number",
    cs  = "Common Number Separator",
    nsm = "Non-Spacing Mark",
    bn  = "Boundary Neutral",
    b   = "Paragraph Separator",
    s   = "Segment Separator",
    ws  = "Whitespace",
    on  = "Other Neutrals",
}
897 898
--[[ldx-- 899<p>At this point we assume that the big data table is loaded. From this 900table we derive a few more.</p> 901--ldx]]
--
902 903
if not characters.fallbacks then

    -- Bidirectional combining-mark <-> spacing-accent fallbacks.  Every entry
    -- is a symmetric pair.
    --
    -- Fix: the grave pair used to read [0x0060] = 0x0333, breaking the
    -- symmetry every other pair has; 0x0333 (combining double low line) is
    -- even listed below as a deliberately unmapped pair (0x0333/0x2017).
    -- Per the "gravecomb grave" intent, 0x0060 now maps back to 0x0300.
    characters.fallbacks = allocate {
        [0x0308] = 0x00A8, [0x00A8] = 0x0308, -- dieresiscmb dieresis
        [0x0304] = 0x00AF, [0x00AF] = 0x0304, -- macroncmb macron
        [0x0301] = 0x00B4, [0x00B4] = 0x0301, -- acutecomb acute
        [0x0327] = 0x00B8, [0x00B8] = 0x0327, -- cedillacmb cedilla
        [0x0302] = 0x02C6, [0x02C6] = 0x0302, -- circumflexcmb circumflex
        [0x030C] = 0x02C7, [0x02C7] = 0x030C, -- caroncmb caron
        [0x0306] = 0x02D8, [0x02D8] = 0x0306, -- brevecmb breve
        [0x0307] = 0x02D9, [0x02D9] = 0x0307, -- dotaccentcmb dotaccent
        [0x030A] = 0x02DA, [0x02DA] = 0x030A, -- ringcmb ring
        [0x0328] = 0x02DB, [0x02DB] = 0x0328, -- ogonekcmb ogonek
        [0x0303] = 0x02DC, [0x02DC] = 0x0303, -- tildecomb tilde
        [0x030B] = 0x02DD, [0x02DD] = 0x030B, -- hungarumlautcmb hungarumlaut
        [0x0305] = 0x203E, [0x203E] = 0x0305, -- overlinecmb overline
        [0x0300] = 0x0060, [0x0060] = 0x0300, -- gravecomb grave
    }

    -- not done (would mess up mapping):
    --
    -- 0X0301/0X0384 0X0314/0X1FFE 0X0313/0X1FBD 0X0313/0X1FBF 0X0342/0X1FC0
    -- 0X3099/0X309B 0X309A/0X309C 0X0333/0X2017 0X0345/0X037A

end
928 929
-- Persist the fallback map in the format (accents and such); storage is only
-- present during format generation.
if storage then
    storage.register("characters/fallbacks", characters.fallbacks, "characters.fallbacks")
end
932 933
-- Lazy codepoint -> bidi direction map; caches false for characters without
-- a direction field (maybe 'l' would be a better default).
characters.directions = { }

setmetatableindex(characters.directions, function(t, k)
    local entry     = data[k]
    local direction = entry and entry.direction or false
    t[k] = direction
    return direction
end)
947 948
-- Lazy codepoint -> mirrored-codepoint map; caches false when the character
-- has no mirror counterpart.
characters.mirrors = { }

setmetatableindex(characters.mirrors, function(t, k)
    local entry  = data[k]
    local mirror = entry and entry.mirror or false
    t[k] = mirror
    return mirror
end)
962 963
-- Lazy codepoint -> textclass map; caches false when the character has no
-- textclass field.
characters.textclasses = { }

setmetatableindex(characters.textclasses, function(t, k)
    local entry     = data[k]
    local textclass = entry and entry.textclass or false
    t[k] = textclass
    return textclass
end)
977 978
--[[ldx-- 979<p>Next comes a whole series of helper methods. These are (will be) part 980of the official <l n='api'/>.</p> 981--ldx]]
--
982 983
-- we could make them virtual: characters.contextnames[n]
984 985
-- Simple field accessors; all return "" for unknown characters or missing
-- fields.  (We could make them virtual: characters.contextnames[n].)

function characters.contextname(n)
    local d = data[n]
    return d and d.contextname or ""
end

function characters.adobename(n)
    local d = data[n]
    return d and d.adobename or ""
end

function characters.description(n)
    local d = data[n]
    return d and d.description or ""
end

-------- characters.category (n) return data[n] and data[n].category or "" end
989 990
-- Return the general category code of character n, or its verbose name when
-- verbose is true; "" when unknown.
--
-- Fix: the original indexed data[n].category without checking data[n],
-- unlike every sibling accessor above, so an unknown codepoint raised an
-- error (unless data has an index metamethod providing entries -- not
-- visible here).  Now guarded, consistent with the other accessors.
function characters.category(n, verbose)
    local d = data[n]
    local c = d and d.category
    if not c then
        return ""
    elseif verbose then
        return categorytags[c]
    else
        return c
    end
end
1000 1001
-- -- some day we will make a table .. not that many calls to utfchar
1002
--
1003
-- local utfchar = utf.char
1004
-- local utfbyte = utf.byte
1005
-- local utfbytes = { }
1006
-- local utfchars = { }
1007
--
1008
-- table.setmetatableindex(utfbytes,function(t,k) local v = utfchar(k) t[k] = v return v end)
1009
-- table.setmetatableindex(utfchars,function(t,k) local v = utfbyte(k) t[k] = v return v end)
1010 1011
-- Convert a codepoint, or a table of codepoints, to a utf-8 string.
local function toutfstring(s)
    if type(s) == "table" then
        return utfchar(unpack(s)) -- multi-codepoint special
    end
    return utfchar(s)
end

utf.tostring = toutfstring
1020 1021
-- Lazy codepoint -> category map; falls back to the key itself when the
-- character has no category (or is unknown).
local categories = allocate()  characters.categories = categories -- lazy table

setmetatableindex(categories, function(t, u)
    if not u then
        return
    end
    local entry    = data[u]
    local category = entry and entry.category or u
    t[u] = category
    return category
end)
1024 1025
-- todo: overloads (these register directly in the tables as number and string) e.g. for greek
1026
-- todo: for string do a numeric lookup in the table itself
1027 1028
-- Lazy code maps: lowercase, uppercase, shape and first codes.  For a string
-- key without an entry field we fall back to its (first) codepoint, for any
-- other key to the key itself.  The four metamethods only differ in the
-- field they read, so they are generated from one template.

local lccodes = allocate()  characters.lccodes = lccodes -- lazy table
local uccodes = allocate()  characters.uccodes = uccodes -- lazy table
local shcodes = allocate()  characters.shcodes = shcodes -- lazy table
local fscodes = allocate()  characters.fscodes = fscodes -- lazy table

local function codelookup(field)
    return function(t, u)
        if u then
            local entry = data[u]
            local c = (entry and entry[field])
                or (type(u) == "string" and utfbyte(u))
                or u
            t[u] = c
            return c
        end
    end
end

setmetatableindex(lccodes, codelookup("lccode"))
setmetatableindex(uccodes, codelookup("uccode"))
setmetatableindex(shcodes, codelookup("shcode"))
setmetatableindex(fscodes, codelookup("fscode"))
1037 1038
-- Lazy character (string) variants of the code tables above: these map
-- a codepoint (or character) onto a utf string.
local lcchars = allocate() characters.lcchars = lcchars -- lazy table
local ucchars = allocate() characters.ucchars = ucchars -- lazy table
local shchars = allocate() characters.shchars = shchars -- lazy table
local fschars = allocate() characters.fschars = fschars -- lazy table
-- Lazy lookup of the lowercase character(s); numeric keys without a
-- mapping become their own utf character, others stay as-is.
setmetatableindex(lcchars, function(t, u)
    if u then
        local entry = data[u]
        local code = entry and entry.lccode
        local char = code and toutfstring(code)
        if not char then
            char = type(u) == "number" and utfchar(u) or u
        end
        t[u] = char
        return char
    end
end)
-- Lazy lookup of the uppercase character(s); same fallbacks as lcchars.
setmetatableindex(ucchars, function(t, u)
    if u then
        local entry = data[u]
        local code = entry and entry.uccode
        local char = code and toutfstring(code)
        if not char then
            char = type(u) == "number" and utfchar(u) or u
        end
        t[u] = char
        return char
    end
end)
-- Lazy lookup of the shape character(s); same fallbacks as lcchars.
setmetatableindex(shchars, function(t, u)
    if u then
        local entry = data[u]
        local code = entry and entry.shcode
        local char = code and toutfstring(code)
        if not char then
            char = type(u) == "number" and utfchar(u) or u
        end
        t[u] = char
        return char
    end
end)
-- Lazy lookup of the first shape character(s); same fallbacks as lcchars.
setmetatableindex(fschars, function(t, u)
    if u then
        local entry = data[u]
        local code = entry and entry.fscode
        local char = code and toutfstring(code)
        if not char then
            char = type(u) == "number" and utfchar(u) or u
        end
        t[u] = char
        return char
    end
end)
-- Lazy tables with decomposition and specials info; entries are either
-- a table or false (so misses get cached as well).
local decomposed = allocate() characters.decomposed = decomposed -- lazy table
local specials   = allocate() characters.specials   = specials  -- lazy table
-- Lazy lookup of a character's decomposition table; false when absent.
setmetatableindex(decomposed, function(t, u) -- either a table or false
    if u then
        local chardata = data[u]
        local result = chardata and chardata.decomposed or false -- could fall back to specials
        t[u] = result
        return result
    end
end)
-- Lazy lookup of a character's specials table; false when absent.
setmetatableindex(specials, function(t, u) -- either a table or false
    if u then
        local chardata = data[u]
        local result = chardata and chardata.specials or false
        t[u] = result
        return result
    end
end)
-- More lazy tables: resolved special characters, reverse lookup by
-- description and reverse lookup by synonym.
local specialchars = allocate() characters.specialchars = specialchars -- lazy table
local descriptions = allocate() characters.descriptions = descriptions -- lazy table
local synonyms     = allocate() characters.synonyms     = synonyms     -- lazy table
-- Lazy: maps a character onto the string of letters found in its
-- specials (e.g. a ligature or accented character onto its base
-- letters); characters without specials map onto themselves.
setmetatableindex(specialchars, function(t, u)
    if u then
        local chardata = data[u]
        local s = chardata and chardata.specials
        if s then
            local letters = { }
            local nofletters = 0
            -- slot 1 holds the kind of special, so start at slot 2
            for i = 2, #s do
                local si = s[i]
                if is_letter[data[si].category] then
                    nofletters = nofletters + 1
                    letters[nofletters] = utfchar(si)
                end
            end
            local result = concat(letters)
            t[u] = result
            return result
        else
            if type(u) == "number" then
                u = utfchar(u)
            end
            t[u] = u
            return u
        end
    end
end)
-- Lazy reverse map: lowercased, space-less description -> codepoint.
-- The whole map is built on the first access; unknown keys are cached
-- as themselves (so the first miss returns nil, later ones the key).
setmetatableindex(descriptions, function(t, k)
    -- building the map takes 0.05 - 0.10 sec
    for u, c in next, data do
        local d = c.description
        if d then
            if find(d, " ", 1, true) then
                d = gsub(d, " ", "")
            end
            t[lower(d)] = u
        end
    end
    local found = rawget(t, k)
    if not found then
        t[k] = k
    end
    return found
end)
-- Lazy reverse map: space-less synonym -> codepoint, built on the first
-- access (synonyms are already lowercase in the database). Unknown keys
-- are cached as themselves, mirroring the descriptions handler above.
setmetatableindex(synonyms, function(t, k)
    for u, c in next, data do
        local s = c.synonyms
        if s then
            if find(s, " ", 1, true) then
                s = gsub(s, " ", "")
            end
            -- s = lower(s) -- is already lowercase
            t[s] = u
        end
    end
    local s = rawget(t, k)
    if not s then
        -- Cache the miss under the requested key. The previous code did
        -- t[s] = s here, which indexes the table with a nil key and
        -- raises "table index is nil" on every unknown synonym.
        t[k] = k
    end
    return s
end)
-- Resolve a unicode slot from a number, a numeric string, or a
-- description (with or without spaces); returns nil when unknown.
function characters.unicodechar(asked)
    local number = tonumber(asked)
    if number then
        return number
    end
    if type(asked) == "string" then
        local found = descriptions[asked]
        if found then
            return found
        end
        return descriptions[gsub(asked, " ", "")]
    end
end
-- function characters.lower(str)
1148
-- local new, n = { }, 0
1149
-- for u in utfvalues(str) do
1150
-- n = n + 1
1151
-- new[n] = lcchars[u]
1152
-- end
1153
-- return concat(new)
1154
-- end
1155
--
1156
-- function characters.upper(str)
1157
-- local new, n = { }, 0
1158
-- for u in utfvalues(str) do
1159
-- n = n + 1
1160
-- new[n] = ucchars[u]
1161
-- end
1162
-- return concat(new)
1163
-- end
1164
--
1165
-- function characters.shaped(str)
1166
-- local new, n = { }, 0
1167
-- for u in utfvalues(str) do
1168
-- n = n + 1
1169
-- new[n] = shchars[u]
1170
-- end
1171
-- return concat(new)
1172
-- end
1173 1174
----- tolower = Cs((utf8byte/lcchars)^0)
1175
----- toupper = Cs((utf8byte/ucchars)^0)
1176
----- toshape = Cs((utf8byte/shchars)^0)
1177 1178
-- Old whole-string case/shape mapping patterns that run each character
-- through the lazy char tables; kept for compatibility (the hash based
-- ones defined further down overload them).
local tolower = Cs((utf8character/lcchars)^0) -- no need to check spacing
local toupper = Cs((utf8character/ucchars)^0) -- no need to check spacing
local toshape = Cs((utf8character/shchars)^0) -- no need to check spacing

lpegpatterns.tolower = tolower -- old ones ... will be overloaded
lpegpatterns.toupper = toupper -- old ones ... will be overloaded
lpegpatterns.toshape = toshape -- old ones ... will be overloaded
-- function characters.lower (str) return lpegmatch(tolower,str) end
1187
-- function characters.upper (str) return lpegmatch(toupper,str) end
1188
-- function characters.shaped(str) return lpegmatch(toshape,str) end
1189 1190
-- local superscripts = allocate() characters.superscripts = superscripts
1191
-- local subscripts = allocate() characters.subscripts = subscripts
1192 1193
-- if storage then
1194
-- storage.register("characters/superscripts", superscripts, "characters.superscripts")
1195
-- storage.register("characters/subscripts", subscripts, "characters.subscripts")
1196
-- end
1197 1198
-- end
1199 1200
if not characters.splits then

    -- Split tables built from the character database: "char" holds the
    -- canonical (char) splits and "compat" the compatibility splits,
    -- both keyed by codepoint.
    local char   = allocate()
    local compat = allocate()

    local splits = {
        char   = char,
        compat = compat,
    }

    characters.splits = splits

    -- [0x013F] = { 0x004C, 0x00B7 }
    -- [0x0140] = { 0x006C, 0x00B7 }

    for unicode, data in next, characters.data do
        local specials = data.specials
        -- slot 1 is the kind, so more than 2 slots means a real split
        if specials and #specials > 2 then
            local kind = specials[1]
            if kind == "compat" then
                compat[unicode] = { unpack(specials,2) }
            elseif kind == "char" then
                char[unicode] = { unpack(specials,2) }
            end
        end
    end

    if storage then
        -- store in the format so that this loop runs only at format
        -- generation time
        storage.register("characters/splits", splits, "characters.splits")
    end

end
if not characters.lhash then

    -- Direct utf character to utf character hashes for lowercase,
    -- uppercase and shape mapping, used by the lpeg converters below.
    local lhash = allocate() characters.lhash = lhash -- nil if no conversion
    local uhash = allocate() characters.uhash = uhash -- nil if no conversion
    local shash = allocate() characters.shash = shash -- nil if no conversion

    for k, v in next, characters.data do
     -- if k < 0x11000 then
            local l = v.lccode
            if l then
                -- we have an uppercase
                if type(l) == "number" then
                    lhash[utfchar(k)] = utfchar(l)
                elseif #l == 2 then
                    -- a two codepoint mapping (no longer ones seen here)
                    lhash[utfchar(k)] = utfchar(l[1]) .. utfchar(l[2])
             -- else
             --     inspect(v)
                end
            else
                local u = v.uccode
                if u then
                    -- we have an lowercase
                    if type(u) == "number" then
                        uhash[utfchar(k)] = utfchar(u)
                    elseif #u == 2 then
                        uhash[utfchar(k)] = utfchar(u[1]) .. utfchar(u[2])
                 -- else
                 --     inspect(v)
                    end
                end
            end
            local s = v.shcode
            if s then
                if type(s) == "number" then
                    shash[utfchar(k)] = utfchar(s)
                elseif #s == 2 then
                    shash[utfchar(k)] = utfchar(s[1]) .. utfchar(s[2])
             -- else
             --     inspect(v)
                end
            end
     -- end
    end

    if storage then
        -- store in the format so that this loop runs only at format
        -- generation time
        storage.register("characters/lhash", lhash, "characters.lhash")
        storage.register("characters/uhash", uhash, "characters.uhash")
        storage.register("characters/shash", shash, "characters.shash")
    end

end
-- The hashes may come from the format file, so mark them as used
-- storage tables.
local lhash = characters.lhash mark(lhash)
local uhash = characters.uhash mark(uhash)
local shash = characters.shash mark(shash)

-- Single character converters built from the hashes.
local utf8lowercharacter = utfchartabletopattern(lhash) / lhash
local utf8uppercharacter = utfchartabletopattern(uhash) / uhash
local utf8shapecharacter = utfchartabletopattern(shash) / shash

-- Whole string converters; characters without a mapping pass through.
local utf8lower = Cs((utf8lowercharacter + utf8character)^0)
local utf8upper = Cs((utf8uppercharacter + utf8character)^0)
local utf8shape = Cs((utf8shapecharacter + utf8character)^0)

lpegpatterns.utf8lowercharacter = utf8lowercharacter -- one character
lpegpatterns.utf8uppercharacter = utf8uppercharacter -- one character
lpegpatterns.utf8shapecharacter = utf8shapecharacter -- one character

lpegpatterns.utf8lower = utf8lower -- string
lpegpatterns.utf8upper = utf8upper -- string
lpegpatterns.utf8shape = utf8shape -- string
-- String wise case and shape mapping; a nil argument maps onto "".
function characters.lower(str)
    if str then
        return lpegmatch(utf8lower, str) or ""
    end
    return ""
end

function characters.upper(str)
    if str then
        return lpegmatch(utf8upper, str) or ""
    end
    return ""
end

function characters.shaped(str)
    if str then
        return lpegmatch(utf8shape, str) or ""
    end
    return ""
end

-- Let the generic utf helpers use these casers.
lpeg.setutfcasers(characters.lower, characters.upper)
-- local str = [[
1312
-- ÀÁÂÃÄÅàáâãäå àáâãäåàáâãäå ÀÁÂÃÄÅÀÁÂÃÄÅ AAAAAAaaaaaa
1313
-- ÆÇæç æçæç ÆÇÆÇ AECaec
1314
-- ÈÉÊËèéêë èéêëèéêë ÈÉÊËÈÉÊË EEEEeeee
1315
-- ÌÍÎÏÞìíîïþ ìíîïþìíîïþ ÌÍÎÏÞÌÍÎÏÞ IIIIÞiiiiþ
1316
-- Ðð ðð ÐÐ Ðð
1317
-- Ññ ññ ÑÑ Nn
1318
-- ÒÓÔÕÖòóôõö òóôõöòóôõö ÒÓÔÕÖÒÓÔÕÖ OOOOOooooo
1319
-- Øø øø ØØ Oo
1320
-- ÙÚÛÜùúûü ùúûüùúûü ÙÚÛÜÙÚÛÜ UUUUuuuu
1321
-- Ýýÿ ýýÿ ÝÝŸ Yyy
1322
-- ß ß SS ss
1323
-- Ţţ ţţ ŢŢ Tt
1324
-- ]]
1325
--
1326
-- local lower = characters.lower print(lower(str))
1327
-- local upper = characters.upper print(upper(str))
1328
-- local shaped = characters.shaped print(shaped(str))
1329
--
1330
-- local c, n = os.clock(), 10000
1331
-- for i=1,n do lower(str) upper(str) shaped(str) end -- 2.08 => 0.77
1332
-- print(os.clock()-c,n*#str*3)
1333 1334
-- maybe: (twice as fast when much ascii)
1335
--
1336
-- local tolower = lpeg.patterns.tolower
1337
-- local lower = string.lower
1338
--
1339
-- local allascii = R("\000\127")^1 * P(-1)
1340
--
1341
-- function characters.checkedlower(str)
1342
-- return lpegmatch(allascii,str) and lower(str) or lpegmatch(tolower,str) or str
1343
-- end
1344 1345
-- Return only the letters of a string; when spacing is set, runs of
-- spacing characters between letters collapse into a single space
-- (leading spaces are suppressed).
function characters.lettered(str,spacing)
    local new, n = { }, 0
    if spacing then
        local done = false
        for u in utfvalues(str) do
            local c = data[u].category
            if is_letter[c] then
                -- the n > 1 test also prevents a leading space
                if done and n > 1 then
                    n = n + 1
                    new[n] = " "
                    done = false
                end
                n = n + 1
                new[n] = utfchar(u)
            elseif spacing and is_spacing[c] then -- spacing is always true in this branch
                done = true
            end
        end
    else
        for u in utfvalues(str) do
            if is_letter[data[u].category] then
                n = n + 1
                new[n] = utfchar(u)
            end
        end
    end
    return concat(new)
end
--[[ldx-- 1375<p>Requesting lower and uppercase codes:</p> 1376--ldx]]
--
1377 1378
-- Obsolete accessors; index the uccodes / lccodes tables directly.
function characters.uccode(n)
    return uccodes[n]
end

function characters.lccode(n)
    return lccodes[n]
end
-- Return the (first and last) shape code of a character; characters
-- without a shape code return themselves.
function characters.shape(n)
    local code = shcodes[n]
    if not code then
        return n, nil
    end
    if type(code) == "table" then
        return code[1], code[#code]
    end
    return code, nil
end
-- -- some day we might go this route, but it does not really save that much
1393
-- -- so not now (we can generate a lot using mtx-unicode that operates on the
1394
-- -- database)
1395
--
1396
-- -- category cjkwd direction linebreak
1397
--
1398
-- -- adobename comment contextcommand contextname description fallback lccode
1399
-- -- mathclass mathfiller mathname mathspec mathstretch mathsymbol mirror
1400
-- -- range shcode specials uccode uccodes unicodeslot
1401
--
1402
-- local data = {
1403
-- ['one']={
1404
-- common = {
1405
-- category="cc",
1406
-- direction="bn",
1407
-- linebreak="cm",
1408
-- },
1409
-- vector = {
1410
-- [0x0000] = {
1411
-- description="NULL",
1412
-- group='one',
1413
-- unicodeslot=0x0000,
1414
-- },
1415
-- {
1416
-- description="START OF HEADING",
1417
-- group='one',
1418
-- unicodeslot=0x0001,
1419
-- },
1420
-- }
1421
-- }
1422
-- }
1423
--
1424
-- local chardata, groupdata = { }, { }
1425
--
1426
-- for group, gdata in next, data do
1427
-- local common, vector = { __index = gdata.common }, gdata.vector
1428
-- for character, cdata in next, vector do
1429
-- chardata[character] = cdata
1430
-- setmetatable(cdata,common)
1431
-- end
1432
-- groupdata[group] = gdata
1433
-- end
1434 1435
-- characters.data, characters.groups = chardata, groupdata
1436 1437
-- [0xF0000]={
1438
-- category="co",
1439
-- cjkwd="a",
1440
-- description="<Plane 0x000F Private Use, First>",
1441
-- direction="l",
1442
-- unicodeslot=0xF0000,
1443
-- },
1444
-- [0xFFFFD]={
1445
-- category="co",
1446
-- cjkwd="a",
1447
-- description="<Plane 0x000F Private Use, Last>",
1448
-- direction="l",
1449
-- unicodeslot=0xFFFFD,
1450
-- },
1451
-- [0x100000]={
1452
-- category="co",
1453
-- cjkwd="a",
1454
-- description="<Plane 0x0010 Private Use, First>",
1455
-- direction="l",
1456
-- unicodeslot=0x100000,
1457
-- },
1458
-- [0x10FFFD]={
1459
-- category="co",
1460
-- cjkwd="a",
1461
-- description="<Plane 0x0010 Private Use, Last>",
1462
-- direction="l",
1463
-- unicodeslot=0x10FFFD,
1464
-- },
1465 1466
if not characters.superscripts then

    -- Super/subscript and fraction tables built from the specials in
    -- the character database.
    local superscripts = allocate() characters.superscripts = superscripts
    local subscripts   = allocate() characters.subscripts   = subscripts
    local fractions    = allocate() characters.fractions    = fractions

    -- skipping U+02120 (service mark) U+02122 (trademark)

    for k, v in next, data do
        local specials = v.specials
        if specials then
            local what = specials[1]
            if what == "super" then
                -- only single character superscripts qualify
                if #specials == 2 then
                    superscripts[k] = specials[2]
                elseif trace_defining then
                    report_defining("ignoring %s %a, char %c, description %a","superscript",ustring(k),k,v.description)
                end
            elseif what == "sub" then
                if #specials == 2 then
                    subscripts[k] = specials[2]
                elseif trace_defining then
                    report_defining("ignoring %s %a, char %c, description %a","subscript",ustring(k),k,v.description)
                end
            elseif what == "fraction" then
                -- fractions keep all components
                if #specials > 1 then
                    fractions[k] = { unpack(specials,2) }
                elseif trace_defining then
                    report_defining("ignoring %s %a, char %c, description %a","fraction",ustring(k),k,v.description)
                end
            end
        end
    end

    -- print(table.serialize(superscripts, "superscripts", { hexify = true }))
    -- print(table.serialize(subscripts,   "subscripts",   { hexify = true }))
    -- print(table.serialize(fractions,    "fractions",    { hexify = true }))

    if storage then
        -- store in the format so that this loop runs only at format
        -- generation time
        storage.register("characters/superscripts", superscripts, "characters.superscripts")
        storage.register("characters/subscripts",   subscripts,   "characters.subscripts")
        storage.register("characters/fractions",    fractions,    "characters.fractions")
    end

end
-- Report (for tracing) the individual characters that make up a string.
function characters.showstring(str)
    local chars = utotable(str)
    for i = 1, #chars do
        report_defining("split % 3i : %C", i, chars[i])
    end
end
do

    -- Emoji name handling. There is no need to preload this table.

    local any       = P(1)
    local special   = S([['".,:;-+()]])
                    + P('“') + P('”') -- NOTE(review): token dump hides the exact quote characters; presumably curly double quotes -- confirm
    local apostrofe = P("’") + P("'") -- curly and ascii apostrophe (same caveat as above)

    -- Normalize an emoji description: expand "medium light/dark" into
    -- the hyphenated form, strip possessive "'s" and map punctuation
    -- onto spaces.
    local pattern = Cs ( (
        (P("medium light")/"medium-light" + P("medium dark")/"medium-dark") * P(" skin tone")
      + (apostrofe * P("s"))/""
      + special/" "
      + any
    )^1)
    -- Load the emoji database (char-emj.lua, found via the resolver)
    -- and build a hash that maps each normalized name onto its utf
    -- sequence.
    local function load()
        local name = resolvers.findfile("char-emj.lua")
        local data = name and name ~= "" and dofile(name) or { }
        local hash = { }
        for d, c in next, data do
            local k = lpegmatch(pattern,d) or d -- normalized name
            local u = { }
            for i=1,#c do
                u[i] = utfchar(c[i])
            end
            u = concat(u)
            hash[k] = u
        end
        return data, hash
    end

    -- Filled on first use by load().
    local data, hash = nil, nil
    -- Return the normalized name, or the raw name packed in a table
    -- when normalization fails.
    function characters.emojized(name)
        local t = lpegmatch(pattern,name)
        if t then
            return t
        else
            return { name }
        end
    end
    local start    = P(" ")
    local finish   = P(-1) + P(" ")
    local skintone = P("medium ")^0 * (P("light ") + P("dark "))^0 * P("skin tone")
    local gender   = P("woman") + P("man")

    -- Expand the compact skin tone shortcuts, e.g. "m-l-s-t" becomes
    -- "medium-light skin tone". (NOTE(review): the exact placement of
    -- the separating space in the replacements was reconstructed to
    -- round-trip with "compacted" below -- confirm against char-emj use.)
    local expanded = (
        P("m-l-")/"medium-light "
      + P("m-d-")/"medium-dark "
      + P("l-")/"light "
      + P("m-")/"medium "
      + P("d-")/"dark "
    )
    * (P("s-t")/"skin tone")

    -- The reverse: compact a skin tone specification, e.g.
    -- "medium-light skin tone" becomes "m-l-s-t".
    local compacted = (
        (P("medium-")/"m-" * (P("light")/"l" + P("dark")/"d"))
      + (P("medium")/"m" + P("light")/"l" + P("dark")/"d")
    )
    * (P(" skin tone")/"-s-t")

    local pattern_0 = Cs((expanded + any)^1)                                  -- expand shortcuts
    local pattern_1 = Cs(((start * skintone + skintone * finish)/"" + any)^1) -- strip skin tones
    local pattern_2 = Cs(((start * gender + gender * finish)/"" + any)^1)     -- strip genders
    local pattern_4 = Cs((compacted + any)^1)                                 -- compact shortcuts
-- print(lpegmatch(pattern_0,"kiss woman l-s-t man d-s-t"))
1586
-- print(lpegmatch(pattern_0,"something m-l-s-t"))
1587
-- print(lpegmatch(pattern_0,"something m-s-t"))
1588
-- print(lpegmatch(pattern_4,"something medium-light skin tone"))
1589
-- print(lpegmatch(pattern_4,"something medium skin tone"))
1590 1591
    -- Skin tone modifiers (U+1F3FB .. U+1F3FF).
    local skin =
        P("light skin tone")        / utfchar(0x1F3FB)
      + P("medium-light skin tone") / utfchar(0x1F3FC)
      + P("medium skin tone")       / utfchar(0x1F3FD)
      + P("medium-dark skin tone")  / utfchar(0x1F3FE)
      + P("dark skin tone")         / utfchar(0x1F3FF)

    -- Family member characters.
    local parent =
        P("man")   / utfchar(0x1F468)
      + P("woman") / utfchar(0x1F469)

    local child =
        P("baby") / utfchar(0x1F476)
      + P("boy")  / utfchar(0x1F466)
      + P("girl") / utfchar(0x1F467)

    -- Building blocks for zwj sequences: U+200D is the zero width
    -- joiner, heart/kiss are the joined middle parts of the couple and
    -- kiss sequences (heart + variation selector 16, plus kiss mark).
    local zwj   = utfchar(0x200D)
    local heart = utfchar(0x2764) .. utfchar(0xFE0F) .. zwj
    local kiss  = utfchar(0x2764) .. utfchar(0xFE0F) .. utfchar(0x200D) .. utfchar(0x1F48B) .. zwj
    ----- member = parent + child

    local space = P(" ")
    local final = P(-1) -- currently unused (p_done tests end of string itself)

    -- Helpers that join members with zero width joiners and apply an
    -- optional skin tone modifier to each member.
    local p_done   = (space^1/zwj) + P(-1)
    local p_rest   = space/"" * (skin * p_done) + p_done
    local p_parent = parent * p_rest
    local p_child  = child  * p_rest

    -- Composers for the generated family, couple and kiss sequences:
    -- up to two parents and two children for a family, exactly two
    -- parents around a heart or kiss for the other two.
    local p_family = Cs((P("family")            * space^1)/"" * p_parent^-2 * p_child^-2)
    local p_couple = Cs((P("couple with heart") * space^1)/"" * p_parent * Cc(heart) * p_parent)
    local p_kiss   = Cs((P("kiss")              * space^1)/"" * p_parent * Cc(kiss)  * p_parent)

    local p_special = p_family + p_couple + p_kiss
-- print(lpeg.match(p_special,"family man woman girl"))
1628
-- print(lpeg.match(p_special,"family man dark skin tone woman girl girl"))
1629 1630
-- local p_special = P { "all",
1631
-- all = Cs (V("family") + V("couple") + V("kiss")),
1632
-- family = C("family") * space^1 * V("parent")^-2 * V("child")^-2,
1633
-- couple = P("couple with heart") * space^1 * V("parent") * Cc(heart) * V("parent"),
1634
-- kiss = P("kiss") * space^1 * V("parent") * Cc(kiss) * V("parent"),
1635
-- parent = parent * V("rest"),
1636
-- child = child * V("rest"),
1637
-- rest = (space * skin)^0/"" * ((space^1/zwj) + P(-1)),
1638
-- }
1639 1640
    local emoji = { }
    characters.emoji = emoji

    -- Weak cache for resolved names and for cached failures (false).
    local cache = setmetatable({ }, { __mode = "k" })
    -- Resolve an emoji name to its utf sequence: first a direct hash
    -- lookup, then the cache, then the generated family/couple/kiss
    -- sequences, and finally retries with skin tone and gender
    -- specifications stripped; failures are cached as false.
    function emoji.resolve(name)
        if not hash then
            data, hash = load()
        end
        local h = hash[name]
        if h then
            return h
        end
        local h = cache[name]
        if h then
            return h
        elseif h == false then
            -- a cached failure
            return
        end
        -- expand shortcuts
        local name = lpegmatch(pattern_0,name) or name
        -- expand some 25K variants
        -- (note: from here on results are cached under the expanded
        -- name, so a repeated lookup with the shortcut form redoes the
        -- expansion -- harmless, just a bit of extra work)
        local h = lpegmatch(p_special,name)
        if h then
            cache[name] = h
            return h
        end
        -- simplify: drop skin tones
        local s = lpegmatch(pattern_1,name)
        local h = hash[s]
        if h then
            cache[name] = h
            return h
        end
        -- simplify: drop genders
        local s = lpegmatch(pattern_2,name)
        local h = hash[s]
        if h then
            cache[name] = h
            return h
        end
        cache[name] = false
    end
    -- Return the name -> utf hash and the raw database, loading them
    -- when needed.
    function emoji.known()
        if not hash then
            data, hash = load()
        end
        return hash, data
    end
    -- Compact skin tone specifications in a name, e.g.
    -- "medium-light skin tone" -> "m-l-s-t".
    function emoji.compact(name)
        return lpegmatch(pattern_4,name) or name
    end

end
-- code moved to char-tex.lua
1698 1699
-- The shared characters table doubles as the module's return value.
return characters