-- char-ini.lua /size: 93 Kb    last modification: 2021-10-28 13:50
-- Register this file in the global module table (created on demand).
modules = modules or { }

modules['char-ini'] = {
    version   = 1.001,
    comment   = "companion to char-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}
8
9-- todo: make two files, one for format generation, one for format use
10-- todo: move some to char-utf
11
12-- we can remove the tag range starting at 0xE0000 (special applications)
13
14local utfchar, utfbyte, utfvalues, ustring, utotable = utf.char, utf.byte, utf.values, utf.ustring, utf.totable
15local concat, unpack, tohash, insert = table.concat, table.unpack, table.tohash, table.insert
16local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset
17local format, lower, gsub, find = string.format, string.lower, string.gsub, string.find
18local P, R, S, C, Cs, Ct, Cc, V = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cs, lpeg.Ct, lpeg.Cc, lpeg.V
19local formatters = string.formatters
20
21if not characters then require("char-def") end
22
23local lpegpatterns          = lpeg.patterns
24local lpegmatch             = lpeg.match
25local utf8byte              = lpegpatterns.utf8byte
26local utf8character         = lpegpatterns.utf8character
27
28local utfchartabletopattern = lpeg.utfchartabletopattern
29
30local allocate              = utilities.storage.allocate
31local mark                  = utilities.storage.mark
32
33local setmetatableindex     = table.setmetatableindex
34
-- Flag toggled by the "characters.defining" tracker. The callback must assign
-- to this local: writing to any other name leaves the flag permanently false
-- and leaks a global.
local trace_defining        = false  trackers.register("characters.defining", function(v) trace_defining = v end)
36
37local report_defining       = logs.reporter("characters")
38
39--[[ldx--
<p>This module implements some methods and creates additional data structures
from the big character table that we use for all kinds of purposes:
42<type>char-def.lua</type>.</p>
43
44<p>We assume that at this point <type>characters.data</type> is already
45loaded!</p>
46--ldx]]--
47
48-- todo: in 'char-def.lua' assume defaults:
49--
50-- directions = l
51-- cjkwd      = a
52-- linebreak  = al
53
-- Set up the shared 'characters' namespace and grab the big character table.
-- The table itself is expected to have been filled in by 'char-def.lua'.
characters       = characters or { }
local characters = characters
local data       = characters.data

if data then
    mark(data) -- why does this fail
else
    -- Without the character data nothing in this module can work, so we stop.
    -- A plain os.exit() reports success to the calling process; pass a
    -- non-zero status so that scripts and build tools can detect the failure.
    report_defining("fatal error: 'char-def.lua' is not loaded")
    os.exit(1)
end
64
65--[[ldx--
66Extending the table.
67--ldx]]--
68
if context then

    -- The private slot definitions are only loaded (and registered with the
    -- storage mechanism, when present) if they are not available yet.

    local loadprivate = not characters.private

    if loadprivate then
        require("char-prv")
    end

    if loadprivate and storage then
        storage.register("characters/private", characters.private, "characters.private")
    end

    -- Merge every private slot into the main character table.

    for slot, definition in next, characters.private do
        data[slot] = definition
    end

end
86
87--[[ldx--
88<p>This converts a string (if given) into a number.</p>
89--ldx]]--
90
-- Matches a complete string of the form "0x<hex>" or "U+<hex>" and yields the
-- value as a number. Only digits and uppercase A-F are accepted, and the
-- hexadecimal part has to run till the end of the string.

local pattern do

    local prefix    = P("0x") + P("U+")
    local hexstring = R("09","AF")^1 * P(-1)

    pattern = prefix * (hexstring / function(s) return tonumber(s,16) end)

end

lpegpatterns.chartonumber = pattern
94
-- Converts a character specification into a code point.
--
-- k : a string ("0x61" or "U+61" style hexadecimal notation, or text whose
--     leading character is taken), a number, or nil
--
-- Returns a number: the parsed hexadecimal value, the code point of the
-- string as reported by utfbyte, the number that was passed, or 0 when
-- nothing usable was given.

local function chartonumber(k)
    if type(k) ~= "string" then
        return k or 0
    end
    local u = lpegmatch(pattern,k)
    if u then
        -- The pattern already delivers a number. Passing it on to utfbyte
        -- would coerce it to its decimal string and return the code of the
        -- first digit instead.
        return u
    end
    return utfbyte(k) or 0
end
107
-- The reverse of chartonumber: a number is turned into its utf string (or an
-- empty string when utfchar gives nothing), a "0x.." or "U+.." string is
-- parsed first and then converted, and anything else is returned untouched.

local function charfromnumber(k)
    if type(k) == "number" then
        return utfchar(k) or ""
    end
    local slot = lpegmatch(pattern,k)
    if not slot then
        return k
    end
    return utfchar(slot)
end
120
121--~ print(chartonumber(97), chartonumber("a"), chartonumber("0x61"), chartonumber("U+61"))
122
123characters.tonumber   = chartonumber
124characters.fromnumber = charfromnumber
125
-- The entry that is handed out for every slot we know nothing about.

local private = { description = "PRIVATE SLOT" }

-- Ranges whose entries are generated on demand; each range record has a
-- 'first' and 'last' slot and optionally an 'extender' function.

local ranges = allocate()

characters.ranges = ranges

-- Index handler for the character table. String keys ("0x..", "U+.." or a
-- character) are first mapped onto a number. Numbers below 0xF0000 are then
-- checked against the registered ranges and the first matching range that
-- has an extender creates the entry, which is also stored in the table.
-- Everything else resolves to the shared 'private' entry.

setmetatableindex(data, function(t,k)
    local kind = type(k)
    if kind == "string" then
        k = lpegmatch(pattern,k) or utfbyte(k)
        if not k then
            return private
        end
        local known = rawget(t,k)
        if known then
            return known
        end
        kind = "number" -- continue with the range lookup
    end
    if kind ~= "number" or not (k < 0xF0000) then
        return private -- handy for when we loop over characters in fonts and check for a property
    end
    for index=1,#ranges do
        local range = ranges[index]
        if k >= range.first and k <= range.last then
            local extender = range.extender
            if extender then
                local entry = extender(k)
                t[k] = entry
                return entry
            end
        end
    end
    return private -- handy for when we loop over characters in fonts and check for a property
end)
163
-- Properties shared by all variant selector entries. The range extenders
-- install this table as metatable on the entries they create.

local variant_selector_metatable = {
    category  = "mn",
    cjkwd     = "a",
    direction = "nsm",
    linebreak = "cm",
}

-- A metatable only provides missing fields through its __index entry, so we
-- point that at the table itself. Without it the shared properties above are
-- invisible on the entries.

variant_selector_metatable.__index = variant_selector_metatable
170
171-- This saves a bit of memory and also serves as example.
172
local f_variant = formatters["VARIATION SELECTOR-0x%04X"]

do

    -- Creates the extender for a range of variant selectors: 'first' is the
    -- first slot of the range and 'offset' the number of the first selector
    -- in that range. Each generated entry gets the shared metatable.

    local function selectors(first,offset)
        return function(k)
            local entry = {
                description = f_variant(k - first + offset),
                unicodeslot = k,
            }
            return setmetatable(entry,variant_selector_metatable)
        end
    end

    insert(characters.ranges, {
        first    = 0xFE00,
        last     = 0xFE0F,
        name     = "variant selector",
        extender = selectors(0xFE00,0x0001),
    })

    insert(characters.ranges, {
        first    = 0xE0100,
        last     = 0xE01EF,
        name     = "variant selector extension",
        extender = selectors(0xE0100,0x0011),
    })

end
202
203local blocks = allocate {
204    ["adlam"]                                       = { first = 0x1E900, last = 0x1E95F,              description = "Adlam" },
205    ["aegeannumbers"]                               = { first = 0x10100, last = 0x1013F,              description = "Aegean Numbers" },
206    ["ahom"]                                        = { first = 0x11700, last = 0x1174F,              description = "Ahom" },
207    ["alchemicalsymbols"]                           = { first = 0x1F700, last = 0x1F77F,              description = "Alchemical Symbols" },
208    ["alphabeticpresentationforms"]                 = { first = 0x0FB00, last = 0x0FB4F, otf="latn",  description = "Alphabetic Presentation Forms" },
209    ["anatolianhieroglyphs"]                        = { first = 0x14400, last = 0x1467F,              description = "Anatolian Hieroglyphs" },
210    ["ancientgreekmusicalnotation"]                 = { first = 0x1D200, last = 0x1D24F, otf="grek",  description = "Ancient Greek Musical Notation" },
211    ["ancientgreeknumbers"]                         = { first = 0x10140, last = 0x1018F, otf="grek",  description = "Ancient Greek Numbers" },
212    ["ancientsymbols"]                              = { first = 0x10190, last = 0x101CF, otf="grek",  description = "Ancient Symbols" },
213    ["arabic"]                                      = { first = 0x00600, last = 0x006FF, otf="arab",  description = "Arabic" },
214    ["arabicextendeda"]                             = { first = 0x008A0, last = 0x008FF,              description = "Arabic Extended-A" },
215    ["arabicextendedb"]                             = { first = 0x00870, last = 0x0089F,              description = "Arabic Extended-B" },
216    ["arabicmathematicalalphabeticsymbols"]         = { first = 0x1EE00, last = 0x1EEFF,              description = "Arabic Mathematical Alphabetic Symbols" },
217    ["arabicpresentationformsa"]                    = { first = 0x0FB50, last = 0x0FDFF, otf="arab",  description = "Arabic Presentation Forms-A" },
218    ["arabicpresentationformsb"]                    = { first = 0x0FE70, last = 0x0FEFF, otf="arab",  description = "Arabic Presentation Forms-B" },
219    ["arabicsupplement"]                            = { first = 0x00750, last = 0x0077F, otf="arab",  description = "Arabic Supplement" },
220    ["armenian"]                                    = { first = 0x00530, last = 0x0058F, otf="armn",  description = "Armenian" },
221    ["arrows"]                                      = { first = 0x02190, last = 0x021FF,              description = "Arrows" },
222    ["avestan"]                                     = { first = 0x10B00, last = 0x10B3F,              description = "Avestan" },
223    ["balinese"]                                    = { first = 0x01B00, last = 0x01B7F, otf="bali",  description = "Balinese" },
224    ["bamum"]                                       = { first = 0x0A6A0, last = 0x0A6FF,              description = "Bamum" },
225    ["bamumsupplement"]                             = { first = 0x16800, last = 0x16A3F,              description = "Bamum Supplement" },
226    ["basiclatin"]                                  = { first = 0x00000, last = 0x0007F, otf="latn",  description = "Basic Latin" },
227    ["bassavah"]                                    = { first = 0x16AD0, last = 0x16AFF,              description = "Bassa Vah" },
228    ["batak"]                                       = { first = 0x01BC0, last = 0x01BFF,              description = "Batak" },
229    ["bengali"]                                     = { first = 0x00980, last = 0x009FF, otf="beng",  description = "Bengali" },
230    ["bhaiksuki"]                                   = { first = 0x11C00, last = 0x11C6F,              description = "Bhaiksuki" },
231    ["blockelements"]                               = { first = 0x02580, last = 0x0259F, otf="bopo",  description = "Block Elements" },
232    ["bopomofo"]                                    = { first = 0x03100, last = 0x0312F, otf="bopo",  description = "Bopomofo" },
233    ["bopomofoextended"]                            = { first = 0x031A0, last = 0x031BF, otf="bopo",  description = "Bopomofo Extended" },
234    ["boxdrawing"]                                  = { first = 0x02500, last = 0x0257F,              description = "Box Drawing" },
235    ["brahmi"]                                      = { first = 0x11000, last = 0x1107F,              description = "Brahmi" },
236    ["braillepatterns"]                             = { first = 0x02800, last = 0x028FF, otf="brai",  description = "Braille Patterns" },
237    ["buginese"]                                    = { first = 0x01A00, last = 0x01A1F, otf="bugi",  description = "Buginese" },
238    ["buhid"]                                       = { first = 0x01740, last = 0x0175F, otf="buhd",  description = "Buhid" },
239    ["byzantinemusicalsymbols"]                     = { first = 0x1D000, last = 0x1D0FF, otf="byzm",  description = "Byzantine Musical Symbols" },
240    ["carian"]                                      = { first = 0x102A0, last = 0x102DF,              description = "Carian" },
241    ["caucasianalbanian"]                           = { first = 0x10530, last = 0x1056F,              description = "Caucasian Albanian" },
242    ["chakma"]                                      = { first = 0x11100, last = 0x1114F,              description = "Chakma" },
243    ["cham"]                                        = { first = 0x0AA00, last = 0x0AA5F,              description = "Cham" },
244    ["cherokee"]                                    = { first = 0x013A0, last = 0x013FF, otf="cher",  description = "Cherokee" },
245    ["cherokeesupplement"]                          = { first = 0x0AB70, last = 0x0ABBF,              description = "Cherokee Supplement" },
246    ["chesssymbols"]                                = { first = 0x1FA00, last = 0x1FA6F,              description = "Chess Symbols" },
247    ["chorasmian"]                                  = { first = 0x10FB0, last = 0x10FDF,              description = "Chorasmian" },
248    ["cjkcompatibility"]                            = { first = 0x03300, last = 0x033FF, otf="hang",  description = "CJK Compatibility" },
249    ["cjkcompatibilityforms"]                       = { first = 0x0FE30, last = 0x0FE4F, otf="hang",  description = "CJK Compatibility Forms" },
250    ["cjkcompatibilityideographs"]                  = { first = 0x0F900, last = 0x0FAFF, otf="hang",  description = "CJK Compatibility Ideographs" },
251    ["cjkcompatibilityideographssupplement"]        = { first = 0x2F800, last = 0x2FA1F, otf="hang",  description = "CJK Compatibility Ideographs Supplement" },
252    ["cjkradicalssupplement"]                       = { first = 0x02E80, last = 0x02EFF, otf="hang",  description = "CJK Radicals Supplement" },
253    ["cjkstrokes"]                                  = { first = 0x031C0, last = 0x031EF, otf="hang",  description = "CJK Strokes" },
254    ["cjksymbolsandpunctuation"]                    = { first = 0x03000, last = 0x0303F, otf="hang",  description = "CJK Symbols and Punctuation" },
255    ["cjkunifiedideographs"]                        = { first = 0x04E00, last = 0x09FFF, otf="hang",  description = "CJK Unified Ideographs", catcode = "letter" },
256    ["cjkunifiedideographsextensiona"]              = { first = 0x03400, last = 0x04DBF, otf="hang",  description = "CJK Unified Ideographs Extension A" },
257    ["cjkunifiedideographsextensionb"]              = { first = 0x20000, last = 0x2A6DF, otf="hang",  description = "CJK Unified Ideographs Extension B" },
258    ["cjkunifiedideographsextensionc"]              = { first = 0x2A700, last = 0x2B73F,              description = "CJK Unified Ideographs Extension C" },
259    ["cjkunifiedideographsextensiond"]              = { first = 0x2B740, last = 0x2B81F,              description = "CJK Unified Ideographs Extension D" },
260    ["cjkunifiedideographsextensione"]              = { first = 0x2B820, last = 0x2CEAF,              description = "CJK Unified Ideographs Extension E" },
261    ["cjkunifiedideographsextensionf"]              = { first = 0x2CEB0, last = 0x2EBEF,              description = "CJK Unified Ideographs Extension F" },
262    ["cjkunifiedideographsextensiong"]              = { first = 0x30000, last = 0x3134F,              description = "CJK Unified Ideographs Extension G" },
263    ["combiningdiacriticalmarks"]                   = { first = 0x00300, last = 0x0036F,              description = "Combining Diacritical Marks" },
264    ["combiningdiacriticalmarksextended"]           = { first = 0x01AB0, last = 0x01AFF,              description = "Combining Diacritical Marks Extended" },
265    ["combiningdiacriticalmarksforsymbols"]         = { first = 0x020D0, last = 0x020FF,              description = "Combining Diacritical Marks for Symbols" },
266    ["combiningdiacriticalmarkssupplement"]         = { first = 0x01DC0, last = 0x01DFF,              description = "Combining Diacritical Marks Supplement" },
267    ["combininghalfmarks"]                          = { first = 0x0FE20, last = 0x0FE2F,              description = "Combining Half Marks" },
268    ["commonindicnumberforms"]                      = { first = 0x0A830, last = 0x0A83F,              description = "Common Indic Number Forms" },
269    ["controlpictures"]                             = { first = 0x02400, last = 0x0243F,              description = "Control Pictures" },
270    ["coptic"]                                      = { first = 0x02C80, last = 0x02CFF, otf="copt",  description = "Coptic" },
271    ["copticepactnumbers"]                          = { first = 0x102E0, last = 0x102FF,              description = "Coptic Epact Numbers" },
272    ["countingrodnumerals"]                         = { first = 0x1D360, last = 0x1D37F,              description = "Counting Rod Numerals" },
273    ["cuneiform"]                                   = { first = 0x12000, last = 0x123FF, otf="xsux",  description = "Cuneiform" },
274    ["cuneiformnumbersandpunctuation"]              = { first = 0x12400, last = 0x1247F, otf="xsux",  description = "Cuneiform Numbers and Punctuation" },
275    ["currencysymbols"]                             = { first = 0x020A0, last = 0x020CF,              description = "Currency Symbols" },
276    ["cypriotsyllabary"]                            = { first = 0x10800, last = 0x1083F, otf="cprt",  description = "Cypriot Syllabary" },
277    ["cyprominoan"]                                 = { first = 0x12F90, last = 0x12FFF,              description = "Cypro-Minoan" },
278    ["cyrillic"]                                    = { first = 0x00400, last = 0x004FF, otf="cyrl",  description = "Cyrillic" },
279    ["cyrillicextendeda"]                           = { first = 0x02DE0, last = 0x02DFF, otf="cyrl",  description = "Cyrillic Extended-A" },
280    ["cyrillicextendedb"]                           = { first = 0x0A640, last = 0x0A69F, otf="cyrl",  description = "Cyrillic Extended-B" },
281    ["cyrillicextendedc"]                           = { first = 0x01C80, last = 0x01C8F,              description = "Cyrillic Extended-C" },
282    ["cyrillicsupplement"]                          = { first = 0x00500, last = 0x0052F, otf="cyrl",  description = "Cyrillic Supplement" },
283    ["deseret"]                                     = { first = 0x10400, last = 0x1044F, otf="dsrt",  description = "Deseret" },
284    ["devanagari"]                                  = { first = 0x00900, last = 0x0097F, otf="deva",  description = "Devanagari" },
285    ["devanagariextended"]                          = { first = 0x0A8E0, last = 0x0A8FF,              description = "Devanagari Extended" },
286    ["digitsarabicindic"]                           = { first = 0x00660, last = 0x00669, math = true },
287 -- ["digitsbengali"]                               = { first = 0x009E6, last = 0x009EF, math = true },
288    ["digitsbold"]                                  = { first = 0x1D7CE, last = 0x1D7D8, math = true },
289 -- ["digitsdevanagari"]                            = { first = 0x00966, last = 0x0096F, math = true },
290    ["digitsdoublestruck"]                          = { first = 0x1D7D8, last = 0x1D7E2, math = true },
291 -- ["digitsethiopic"]                              = { first = 0x01369, last = 0x01371, math = true },
292    ["digitsextendedarabicindic"]                   = { first = 0x006F0, last = 0x006F9, math = true },
293 -- ["digitsgujarati"]                              = { first = 0x00AE6, last = 0x00AEF, math = true },
294 -- ["digitsgurmukhi"]                              = { first = 0x00A66, last = 0x00A6F, math = true },
295 -- ["digitskannada"]                               = { first = 0x00CE6, last = 0x00CEF, math = true },
296 -- ["digitskhmer"]                                 = { first = 0x017E0, last = 0x017E9, math = true },
297 -- ["digitslao"]                                   = { first = 0x00ED0, last = 0x00ED9, math = true },
298    ["digitslatin"]                                 = { first = 0x00030, last = 0x00039, math = true },
299 -- ["digitsmalayalam"]                             = { first = 0x00D66, last = 0x00D6F, math = true },
300 -- ["digitsmongolian"]                             = { first = 0x01810, last = 0x01809, math = true },
301    ["digitsmonospace"]                             = { first = 0x1D7F6, last = 0x1D80F, math = true },
302 -- ["digitsmyanmar"]                               = { first = 0x01040, last = 0x01049, math = true },
303    ["digitsnormal"]                                = { first = 0x00030, last = 0x00039, math = true },
304 -- ["digitsoriya"]                                 = { first = 0x00B66, last = 0x00B6F, math = true },
305    ["digitssansserifbold"]                         = { first = 0x1D7EC, last = 0x1D805, math = true },
306    ["digitssansserifnormal"]                       = { first = 0x1D7E2, last = 0x1D7EC, math = true },
307 -- ["digitstamil"]                                 = { first = 0x00030, last = 0x00039, math = true }, -- no zero
308 -- ["digitstelugu"]                                = { first = 0x00C66, last = 0x00C6F, math = true },
309 -- ["digitsthai"]                                  = { first = 0x00E50, last = 0x00E59, math = true },
310 -- ["digitstibetan"]                               = { first = 0x00F20, last = 0x00F29, math = true },
311    ["dingbats"]                                    = { first = 0x02700, last = 0x027BF,              description = "Dingbats" },
312    ["divesakuru"]                                  = { first = 0x11900, last = 0x1195F,              description = "Dives Akuru" },
313    ["dogra"]                                       = { first = 0x11800, last = 0x1184F,              description = "Dogra" },
314    ["dominotiles"]                                 = { first = 0x1F030, last = 0x1F09F,              description = "Domino Tiles" },
315    ["duployan"]                                    = { first = 0x1BC00, last = 0x1BC9F,              description = "Duployan" },
316    ["earlydynasticcuneiform"]                      = { first = 0x12480, last = 0x1254F,              description = "Early Dynastic Cuneiform" },
317    ["egyptianhieroglyphformatcontrols"]            = { first = 0x13430, last = 0x1343F,              description = "Egyptian Hieroglyph Format Controls" },
318    ["egyptianhieroglyphs"]                         = { first = 0x13000, last = 0x1342F,              description = "Egyptian Hieroglyphs" },
319    ["elbasan"]                                     = { first = 0x10500, last = 0x1052F,              description = "Elbasan" },
320    ["elymaic"]                                     = { first = 0x10FE0, last = 0x10FFF,              description = "Elymaic" },
321    ["emoticons"]                                   = { first = 0x1F600, last = 0x1F64F,              description = "Emoticons" },
322    ["enclosedalphanumerics"]                       = { first = 0x02460, last = 0x024FF,              description = "Enclosed Alphanumerics" },
323    ["enclosedalphanumericsupplement"]              = { first = 0x1F100, last = 0x1F1FF,              description = "Enclosed Alphanumeric Supplement" },
324    ["enclosedcjklettersandmonths"]                 = { first = 0x03200, last = 0x032FF,              description = "Enclosed CJK Letters and Months" },
325    ["enclosedideographicsupplement"]               = { first = 0x1F200, last = 0x1F2FF,              description = "Enclosed Ideographic Supplement" },
326    ["ethiopic"]                                    = { first = 0x01200, last = 0x0137F, otf="ethi",  description = "Ethiopic" },
327    ["ethiopicextended"]                            = { first = 0x02D80, last = 0x02DDF, otf="ethi",  description = "Ethiopic Extended" },
328    ["ethiopicextendeda"]                           = { first = 0x0AB00, last = 0x0AB2F,              description = "Ethiopic Extended-A" },
329    ["ethiopicextendedb"]                           = { first = 0x1E7E0, last = 0x1E7FF,              description = "Ethiopic Extended-B" },
330    ["ethiopicsupplement"]                          = { first = 0x01380, last = 0x0139F, otf="ethi",  description = "Ethiopic Supplement" },
331    ["generalpunctuation"]                          = { first = 0x02000, last = 0x0206F,              description = "General Punctuation" },
332    ["geometricshapes"]                             = { first = 0x025A0, last = 0x025FF, math = true, description = "Geometric Shapes" },
333    ["geometricshapesextended"]                     = { first = 0x1F780, last = 0x1F7FF,              description = "Geometric Shapes Extended" },
334    ["georgian"]                                    = { first = 0x010A0, last = 0x010FF, otf="geor",  description = "Georgian" },
335    ["georgianextended"]                            = { first = 0x01C90, last = 0x01CBF,              description = "Georgian Extended" },
336    ["georgiansupplement"]                          = { first = 0x02D00, last = 0x02D2F, otf="geor",  description = "Georgian Supplement" },
337    ["glagolitic"]                                  = { first = 0x02C00, last = 0x02C5F, otf="glag",  description = "Glagolitic" },
338    ["glagoliticsupplement"]                        = { first = 0x1E000, last = 0x1E02F,              description = "Glagolitic Supplement" },
339    ["gothic"]                                      = { first = 0x10330, last = 0x1034F, otf="goth",  description = "Gothic" },
340    ["grantha"]                                     = { first = 0x11300, last = 0x1137F,              description = "Grantha" },
341    ["greekandcoptic"]                              = { first = 0x00370, last = 0x003FF, otf="grek",  description = "Greek and Coptic" },
342    ["greekextended"]                               = { first = 0x01F00, last = 0x01FFF, otf="grek",  description = "Greek Extended" },
343    ["gujarati"]                                    = { first = 0x00A80, last = 0x00AFF, otf="gujr",  description = "Gujarati" },
344    ["gunjalagondi"]                                = { first = 0x11D60, last = 0x11DAF,              description = "Gunjala Gondi" },
345    ["gurmukhi"]                                    = { first = 0x00A00, last = 0x00A7F, otf="guru",  description = "Gurmukhi" },
346    ["halfwidthandfullwidthforms"]                  = { first = 0x0FF00, last = 0x0FFEF,              description = "Halfwidth and Fullwidth Forms" },
347    ["hangulcompatibilityjamo"]                     = { first = 0x03130, last = 0x0318F, otf="jamo",  description = "Hangul Compatibility Jamo" },
348    ["hanguljamo"]                                  = { first = 0x01100, last = 0x011FF, otf="jamo",  description = "Hangul Jamo" },
349    ["hanguljamoextendeda"]                         = { first = 0x0A960, last = 0x0A97F,              description = "Hangul Jamo Extended-A" },
350    ["hanguljamoextendedb"]                         = { first = 0x0D7B0, last = 0x0D7FF,              description = "Hangul Jamo Extended-B" },
351    ["hangulsyllables"]                             = { first = 0x0AC00, last = 0x0D7AF, otf="hang",  description = "Hangul Syllables" },
352    ["hanifirohingya"]                              = { first = 0x10D00, last = 0x10D3F,              description = "Hanifi Rohingya" },
353    ["hanunoo"]                                     = { first = 0x01720, last = 0x0173F, otf="hano",  description = "Hanunoo" },
354    ["hatran"]                                      = { first = 0x108E0, last = 0x108FF,              description = "Hatran" },
355    ["hebrew"]                                      = { first = 0x00590, last = 0x005FF, otf="hebr",  description = "Hebrew" },
356    ["highprivateusesurrogates"]                    = { first = 0x0DB80, last = 0x0DBFF,              description = "High Private Use Surrogates" },
357    ["highsurrogates"]                              = { first = 0x0D800, last = 0x0DB7F,              description = "High Surrogates" },
358    ["hiragana"]                                    = { first = 0x03040, last = 0x0309F, otf="kana",  description = "Hiragana" },
359    ["ideographicdescriptioncharacters"]            = { first = 0x02FF0, last = 0x02FFF,              description = "Ideographic Description Characters" },
360    ["ideographicsymbolsandpunctuation"]            = { first = 0x16FE0, last = 0x16FFF,              description = "Ideographic Symbols and Punctuation" },
361    ["imperialaramaic"]                             = { first = 0x10840, last = 0x1085F,              description = "Imperial Aramaic" },
362    ["indicsiyaqnumbers"]                           = { first = 0x1EC70, last = 0x1ECBF,              description = "Indic Siyaq Numbers" },
363    ["inscriptionalpahlavi"]                        = { first = 0x10B60, last = 0x10B7F,              description = "Inscriptional Pahlavi" },
364    ["inscriptionalparthian"]                       = { first = 0x10B40, last = 0x10B5F,              description = "Inscriptional Parthian" },
365    ["ipaextensions"]                               = { first = 0x00250, last = 0x002AF,              description = "IPA Extensions" },
366    ["javanese"]                                    = { first = 0x0A980, last = 0x0A9DF,              description = "Javanese" },
367    ["kaithi"]                                      = { first = 0x11080, last = 0x110CF,              description = "Kaithi" },
368    ["kanaextendeda"]                               = { first = 0x1B100, last = 0x1B12F,              description = "Kana Extended-A" },
369    ["kanaextendedb"]                               = { first = 0x1AFF0, last = 0x1AFFF,              description = "Kana Extended-B" },
370    ["kanasupplement"]                              = { first = 0x1B000, last = 0x1B0FF,              description = "Kana Supplement" },
371    ["kanbun"]                                      = { first = 0x03190, last = 0x0319F,              description = "Kanbun" },
372    ["kangxiradicals"]                              = { first = 0x02F00, last = 0x02FDF,              description = "Kangxi Radicals" },
373    ["kannada"]                                     = { first = 0x00C80, last = 0x00CFF, otf="knda",  description = "Kannada" },
374    ["katakana"]                                    = { first = 0x030A0, last = 0x030FF, otf="kana",  description = "Katakana" },
375    ["katakanaphoneticextensions"]                  = { first = 0x031F0, last = 0x031FF, otf="kana",  description = "Katakana Phonetic Extensions" },
376    ["kayahli"]                                     = { first = 0x0A900, last = 0x0A92F,              description = "Kayah Li" },
377    ["kharoshthi"]                                  = { first = 0x10A00, last = 0x10A5F, otf="khar",  description = "Kharoshthi" },
378    ["khitansmallscript"]                           = { first = 0x18B00, last = 0x18CFF,              description = "Khitan Small Script" },
379    ["khmer"]                                       = { first = 0x01780, last = 0x017FF, otf="khmr",  description = "Khmer" },
380    ["khmersymbols"]                                = { first = 0x019E0, last = 0x019FF, otf="khmr",  description = "Khmer Symbols" },
381    ["khojki"]                                      = { first = 0x11200, last = 0x1124F,              description = "Khojki" },
382    ["khudawadi"]                                   = { first = 0x112B0, last = 0x112FF,              description = "Khudawadi" },
383    ["lao"]                                         = { first = 0x00E80, last = 0x00EFF, otf="lao",   description = "Lao" },
384    ["latinextendeda"]                              = { first = 0x00100, last = 0x0017F, otf="latn",  description = "Latin Extended-A" },
385    ["latinextendedadditional"]                     = { first = 0x01E00, last = 0x01EFF, otf="latn",  description = "Latin Extended Additional" },
386    ["latinextendedb"]                              = { first = 0x00180, last = 0x0024F, otf="latn",  description = "Latin Extended-B" },
387    ["latinextendedc"]                              = { first = 0x02C60, last = 0x02C7F, otf="latn",  description = "Latin Extended-C" },
388    ["latinextendedd"]                              = { first = 0x0A720, last = 0x0A7FF, otf="latn",  description = "Latin Extended-D" },
389    ["latinextendede"]                              = { first = 0x0AB30, last = 0x0AB6F,              description = "Latin Extended-E" },
390    ["latinextendedf"]                              = { first = 0x10780, last = 0x107BF,              description = "Latin Extended-F" },
391    ["latinextendedg"]                              = { first = 0x1DF00, last = 0x1DFFF,              description = "Latin Extended-G" },
392    ["latinsupplement"]                             = { first = 0x00080, last = 0x000FF, otf="latn",  description = "Latin-1 Supplement" },
393    ["lepcha"]                                      = { first = 0x01C00, last = 0x01C4F,              description = "Lepcha" },
394    ["letterlikesymbols"]                           = { first = 0x02100, last = 0x0214F, math = true, description = "Letterlike Symbols" },
395    ["limbu"]                                       = { first = 0x01900, last = 0x0194F, otf="limb",  description = "Limbu" },
396    ["lineara"]                                     = { first = 0x10600, last = 0x1077F,              description = "Linear A" },
397    ["linearbideograms"]                            = { first = 0x10080, last = 0x100FF, otf="linb",  description = "Linear B Ideograms" },
398    ["linearbsyllabary"]                            = { first = 0x10000, last = 0x1007F, otf="linb",  description = "Linear B Syllabary" },
399    ["lisu"]                                        = { first = 0x0A4D0, last = 0x0A4FF,              description = "Lisu" },
400    ["lisusupplement"]                              = { first = 0x11FB0, last = 0x11FBF,              description = "Lisu Supplement" },
401    ["lowercasebold"]                               = { first = 0x1D41A, last = 0x1D433, math = true },
402    ["lowercaseboldfraktur"]                        = { first = 0x1D586, last = 0x1D59F, math = true },
403    ["lowercasebolditalic"]                         = { first = 0x1D482, last = 0x1D49B, math = true },
404    ["lowercaseboldscript"]                         = { first = 0x1D4EA, last = 0x1D503, math = true },
405    ["lowercasedoublestruck"]                       = { first = 0x1D552, last = 0x1D56B, math = true },
406    ["lowercasefraktur"]                            = { first = 0x1D51E, last = 0x1D537, math = true },
407    ["lowercasegreekbold"]                          = { first = 0x1D6C2, last = 0x1D6DB, math = true },
408    ["lowercasegreekbolditalic"]                    = { first = 0x1D736, last = 0x1D74F, math = true },
409    ["lowercasegreekitalic"]                        = { first = 0x1D6FC, last = 0x1D715, math = true },
410    ["lowercasegreeknormal"]                        = { first = 0x003B1, last = 0x003CA, math = true },
411    ["lowercasegreeksansserifbold"]                 = { first = 0x1D770, last = 0x1D789, math = true },
412    ["lowercasegreeksansserifbolditalic"]           = { first = 0x1D7AA, last = 0x1D7C3, math = true },
413    ["lowercaseitalic"]                             = { first = 0x1D44E, last = 0x1D467, math = true },
414    ["lowercasemonospace"]                          = { first = 0x1D68A, last = 0x1D6A3, math = true },
415    ["lowercasenormal"]                             = { first = 0x00061, last = 0x0007A, math = true },
416    ["lowercasesansserifbold"]                      = { first = 0x1D5EE, last = 0x1D607, math = true },
417    ["lowercasesansserifbolditalic"]                = { first = 0x1D656, last = 0x1D66F, math = true },
418    ["lowercasesansserifitalic"]                    = { first = 0x1D622, last = 0x1D63B, math = true },
419    ["lowercasesansserifnormal"]                    = { first = 0x1D5BA, last = 0x1D5D3, math = true },
420    ["lowercasescript"]                             = { first = 0x1D4B6, last = 0x1D4CF, math = true },
421    ["lowsurrogates"]                               = { first = 0x0DC00, last = 0x0DFFF,              description = "Low Surrogates" },
422    ["lycian"]                                      = { first = 0x10280, last = 0x1029F,              description = "Lycian" },
423    ["lydian"]                                      = { first = 0x10920, last = 0x1093F,              description = "Lydian" },
424    ["mahajani"]                                    = { first = 0x11150, last = 0x1117F,              description = "Mahajani" },
425    ["mahjongtiles"]                                = { first = 0x1F000, last = 0x1F02F,              description = "Mahjong Tiles" },
426    ["makasar"]                                     = { first = 0x11EE0, last = 0x11EFF,              description = "Makasar" },
427    ["malayalam"]                                   = { first = 0x00D00, last = 0x00D7F, otf="mlym",  description = "Malayalam" },
428    ["mandaic"]                                     = { first = 0x00840, last = 0x0085F, otf="mand",  description = "Mandaic" },
429    ["manichaean"]                                  = { first = 0x10AC0, last = 0x10AFF,              description = "Manichaean" },
430    ["marchen"]                                     = { first = 0x11C70, last = 0x11CBF,              description = "Marchen" },
431    ["masaramgondi"]                                = { first = 0x11D00, last = 0x11D5F,              description = "Masaram Gondi" },
432    ["mathematicalalphanumericsymbols"]             = { first = 0x1D400, last = 0x1D7FF, math = true, description = "Mathematical Alphanumeric Symbols" },
433    ["mathematicaloperators"]                       = { first = 0x02200, last = 0x022FF, math = true, description = "Mathematical Operators" },
434    ["mayannumerals"]                               = { first = 0x1D2E0, last = 0x1D2FF,              description = "Mayan Numerals" },
435    ["medefaidrin"]                                 = { first = 0x16E40, last = 0x16E9F,              description = "Medefaidrin" },
436    ["meeteimayek"]                                 = { first = 0x0ABC0, last = 0x0ABFF,              description = "Meetei Mayek" },
437    ["meeteimayekextensions"]                       = { first = 0x0AAE0, last = 0x0AAFF,              description = "Meetei Mayek Extensions" },
438    ["mendekikakui"]                                = { first = 0x1E800, last = 0x1E8DF,              description = "Mende Kikakui" },
439    ["meroiticcursive"]                             = { first = 0x109A0, last = 0x109FF,              description = "Meroitic Cursive" },
440    ["meroitichieroglyphs"]                         = { first = 0x10980, last = 0x1099F,              description = "Meroitic Hieroglyphs" },
441    ["miao"]                                        = { first = 0x16F00, last = 0x16F9F,              description = "Miao" },
442    ["miscellaneousmathematicalsymbolsa"]           = { first = 0x027C0, last = 0x027EF, math = true, description = "Miscellaneous Mathematical Symbols-A" },
443    ["miscellaneousmathematicalsymbolsb"]           = { first = 0x02980, last = 0x029FF, math = true, description = "Miscellaneous Mathematical Symbols-B" },
444    ["miscellaneoussymbols"]                        = { first = 0x02600, last = 0x026FF, math = true, description = "Miscellaneous Symbols" },
445    ["miscellaneoussymbolsandarrows"]               = { first = 0x02B00, last = 0x02BFF, math = true, description = "Miscellaneous Symbols and Arrows" },
446    ["miscellaneoussymbolsandpictographs"]          = { first = 0x1F300, last = 0x1F5FF,              description = "Miscellaneous Symbols and Pictographs" },
447    ["miscellaneoustechnical"]                      = { first = 0x02300, last = 0x023FF, math = true, description = "Miscellaneous Technical" },
448    ["modi"]                                        = { first = 0x11600, last = 0x1165F,              description = "Modi" },
449    ["modifiertoneletters"]                         = { first = 0x0A700, last = 0x0A71F,              description = "Modifier Tone Letters" },
450    ["mongolian"]                                   = { first = 0x01800, last = 0x018AF, otf="mong",  description = "Mongolian" },
451    ["mongoliansupplement"]                         = { first = 0x11660, last = 0x1167F,              description = "Mongolian Supplement" },
452    ["mro"]                                         = { first = 0x16A40, last = 0x16A6F,              description = "Mro" },
453    ["multani"]                                     = { first = 0x11280, last = 0x112AF,              description = "Multani" },
454    ["musicalsymbols"]                              = { first = 0x1D100, last = 0x1D1FF, otf="musc",  description = "Musical Symbols" },
455    ["myanmar"]                                     = { first = 0x01000, last = 0x0109F, otf="mymr",  description = "Myanmar" },
456    ["myanmarextendeda"]                            = { first = 0x0AA60, last = 0x0AA7F,              description = "Myanmar Extended-A" },
457    ["myanmarextendedb"]                            = { first = 0x0A9E0, last = 0x0A9FF,              description = "Myanmar Extended-B" },
458    ["nabataean"]                                   = { first = 0x10880, last = 0x108AF,              description = "Nabataean" },
459    ["nandinagari"]                                 = { first = 0x119A0, last = 0x119FF,              description = "Nandinagari" },
460    ["newa"]                                        = { first = 0x11400, last = 0x1147F,              description = "Newa" },
461    ["newtailue"]                                   = { first = 0x01980, last = 0x019DF,              description = "New Tai Lue" },
462    ["nko"]                                         = { first = 0x007C0, last = 0x007FF, otf="nko",   description = "NKo" },
463    ["numberforms"]                                 = { first = 0x02150, last = 0x0218F,              description = "Number Forms" },
464    ["nushu"]                                       = { first = 0x1B170, last = 0x1B2FF,              description = "Nushu" },
465    ["nyiakengpuachuehmong"]                        = { first = 0x1E100, last = 0x1E14F,              description = "Nyiakeng Puachue Hmong" },
466    ["ogham"]                                       = { first = 0x01680, last = 0x0169F, otf="ogam",  description = "Ogham" },
467    ["olchiki"]                                     = { first = 0x01C50, last = 0x01C7F,              description = "Ol Chiki" },
468    ["oldhungarian"]                                = { first = 0x10C80, last = 0x10CFF,              description = "Old Hungarian" },
469    ["olditalic"]                                   = { first = 0x10300, last = 0x1032F, otf="ital",  description = "Old Italic" },
470    ["oldnortharabian"]                             = { first = 0x10A80, last = 0x10A9F,              description = "Old North Arabian" },
471    ["oldpermic"]                                   = { first = 0x10350, last = 0x1037F,              description = "Old Permic" },
472    ["oldpersian"]                                  = { first = 0x103A0, last = 0x103DF, otf="xpeo",  description = "Old Persian" },
473    ["oldsogdian"]                                  = { first = 0x10F00, last = 0x10F2F,              description = "Old Sogdian" },
474    ["oldsoutharabian"]                             = { first = 0x10A60, last = 0x10A7F,              description = "Old South Arabian" },
475    ["oldturkic"]                                   = { first = 0x10C00, last = 0x10C4F,              description = "Old Turkic" },
476    ["olduyghur"]                                   = { first = 0x10F70, last = 0x10FAF,              description = "Old Uyghur" },
477    ["opticalcharacterrecognition"]                 = { first = 0x02440, last = 0x0245F,              description = "Optical Character Recognition" },
478    ["oriya"]                                       = { first = 0x00B00, last = 0x00B7F, otf="orya",  description = "Oriya" },
479    ["ornamentaldingbats"]                          = { first = 0x1F650, last = 0x1F67F,              description = "Ornamental Dingbats" },
480    ["osage"]                                       = { first = 0x104B0, last = 0x104FF,              description = "Osage" },
481    ["osmanya"]                                     = { first = 0x10480, last = 0x104AF, otf="osma",  description = "Osmanya" },
482    ["ottomansiyaqnumbers"]                         = { first = 0x1ED00, last = 0x1ED4F,              description = "Ottoman Siyaq Numbers" },
483    ["pahawhhmong"]                                 = { first = 0x16B00, last = 0x16B8F,              description = "Pahawh Hmong" },
484    ["palmyrene"]                                   = { first = 0x10860, last = 0x1087F,              description = "Palmyrene" },
485    ["paucinhau"]                                   = { first = 0x11AC0, last = 0x11AFF,              description = "Pau Cin Hau" },
486    ["phagspa"]                                     = { first = 0x0A840, last = 0x0A87F, otf="phag",  description = "Phags-pa" },
487    ["phaistosdisc"]                                = { first = 0x101D0, last = 0x101FF,              description = "Phaistos Disc" },
488    ["phoenician"]                                  = { first = 0x10900, last = 0x1091F, otf="phnx",  description = "Phoenician" },
489    ["phoneticextensions"]                          = { first = 0x01D00, last = 0x01D7F,              description = "Phonetic Extensions" },
490    ["phoneticextensionssupplement"]                = { first = 0x01D80, last = 0x01DBF,              description = "Phonetic Extensions Supplement" },
491    ["playingcards"]                                = { first = 0x1F0A0, last = 0x1F0FF,              description = "Playing Cards" },
492    ["privateusearea"]                              = { first = 0x0E000, last = 0x0F8FF,              description = "Private Use Area" },
493    ["psalterpahlavi"]                              = { first = 0x10B80, last = 0x10BAF,              description = "Psalter Pahlavi" },
494    ["rejang"]                                      = { first = 0x0A930, last = 0x0A95F,              description = "Rejang" },
495    ["ruminumeralsymbols"]                          = { first = 0x10E60, last = 0x10E7F,              description = "Rumi Numeral Symbols" },
496    ["runic"]                                       = { first = 0x016A0, last = 0x016FF, otf="runr",  description = "Runic" },
497    ["samaritan"]                                   = { first = 0x00800, last = 0x0083F,              description = "Samaritan" },
498    ["saurashtra"]                                  = { first = 0x0A880, last = 0x0A8DF,              description = "Saurashtra" },
499    ["sharada"]                                     = { first = 0x11180, last = 0x111DF,              description = "Sharada" },
500    ["shavian"]                                     = { first = 0x10450, last = 0x1047F, otf="shaw",  description = "Shavian" },
501    ["shorthandformatcontrols"]                     = { first = 0x1BCA0, last = 0x1BCAF,              description = "Shorthand Format Controls" },
502    ["siddham"]                                     = { first = 0x11580, last = 0x115FF,              description = "Siddham" },
503    ["sinhala"]                                     = { first = 0x00D80, last = 0x00DFF, otf="sinh",  description = "Sinhala" },
504    ["sinhalaarchaicnumbers"]                       = { first = 0x111E0, last = 0x111FF,              description = "Sinhala Archaic Numbers" },
505    ["smallformvariants"]                           = { first = 0x0FE50, last = 0x0FE6F,              description = "Small Form Variants" },
506    ["smallkanaextension"]                          = { first = 0x1B130, last = 0x1B16F,              description = "Small Kana Extension" },
507    ["sogdian"]                                     = { first = 0x10F30, last = 0x10F6F,              description = "Sogdian" },
508    ["sorasompeng"]                                 = { first = 0x110D0, last = 0x110FF,              description = "Sora Sompeng" },
509    ["soyombo"]                                     = { first = 0x11A50, last = 0x11AAF,              description = "Soyombo" },
510    ["spacingmodifierletters"]                      = { first = 0x002B0, last = 0x002FF,              description = "Spacing Modifier Letters" },
511    ["specials"]                                    = { first = 0x0FFF0, last = 0x0FFFF,              description = "Specials" },
512    ["sundanese"]                                   = { first = 0x01B80, last = 0x01BBF,              description = "Sundanese" },
513    ["sundanesesupplement"]                         = { first = 0x01CC0, last = 0x01CCF,              description = "Sundanese Supplement" },
514    ["superscriptsandsubscripts"]                   = { first = 0x02070, last = 0x0209F,              description = "Superscripts and Subscripts" },
515    ["supplementalarrowsa"]                         = { first = 0x027F0, last = 0x027FF, math = true, description = "Supplemental Arrows-A" },
516    ["supplementalarrowsb"]                         = { first = 0x02900, last = 0x0297F, math = true, description = "Supplemental Arrows-B" },
517    ["supplementalarrowsc"]                         = { first = 0x1F800, last = 0x1F8FF, math = true, description = "Supplemental Arrows-C" },
518    ["supplementalmathematicaloperators"]           = { first = 0x02A00, last = 0x02AFF, math = true, description = "Supplemental Mathematical Operators" },
519    ["supplementalpunctuation"]                     = { first = 0x02E00, last = 0x02E7F,              description = "Supplemental Punctuation" },
520    ["supplementalsymbolsandpictographs"]           = { first = 0x1F900, last = 0x1F9FF,              description = "Supplemental Symbols and Pictographs" },
521    ["supplementaryprivateuseareaa"]                = { first = 0xF0000, last = 0xFFFFF,              description = "Supplementary Private Use Area-A" },
522    ["supplementaryprivateuseareab"]                = { first = 0x100000,last = 0x10FFFF,             description = "Supplementary Private Use Area-B" },
523    ["suttonsignwriting"]                           = { first = 0x1D800, last = 0x1DAAF,              description = "Sutton SignWriting" },
524    ["sylotinagri"]                                 = { first = 0x0A800, last = 0x0A82F, otf="sylo",  description = "Syloti Nagri" },
525    ["symbolsandpictographsextendeda"]              = { first = 0x1FA70, last = 0x1FAFF,              description = "Symbols and Pictographs Extended-A" },
526    ["symbolsforlegacycomputing"]                   = { first = 0x1FB00, last = 0x1FBFF,              description = "Symbols for Legacy Computing" },
527    ["syriac"]                                      = { first = 0x00700, last = 0x0074F, otf="syrc",  description = "Syriac" },
528    ["syriacsupplement"]                            = { first = 0x00860, last = 0x0086F,              description = "Syriac Supplement" },
529    ["tagalog"]                                     = { first = 0x01700, last = 0x0171F, otf="tglg",  description = "Tagalog" },
530    ["tagbanwa"]                                    = { first = 0x01760, last = 0x0177F, otf="tagb",  description = "Tagbanwa" },
531    ["tags"]                                        = { first = 0xE0000, last = 0xE007F,              description = "Tags" },
532    ["taile"]                                       = { first = 0x01950, last = 0x0197F, otf="tale",  description = "Tai Le" },
533    ["taitham"]                                     = { first = 0x01A20, last = 0x01AAF,              description = "Tai Tham" },
534    ["taiviet"]                                     = { first = 0x0AA80, last = 0x0AADF,              description = "Tai Viet" },
535    ["taixuanjingsymbols"]                          = { first = 0x1D300, last = 0x1D35F,              description = "Tai Xuan Jing Symbols" },
536    ["takri"]                                       = { first = 0x11680, last = 0x116CF,              description = "Takri" },
537    ["tamil"]                                       = { first = 0x00B80, last = 0x00BFF, otf="taml",  description = "Tamil" },
538    ["tamilsupplement"]                             = { first = 0x11FC0, last = 0x11FFF,              description = "Tamil Supplement" },
539    ["tangut"]                                      = { first = 0x17000, last = 0x187FF,              description = "Tangut" },
540    ["tangutsupplement"]                            = { first = 0x18D00, last = 0x18D7F,              description = "Tangut Supplement" },
541    ["tangutcomponents"]                            = { first = 0x18800, last = 0x18AFF,              description = "Tangut Components" },
542    ["tangsa"]                                      = { first = 0x16A70, last = 0x16ACF,              description = "Tangsa" },
543    ["telugu"]                                      = { first = 0x00C00, last = 0x00C7F, otf="telu",  description = "Telugu" },
544    ["thaana"]                                      = { first = 0x00780, last = 0x007BF, otf="thaa",  description = "Thaana" },
545    ["thai"]                                        = { first = 0x00E00, last = 0x00E7F, otf="thai",  description = "Thai" },
546    ["tibetan"]                                     = { first = 0x00F00, last = 0x00FFF, otf="tibt",  description = "Tibetan" },
547    ["tifinagh"]                                    = { first = 0x02D30, last = 0x02D7F, otf="tfng",  description = "Tifinagh" },
548    ["tirhuta"]                                     = { first = 0x11480, last = 0x114DF,              description = "Tirhuta" },
549    ["toto"]                                        = { first = 0x1E290, last = 0x1E2BF,              description = "Toto" },
550    ["transportandmapsymbols"]                      = { first = 0x1F680, last = 0x1F6FF,              description = "Transport and Map Symbols" },
551    ["ugaritic"]                                    = { first = 0x10380, last = 0x1039F, otf="ugar",  description = "Ugaritic" },
552    ["unifiedcanadianaboriginalsyllabics"]          = { first = 0x01400, last = 0x0167F, otf="cans",  description = "Unified Canadian Aboriginal Syllabics" },
553    ["unifiedcanadianaboriginalsyllabicsextended"]  = { first = 0x018B0, last = 0x018FF,              description = "Unified Canadian Aboriginal Syllabics Extended" },
554    ["unifiedcanadianaboriginalsyllabicsextendeda"] = { first = 0x11AB0, last = 0x11ABF,              description = "Unified Canadian Aboriginal Syllabics Extended-A" },
555    ["uppercasebold"]                               = { first = 0x1D400, last = 0x1D419, math = true },
556    ["uppercaseboldfraktur"]                        = { first = 0x1D56C, last = 0x1D585, math = true },
557    ["uppercasebolditalic"]                         = { first = 0x1D468, last = 0x1D481, math = true },
558    ["uppercaseboldscript"]                         = { first = 0x1D4D0, last = 0x1D4E9, math = true },
559    ["uppercasedoublestruck"]                       = { first = 0x1D538, last = 0x1D551, math = true }, -- gaps are filled in elsewhere
560    ["uppercasefraktur"]                            = { first = 0x1D504, last = 0x1D51D, math = true },
561    ["uppercasegreekbold"]                          = { first = 0x1D6A8, last = 0x1D6C1, math = true },
562    ["uppercasegreekbolditalic"]                    = { first = 0x1D71C, last = 0x1D735, math = true },
563    ["uppercasegreekitalic"]                        = { first = 0x1D6E2, last = 0x1D6FB, math = true },
564    ["uppercasegreeknormal"]                        = { first = 0x00391, last = 0x003AA, math = true },
565    ["uppercasegreeksansserifbold"]                 = { first = 0x1D756, last = 0x1D76F, math = true },
566    ["uppercasegreeksansserifbolditalic"]           = { first = 0x1D790, last = 0x1D7A9, math = true },
567    ["uppercaseitalic"]                             = { first = 0x1D434, last = 0x1D44D, math = true },
568    ["uppercasemonospace"]                          = { first = 0x1D670, last = 0x1D689, math = true },
569    ["uppercasenormal"]                             = { first = 0x00041, last = 0x0005A, math = true },
570    ["uppercasesansserifbold"]                      = { first = 0x1D5D4, last = 0x1D5ED, math = true },
571    ["uppercasesansserifbolditalic"]                = { first = 0x1D63C, last = 0x1D655, math = true },
572    ["uppercasesansserifitalic"]                    = { first = 0x1D608, last = 0x1D621, math = true },
573    ["uppercasesansserifnormal"]                    = { first = 0x1D5A0, last = 0x1D5B9, math = true },
574    ["uppercasescript"]                             = { first = 0x1D49C, last = 0x1D4B5, math = true },
575    ["vai"]                                         = { first = 0x0A500, last = 0x0A63F,              description = "Vai" },
576    ["variationselectors"]                          = { first = 0x0FE00, last = 0x0FE0F,              description = "Variation Selectors" },
577    ["variationselectorssupplement"]                = { first = 0xE0100, last = 0xE01EF,              description = "Variation Selectors Supplement" },
578    ["vedicextensions"]                             = { first = 0x01CD0, last = 0x01CFF,              description = "Vedic Extensions" },
579    ["verticalforms"]                               = { first = 0x0FE10, last = 0x0FE1F,              description = "Vertical Forms" },
580    ["vithkuqi"]                                    = { first = 0x10570, last = 0x105BF,              description = "Vithkuqi" },
581    ["wancho"]                                      = { first = 0x1E2C0, last = 0x1E2FF,              description = "Wancho" },
582    ["warangciti"]                                  = { first = 0x118A0, last = 0x118FF,              description = "Warang Citi" },
583    ["yezidi"]                                      = { first = 0x10E80, last = 0x10EBF,              description = "Yezidi" },
584    ["yijinghexagramsymbols"]                       = { first = 0x04DC0, last = 0x04DFF, otf="yi",    description = "Yijing Hexagram Symbols" },
585    ["yiradicals"]                                  = { first = 0x0A490, last = 0x0A4CF, otf="yi",    description = "Yi Radicals" },
586    ["yisyllables"]                                 = { first = 0x0A000, last = 0x0A48F, otf="yi",    description = "Yi Syllables" },
587    ["zanabazarsquare"]                             = { first = 0x11A00, last = 0x11A4F,              description = "Zanabazar Square" },
588    ["znamennymusicalnotation"]                     = { first = 0x1CF00, last = 0x1CFCF,              description = "Znamenny Musical Notation" }
589}
590
591-- moved from math-act.lua to here:
592
593-- operators    : 0x02200
594-- symbolsa     : 0x02701
595-- symbolsb     : 0x02901
596-- supplemental : 0x02A00
597
-- A few slots inside the math alphabet ranges are redirected to code points in
-- the letterlike symbols range (0x2100-0x214F). Each gaps table maps the slot
-- in the alphabet onto the code point that has to be used instead.

do

    local gaps = {
        lowercaseitalic = {
            [0x1D455] = 0x0210E, -- ℎ h
        },
        uppercasescript = {
            [0x1D49D] = 0x0212C, -- ℬ script B
            [0x1D4A0] = 0x02130, -- ℰ script E
            [0x1D4A1] = 0x02131, -- ℱ script F
            [0x1D4A3] = 0x0210B, -- ℋ script H
            [0x1D4A4] = 0x02110, -- ℐ script I
            [0x1D4A7] = 0x02112, -- ℒ script L
            [0x1D4A8] = 0x02133, -- ℳ script M
            [0x1D4AD] = 0x0211B, -- ℛ script R
        },
        lowercasescript = {
            [0x1D4BA] = 0x0212F, -- ℯ script e
            [0x1D4BC] = 0x0210A, -- ℊ script g
            [0x1D4C4] = 0x02134, -- ℴ script o
        },
        uppercasefraktur = {
            [0x1D506] = 0x0212D, -- ℭ fraktur C
            [0x1D50B] = 0x0210C, -- ℌ fraktur H
            [0x1D50C] = 0x02111, -- ℑ fraktur I
            [0x1D515] = 0x0211C, -- ℜ fraktur R
            [0x1D51D] = 0x02128, -- ℨ fraktur Z
        },
        uppercasedoublestruck = {
            [0x1D53A] = 0x02102, -- ℂ bb C
            [0x1D53F] = 0x0210D, -- ℍ bb H
            [0x1D545] = 0x02115, -- ℕ bb N
            [0x1D547] = 0x02119, -- ℙ bb P
            [0x1D548] = 0x0211A, -- ℚ bb Q
            [0x1D549] = 0x0211D, -- ℝ bb R
            [0x1D551] = 0x02124, -- ℤ bb Z
        },
    }

    -- attach each list to the block of the same name
    for name, list in next, gaps do
        blocks[name].gaps = list
    end

end

characters.blocks = blocks
638
-- Returns the first and last code point of the block registered under the
-- given name. When there is no such block both values are zero.

function characters.blockrange(name)
    local block = blocks[name]
    if not block then
        return 0, 0
    end
    return block.first, block.last
end
647
-- Fallback for names that are not a key themselves: everything that is not an
-- ascii letter is dropped and the result is lowercased before the raw lookup,
-- so for instance "Latin Extended-A" ends up at the latinextendeda entry. A
-- key that is nil or false is handed back untouched.

setmetatableindex(blocks, function(t,k) -- we could use an intermediate table if called often
    if not k then
        return k
    end
    local normalized = lower((gsub(k,"[^a-zA-Z]","")))
    return rawget(t,normalized)
end)
651
local otfscripts      = utilities.storage.allocate()
characters.otfscripts = otfscripts

-- Lazy lookup of the otf script tag that belongs to a code point. The blocks
-- are scanned for a range that contains the code point; on a hit the tag of
-- that block (or "dflt" when the block has none) is stored for every slot in
-- the block, so that later lookups in the same block don't end up here. A code
-- point that is in no block at all gets "dflt" for just that slot.
--
-- NOTE(review): ranges in the blocks table can overlap (the math alphabets sit
-- inside mathematicalalphanumericsymbols) and the traversal order of next is
-- unspecified, so for a code point that is covered by several blocks the first
-- one that is visited wins -- confirm that this is acceptable.

setmetatableindex(otfscripts,function(cache,unicode)
    for _, block in next, blocks do
        local first, last = block.first, block.last
        if first <= unicode and unicode <= last then
            local script = block.otf
            if not script then
                script = "dflt"
            end
            for slot=first,last do
                cache[slot] = script
            end
            return script
        end
    end
    -- pretty slow when we're here
    cache[unicode] = "dflt"
    return "dflt"
end)
671
local splitter1 = lpeg.splitat(S(":-"))       -- first:last or first-last
local splitter2 = lpeg.splitat(S(" +-"),true) -- blockname followed by an offset expression

-- Resolves a block name, a single slot or a range specification into code
-- points. This is used in font fallback definitions (name or range).
--
-- name       : one of
--                - a block name; it is compared after dropping everything that
--                  is not an ascii letter or digit and lowercasing the result
--                - a single number or a first:last / first-last pair; numbers
--                  are tried as hexadecimal first and then in the default
--                  notation, and a " is replaced by 0x (tex hex notation)
--                - only when expression is set: a number in default notation,
--                  or a block name followed by an offset, as in
--                  "lowercaseitalic + 123"
-- expression : when set, the number and block-plus-offset forms are tried
--              before the range forms
--
-- returns    : first, last, description, gaps  for a plain block name
--              first, last, nil                in all other cases; both numbers
--                                              are nil when nothing matched
--
-- NOTE(review): the character that splitter2 splits on seems to be dropped from
-- the remainder, so the operator has to be preceded by a space ("block + 1" and
-- not "block+1") -- confirm against lpeg.splitat.

function characters.getrange(name,expression) -- used in font fallback definitions (name or range)
    local range = rawget(blocks,lower(gsub(name,"[^a-zA-Z0-9]","")))
    if range then
        return range.first, range.last, range.description, range.gaps
    end
    name = gsub(name,'"',"0x") -- goodie: tex hex notation
    if expression then
        local n = tonumber(name)
        if n then
            return n, n, nil
        end
        local first, rest = lpegmatch(splitter2,name)
        if first and rest then
            local base = rawget(blocks,lower(gsub(first,"[^a-zA-Z0-9]","")))
            if base then
                -- beware: the remainder is compiled and run as lua code, so
                -- this should only be fed from trusted definitions
                local s = loadstring("return 0 " .. rest)
                if type(s) == "function" then
                    -- a runtime error in the expression is treated like a
                    -- syntax error: we just try the other notations
                    local okay, d = pcall(s)
                    if okay and type(d) == "number" then
                        return base.first + d, base.last + d, nil
                    end
                end
            end
        end
    end
    local start, stop = lpegmatch(splitter1,name)
    if start and stop then
        start = tonumber(start,16) or tonumber(start)
        stop  = tonumber(stop, 16) or tonumber(stop)
        if start and stop then
            return start, stop, nil
        end
    end
    local slot = tonumber(name,16) or tonumber(name)
    return slot, slot, nil
end
711
712-- print(characters.getrange("lowercaseitalic + 123",true))
713-- print(characters.getrange("lowercaseitalic + 124",true))
714
-- Descriptions for the two letter (lowercased) category tags.

local categorytags = allocate {
    -- letters
    ["lu"] = "Letter Uppercase",
    ["ll"] = "Letter Lowercase",
    ["lt"] = "Letter Titlecase",
    ["lm"] = "Letter Modifier",
    ["lo"] = "Letter Other",
    -- marks
    ["mn"] = "Mark Nonspacing",
    ["mc"] = "Mark Spacing Combining",
    ["me"] = "Mark Enclosing",
    -- numbers
    ["nd"] = "Number Decimal Digit",
    ["nl"] = "Number Letter",
    ["no"] = "Number Other",
    -- punctuation
    ["pc"] = "Punctuation Connector",
    ["pd"] = "Punctuation Dash",
    ["ps"] = "Punctuation Open",
    ["pe"] = "Punctuation Close",
    ["pi"] = "Punctuation Initial Quote",
    ["pf"] = "Punctuation Final Quote",
    ["po"] = "Punctuation Other",
    -- symbols
    ["sm"] = "Symbol Math",
    ["sc"] = "Symbol Currency",
    ["sk"] = "Symbol Modifier",
    ["so"] = "Symbol Other",
    -- separators
    ["zs"] = "Separator Space",
    ["zl"] = "Separator Line",
    ["zp"] = "Separator Paragraph",
    -- others
    ["cc"] = "Other Control",
    ["cf"] = "Other Format",
    ["cs"] = "Other Surrogate",
    ["co"] = "Other Private Use",
    ["cn"] = "Other Not Assigned",
}

-- Descriptions for the two letter detail tags.

local detailtags = allocate {
    ["sl"] = "small letter",
    ["bl"] = "big letter",
    ["im"] = "iteration mark",
    ["pm"] = "prolonged sound mark",
}

characters.categorytags = categorytags
characters.detailtags   = detailtags
757
758-- sounds : voiced unvoiced semivoiced
759
760--~ special   : cf (softhyphen) zs (emspace)
761--~ characters: ll lm lo lt lu mn nl no pc pd pe pf pi po ps sc sk sm so
762
-- Sets of category tags, each one turned into a hash by tohash so that a tag
-- can be tested with a plain lookup.

local is_character = allocate ( tohash {
    "lu", "ll", "lt", "lm", "lo",             -- letters
    "nd", "nl", "no",                         -- numbers
    "mn",                                     -- nonspacing marks
    "nl", "no",                               -- (these two are listed a second time)
    "pc", "pd", "ps", "pe", "pi", "pf", "po", -- punctuation
    "sm", "sc", "sk", "so",                   -- symbols
} )

local is_letter = allocate ( tohash {
    "ll", "lm", "lo", "lt", "lu",
} )

local is_command = allocate ( tohash {
    "cf", "zs",
} )

local is_spacing = allocate ( tohash {
    "zs", "zl", "zp",
} )

-- NOTE(review): "ms" is not one of the tags in categorytags; going by the
-- trailing comment "mc" may have been meant here -- confirm before changing.

local is_mark = allocate ( tohash {
    "mn", "ms", -- "mn", "mc",
} )

local is_punctuation = allocate ( tohash {
    "pc", "pd", "ps", "pe", "pi", "pf", "po",
} )

local is_hyphenator = allocate ( tohash {
    "pd",
} )

local is_symbol = allocate ( tohash {
    "sm", "sc", "sk", "so",
} )

-- to be redone: store checked characters

characters.is_character   = is_character
characters.is_letter      = is_letter
characters.is_command     = is_command
characters.is_spacing     = is_spacing
characters.is_mark        = is_mark
characters.is_punctuation = is_punctuation
characters.is_hyphenator  = is_hyphenator
characters.is_symbol      = is_symbol
810
-- Shared index handler for the category sets. A numeric key is resolved via
-- the category field of its entry in the character data, so that a set can be
-- indexed with a number as well as with a category tag. Any other missing key
-- (and a number without entry or category) gives nil.

local mti = function(t,k)
    if type(k) == "number" then
        local d = data[k]
        local c = d and d.category -- guard: there might be no entry at all
        return c and rawget(t,c)   -- rawget: a missing tag must not recurse
    else
        -- avoid auto conversion in data.characters lookups
    end
end

-- All exported sets get the handler; is_mark and is_symbol used to be left
-- out, which made numeric lookups in those two always return nil.

setmetatableindex(characters.is_character,  mti)
setmetatableindex(characters.is_letter,     mti)
setmetatableindex(characters.is_command,    mti)
setmetatableindex(characters.is_spacing,    mti)
setmetatableindex(characters.is_mark,       mti)
setmetatableindex(characters.is_punctuation,mti)
setmetatableindex(characters.is_hyphenator, mti)
setmetatableindex(characters.is_symbol,     mti)
826
827-- todo: also define callers for the above
828
829-- linebreak: todo: hash
830--
831-- normative   : BK CR LF CM SG GL CB SP ZW NL WJ JL JV JT H2 H3
832-- informative : XX OP CL CP QU NS EX SY IS PR PO NU AL ID IN HY BB BA SA AI B2 HL CJ RI
833--
834-- comments taken from standard:
835
-- Maps the lowercase line break class tags onto a short description. The
-- trailing remarks are taken from the standard.

characters.linebreaks = allocate {

    -- classes that cannot be tailored

    bk  = "mandatory break",     -- nl, ps : cause a line break (after)
    cr  = "carriage return",     -- cr : cause a line break (after), except between cr and lf
    lf  = "line feed",           -- lf : cause a line break (after)
    cm  = "combining mark",      -- combining marks, control codes : prohibit a line break between the character and the preceding one
    nl  = "next line",           -- nel : cause a line break (after)
    sg  = "surrogate",           -- surrogates : do not occur in well-formed text
    wj  = "word joiner",         -- wj : prohibit line breaks before and after
    zw  = "zero width space",    -- zwsp : provide a break opportunity
    gl  = "non-breaking (glue)", -- cgj, nbsp, zwnbsp : prohibit line breaks before and after
    sp  = "space",               -- space : enable indirect line breaks
    zwj = "zero width joiner",   -- prohibit line breaks within joiner sequences

    -- classes that provide break opportunities

    b2 = "break opportunity before and after", -- em dash : opportunity before and after the character
    ba = "break after",                        -- spaces, hyphens : generally an opportunity after the character
    bb = "break before",                       -- punctuation used in dictionaries : generally an opportunity before the character
    hy = "hyphen",                             -- hyphen-minus : opportunity after the character, except in numeric context
    cb = "contingent break opportunity",       -- inline objects : opportunity contingent on additional information

    -- classes that prohibit certain breaks

    cl     = "close punctuation",         -- closing braces and brackets : prohibit line breaks before
    cp     = "close parenthesis",         -- ")" and "]" : prohibit line breaks before
    ex     = "exclamation/interrogation", -- "!", "?", etc. : prohibit line breaks before
    ["in"] = "inseparable",               -- leaders : allow only indirect line breaks between pairs (in is a keyword)
    ns     = "nonstarter",                -- double exclamation mark, interrobang, etc. : allow only indirect line breaks before
    op     = "open punctuation",          -- "(", "[", "{", etc. : prohibit line breaks after
    qu     = "quotation",                 -- quotation marks : act like they are both opening and closing

    -- numeric context

    is = "infix numeric separator",      -- . , : prevent breaks after any and before numeric
    nu = "numeric",                      -- digits : form numeric expressions for line breaking purposes
    po = "postfix numeric",              -- percent, cent : do not break following a numeric expression
    pr = "prefix numeric",               -- dollar, pound, yen, etc. : do not break in front of a numeric expression
    sy = "symbols allowing break after", -- / : prevent a break before, and allow a break after

    -- everything else

    ai = "ambiguous (alphabetic or ideographic)",        -- ambiguous east asian width : act like al when the resolved eaw is n, otherwise as id
    al = "alphabetic",                                   -- alphabets and regular symbols
    cj = "conditional japanese starter",                 -- small kana : treat as ns or id for strict or normal breaking
    eb = "emoji base",                                   -- emoji allowing modifiers : do not break from a following emoji modifier
    em = "emoji modifier",                               -- skin tone modifiers : do not break from a preceding emoji base
    h2 = "hangul lv syllable",                           -- hangul : form korean syllable blocks
    h3 = "hangul lvt syllable",                          -- hangul : form korean syllable blocks
    hl = "hebrew letter",                                -- hebrew : do not break around a following hyphen, otherwise act as alphabetic
    id = "ideographic",                                  -- ideographs : break before or after, except in some numeric context
    jl = "hangul l jamo",                                -- conjoining jamo : form korean syllable blocks
    jv = "hangul v jamo",                                -- conjoining jamo : form korean syllable blocks
    jt = "hangul t jamo",                                -- conjoining jamo : form korean syllable blocks
    ri = "regional indicator",                           -- regional indicator symbol letters : keep together, break before and after from others
    sa = "complex context dependent (south east asian)", -- thai, lao, khmer : opportunity contingent on language-specific context analysis
    xx = "unknown",                                      -- most unassigned, private-use : unknown line breaking behavior or unassigned positions

}
897
898-- east asian width:
899--
900-- N A H W F Na
901
-- Maps the lowercase bidirectional class tags onto their official names.

characters.bidi = allocate {

    -- strong types

    l   = "Left-to-Right",
    r   = "Right-to-Left",
    al  = "Right-to-Left Arabic",

    -- explicit formatting

    lre = "Left-to-Right Embedding",
    lro = "Left-to-Right Override",
    rle = "Right-to-Left Embedding",
    rlo = "Right-to-Left Override",
    pdf = "Pop Directional Format",

    -- weak types

    en  = "European Number",
    es  = "European Number Separator",
    et  = "European Number Terminator",
    an  = "Arabic Number",
    cs  = "Common Number Separator",
    nsm = "Non-Spacing Mark",
    bn  = "Boundary Neutral",

    -- neutral types

    b   = "Paragraph Separator",
    s   = "Segment Separator",
    ws  = "Whitespace",
    on  = "Other Neutrals",

}
923
924--[[ldx--
925<p>At this point we assume that the big data table is loaded. From this
926table we derive a few more.</p>
927--ldx]]--
928
if not characters.fallbacks then

    -- Each entry couples a combining accent with its spacing counterpart. The
    -- fallback map is filled in both directions from this one list, so the
    -- two directions cannot get out of sync (the grave used to map back onto
    -- 0x0333 instead of 0x0300).

    local accents = {
        { 0x0308, 0x00A8 }, -- dieresiscmb      dieresis
        { 0x0304, 0x00AF }, -- macroncmb        macron
        { 0x0301, 0x00B4 }, -- acutecomb        acute
        { 0x0327, 0x00B8 }, -- cedillacmb       cedilla
        { 0x0302, 0x02C6 }, -- circumflexcmb    circumflex
        { 0x030C, 0x02C7 }, -- caroncmb         caron
        { 0x0306, 0x02D8 }, -- brevecmb         breve
        { 0x0307, 0x02D9 }, -- dotaccentcmb     dotaccent
        { 0x030A, 0x02DA }, -- ringcmb          ring
        { 0x0328, 0x02DB }, -- ogonekcmb        ogonek
        { 0x0303, 0x02DC }, -- tildecomb        tilde
        { 0x030B, 0x02DD }, -- hungarumlautcmb  hungarumlaut
        { 0x0305, 0x203E }, -- overlinecmb      overline
        { 0x0300, 0x0060 }, -- gravecomb        grave
    }

    local fallbacks = allocate()

    for i=1,#accents do
        local pair      = accents[i]
        local combining = pair[1]
        local spacing   = pair[2]
        fallbacks[combining] = spacing
        fallbacks[spacing]   = combining
    end

    characters.fallbacks = fallbacks

    -- not done (would mess up mapping):
    --
    -- 0X0301/0X0384 0X0314/0X1FFE 0X0313/0X1FBD 0X0313/0X1FBF 0X0342/0X1FC0
    -- 0X3099/0X309B 0X309A/0X309C 0X0333/0X2017 0X0345/0X037A

end

if storage then -- in case we extend
    storage.register("characters/fallbacks", characters.fallbacks, "characters.fallbacks") -- accents and such
end
958
-- Lazy per character property tables. The first lookup of a key fetches the
-- given field from the character data and stores it in the table; when there
-- is no entry or no such field, false is stored (and returned) instead.

do

    local function lazyproperty(field)
        local cache = { }
        setmetatableindex(cache,function(t,k)
            local d = data[k]
            local v = d and d[field] or false
            t[k] = v
            return v
        end)
        return cache
    end

    characters.directions  = lazyproperty("direction") -- false, maybe 'l'
    characters.mirrors     = lazyproperty("mirror")
    characters.textclasses = lazyproperty("textclass")

end
1003
1004--[[ldx--
1005<p>Next comes a whole series of helper methods. These are (will be) part
1006of the official <l n='api'/>.</p>
1007--ldx]]--
1008
1009-- we could make them virtual: characters.contextnames[n]
1010
-- Accessors for single fields of a character entry. Each looks the entry up
-- once and returns an empty string when the entry or the field is missing.

function characters.contextname(n)
    local d = data[n]
    return d and d.contextname or ""
end

function characters.adobename(n)
    local d = data[n]
    return d and d.adobename or ""
end

function characters.description(n)
    local d = data[n]
    return d and d.description or ""
end

-- Returns the category tag of n, or with verbose set the matching entry of
-- categorytags. A missing entry is handled like a missing category (this
-- used to be an unguarded index, unlike the three accessors above).

function characters.category(n,verbose)
    local d = data[n]
    local c = d and d.category
    if not c then
        return ""
    elseif verbose then
        return categorytags[c]
    else
        return c
    end
end
1026
1027-- -- some day we will make a table .. not that many calls to utfchar
1028--
1029-- local utfchar = utf.char
1030-- local utfbyte = utf.byte
1031-- local utfbytes = { }
1032-- local utfchars = { }
1033--
1034-- table.setmetatableindex(utfbytes,function(t,k) local v = utfchar(k) t[k] = v return v end)
1035-- table.setmetatableindex(utfchars,function(t,k) local v = utfbyte(k) t[k] = v return v end)
1036
-- Turns a single value or a list of values into a string by feeding it (or
-- the unpacked list) to utfchar.

local function toutfstring(s)
    if type(s) ~= "table" then
        return utfchar(s)
    end
    return utfchar(unpack(s)) -- concat { utfchar( unpack(s) ) }
end

utf.tostring = toutfstring
1046
local categories = allocate() -- lazy table

characters.categories = categories

-- A lookup stores and returns the category of the entry for u; when there is
-- no entry or no category the key itself is used. A nil or false key gives
-- nothing.

setmetatableindex(categories, function(t,u)
    if not u then
        return
    end
    local d = data[u]
    local c = d and d.category or u
    t[u] = c
    return c
end)
1050
1051-- todo: overloads (these register directly in the tables as number and string) e.g. for greek
1052-- todo: for string do a numeric lookup in the table itself
1053
local lccodes = allocate() -- lazy table
local uccodes = allocate() -- lazy table
local shcodes = allocate() -- lazy table
local fscodes = allocate() -- lazy table

characters.lccodes = lccodes
characters.uccodes = uccodes
characters.shcodes = shcodes
characters.fscodes = fscodes

do

    -- The handler stores and returns the given field of the entry for u. When
    -- that is not available a string key is passed through utfbyte and any
    -- other key is used as is. A nil or false key gives nothing.

    local function codes(field)
        return function(t,u)
            if not u then
                return
            end
            local d = data[u]
            local c = d and d[field]
            if not c then
                c = (type(u) == "string" and utfbyte(u)) or u
            end
            t[u] = c
            return c
        end
    end

    setmetatableindex(lccodes, codes("lccode"))
    setmetatableindex(uccodes, codes("uccode"))
    setmetatableindex(shcodes, codes("shcode"))
    setmetatableindex(fscodes, codes("fscode"))

end
1063
local lcchars = allocate() -- lazy table
local ucchars = allocate() -- lazy table
local shchars = allocate() -- lazy table
local fschars = allocate() -- lazy table

characters.lcchars = lcchars
characters.ucchars = ucchars
characters.shchars = shchars
characters.fschars = fschars

do

    -- The handler stores and returns the given field of the entry for u as a
    -- string (via toutfstring). When that is not available a numeric key is
    -- passed through utfchar and any other key is used as is. A nil or false
    -- key gives nothing.

    local function chars(field)
        return function(t,u)
            if not u then
                return
            end
            local d = data[u]
            local c = d and d[field]
            local s = (c and toutfstring(c)) or (type(u) == "number" and utfchar(u)) or u
            t[u] = s
            return s
        end
    end

    setmetatableindex(lcchars, chars("lccode"))
    setmetatableindex(ucchars, chars("uccode"))
    setmetatableindex(shchars, chars("shcode"))
    setmetatableindex(fschars, chars("fscode"))

end
1073
local decomposed = allocate() -- lazy table
local specials   = allocate() -- lazy table

characters.decomposed = decomposed
characters.specials   = specials

do

    -- The handler stores and returns the given field of the entry for u, or
    -- false when there is no entry or no such field. A nil or false key gives
    -- nothing.

    local function fieldorfalse(field)
        return function(t,u)
            if not u then
                return
            end
            local d = data[u]
            local v = d and d[field] or false
            t[u] = v
            return v
        end
    end

    setmetatableindex(decomposed, fieldorfalse("decomposed")) -- could fall back to specials
    setmetatableindex(specials,   fieldorfalse("specials"))

end
1094
local specialchars = allocate()  characters.specialchars = specialchars -- lazy table
local descriptions = allocate()  characters.descriptions = descriptions -- lazy table
local synonyms     = allocate()  characters.synonyms     = synonyms     -- lazy table

-- For a character with specials this gives a string with the components
-- (entries 2 and up, the first entry is the kind) that are letters. Without
-- specials the character itself is returned as a string. The result is always
-- stored under the key that was asked for; a numeric key without specials
-- used to be stored under its string form only, so each numeric lookup ran
-- this handler again. A nil or false key gives nothing.

setmetatableindex(specialchars, function(t,u)
    if not u then
        return
    end
    local d = data[u]
    local s = d and d.specials
    local r
    if s then
        local tt  = { }
        local ttn = 0
        for i=2,#s do
            local si = s[i]
            local ci = data[si]
            if ci and is_letter[ci.category] then -- guard: a component might have no entry
                ttn = ttn + 1
                tt[ttn] = utfchar(si)
            end
        end
        r = concat(tt)
    elseif type(u) == "number" then
        r = utfchar(u)
        t[r] = r -- as before: the string form maps onto itself
    else
        r = u
    end
    t[u] = r
    return r
end)
1126
-- Reverse lookup from description to the key in the character data. A key
-- that is not present triggers a scan over all data (0.05 - 0.10 sec) in
-- which every description is registered lowercased and with spaces removed.
--
-- NOTE(review): a key that is still unknown after the scan is stored as
-- itself, so the first lookup returns nil while later lookups of the same key
-- return the key.

setmetatableindex(descriptions, function(t,k)
    for unicode, entry in next, data do
        local name = entry.description
        if name then
            if find(name," ",1,true) then
                name = gsub(name," ","")
            end
            t[lower(name)] = unicode
        end
    end
    local found = rawget(t,k)
    if found then
        return found
    end
    t[k] = k -- prevents a rescan for this key
    return found
end)
1145
-- Reverse lookup from synonym to the key in the character data, along the
-- lines of the descriptions table: a key that is not present triggers a scan
-- over all data in which every synonym is registered with spaces removed.
--
-- A key that is still unknown after the scan is stored as itself in order to
-- prevent a rescan. This used to be t[s] = s with s being nil, which raises
-- an error instead of caching the miss.

do

    local function register(t,s,u)
        if find(s," ",1,true) then
            s = gsub(s," ","")
        end
     -- s = lower(s) -- is already lowercase
        t[s] = u
    end

    setmetatableindex(synonyms, function(t,k)
        for u, c in next, data do
            local s  = c.synonyms
            local ts = type(s)
            if ts == "string" then
                register(t,s,u)
            elseif ts == "table" then
                -- NOTE(review): the data table possibly stores synonyms as a
                -- list of strings (confirm); a list used to fail in find
                for i=1,#s do
                    local si = s[i]
                    if type(si) == "string" then
                        register(t,si,u)
                    end
                end
            end
        end
        local s = rawget(t,k)
        if not s then
            t[k] = k
        end
        return s
    end)

end
1163
-- Anything that tonumber accepts is returned as number. Another string is
-- looked up in the descriptions table, first as given and then with spaces
-- removed. Other values give nothing.

function characters.unicodechar(asked)
    local n = tonumber(asked)
    if n then
        return n
    end
    if type(asked) ~= "string" then
        return
    end
    return descriptions[asked] or descriptions[gsub(asked," ","")]
end
1172
1173-- function characters.lower(str)
1174--     local new, n = { }, 0
1175--     for u in utfvalues(str) do
1176--         n = n + 1
1177--         new[n] = lcchars[u]
1178--     end
1179--     return concat(new)
1180-- end
1181--
1182-- function characters.upper(str)
1183--     local new, n = { }, 0
1184--     for u in utfvalues(str) do
1185--         n = n + 1
1186--         new[n] = ucchars[u]
1187--     end
1188--     return concat(new)
1189-- end
1190--
1191-- function characters.shaped(str)
1192--     local new, n = { }, 0
1193--     for u in utfvalues(str) do
1194--         n = n + 1
1195--         new[n] = shchars[u]
1196--     end
1197--     return concat(new)
1198-- end
1199
1200----- tolower = Cs((utf8byte/lcchars)^0)
1201----- toupper = Cs((utf8byte/ucchars)^0)
1202----- toshape = Cs((utf8byte/shchars)^0)
1203
local tolower, toupper, toshape

do

    -- a substitution capture in which each utf character is replaced via a
    -- lookup in the given (lazy) table

    local function remapped(chars)
        return Cs((utf8character/chars)^0) -- no need to check spacing
    end

    tolower = remapped(lcchars)
    toupper = remapped(ucchars)
    toshape = remapped(shchars)

end

-- old ones ... will be overloaded

lpegpatterns.tolower = tolower
lpegpatterns.toupper = toupper
lpegpatterns.toshape = toshape
1211
1212-- function characters.lower (str) return lpegmatch(tolower,str) end
1213-- function characters.upper (str) return lpegmatch(toupper,str) end
1214-- function characters.shaped(str) return lpegmatch(toshape,str) end
1215
1216--     local superscripts = allocate()   characters.superscripts = superscripts
1217--     local subscripts   = allocate()   characters.subscripts   = subscripts
1218
1219--     if storage then
1220--         storage.register("characters/superscripts", superscripts, "characters.superscripts")
1221--         storage.register("characters/subscripts",   subscripts,   "characters.subscripts")
1222--     end
1223
1224-- end
1225
if not characters.splits then

    -- Collects, per kind, the components of characters whose specials have
    -- more than two entries (so a kind plus at least two components):
    --
    -- [0x013F] = { 0x004C, 0x00B7 }
    -- [0x0140] = { 0x006C, 0x00B7 }

    local char   = allocate()
    local compat = allocate()

    local splits = {
        char   = char,
        compat = compat,
    }

    characters.splits = splits

    for unicode, entry in next, characters.data do
        local list = entry.specials
        if list and #list > 2 then
            local kind   = list[1]
            local target = false
            if kind == "compat" then
                target = compat
            elseif kind == "char" or kind == "with" then -- "with" is handled like "char"
                target = char
            end
            if target then
                target[unicode] = { unpack(list,2) }
            end
        end
    end

    if storage then
        storage.register("characters/splits", splits, "characters.splits")
    end

end
1258
if not characters.lhash then

    -- String to string case and shape mappings derived from the lccode,
    -- uccode and shcode fields. An uccode is only looked at when there is no
    -- lccode. Codes that are neither a number nor a two element list are
    -- skipped.

    local lhash = allocate()   characters.lhash = lhash -- nil if no conversion
    local uhash = allocate()   characters.uhash = uhash -- nil if no conversion
    local shash = allocate()   characters.shash = shash -- nil if no conversion

    local function register(hash,unicode,code)
        local str
        if type(code) == "number" then
            str = utfchar(code)
        elseif #code == 2 then
            str = utfchar(code[1]) .. utfchar(code[2])
        end
        if str then
            hash[utfchar(unicode)] = str
        end
    end

    for unicode, entry in next, characters.data do
        local lccode = entry.lccode
        if lccode then
            -- we have an uppercase
            register(lhash,unicode,lccode)
        else
            local uccode = entry.uccode
            if uccode then
                -- we have a lowercase
                register(uhash,unicode,uccode)
            end
        end
        local shcode = entry.shcode
        if shcode then
            register(shash,unicode,shcode)
        end
    end

    if storage then
        storage.register("characters/lhash", lhash, "characters.lhash")
        storage.register("characters/uhash", uhash, "characters.uhash")
        storage.register("characters/shash", shash, "characters.shash")
    end

end
1310
local lhash = characters.lhash
local uhash = characters.uhash
local shash = characters.shash

mark(lhash)
mark(uhash)
mark(shash)

-- one character: a pattern made from the hash, with the match replaced by a
-- lookup in that same hash

local utf8lowercharacter = utfchartabletopattern(lhash) / lhash
local utf8uppercharacter = utfchartabletopattern(uhash) / uhash
local utf8shapecharacter = utfchartabletopattern(shash) / shash

lpegpatterns.utf8lowercharacter = utf8lowercharacter
lpegpatterns.utf8uppercharacter = utf8uppercharacter
lpegpatterns.utf8shapecharacter = utf8shapecharacter

-- string: characters without a mapping are kept as they are

local utf8lower = Cs((utf8lowercharacter + utf8character)^0)
local utf8upper = Cs((utf8uppercharacter + utf8character)^0)
local utf8shape = Cs((utf8shapecharacter + utf8character)^0)

lpegpatterns.utf8lower = utf8lower
lpegpatterns.utf8upper = utf8upper
lpegpatterns.utf8shape = utf8shape

-- The converters return an empty string when there is no (or a false) str or
-- when the match fails.

function characters.lower(str)
    if str then
        return lpegmatch(utf8lower,str) or ""
    end
    return ""
end

function characters.upper(str)
    if str then
        return lpegmatch(utf8upper,str) or ""
    end
    return ""
end

function characters.shaped(str)
    if str then
        return lpegmatch(utf8shape,str) or ""
    end
    return ""
end

lpeg.setutfcasers(characters.lower,characters.upper)
1336
1337-- local str = [[
1338--     ÀÁÂÃÄÅàáâãäå àáâãäåàáâãäå ÀÁÂÃÄÅÀÁÂÃÄÅ AAAAAAaaaaaa
1339--     ÆÇæç         æçæç         ÆÇÆÇ         AECaec
1340--     ÈÉÊËèéêë     èéêëèéêë     ÈÉÊËÈÉÊË     EEEEeeee
1341--     ÌÍÎÏÞìíîïþ   ìíîïþìíîïþ   ÌÍÎÏÞÌÍÎÏÞ   IIIIÞiiiiþ
1342--     Ðð           ðð           ÐÐ           Ðð
1343--     Ññ           ññ           ÑÑ           Nn
1344--     ÒÓÔÕÖòóôõö   òóôõöòóôõö   ÒÓÔÕÖÒÓÔÕÖ   OOOOOooooo
1345--     Øø           øø           ØØ           Oo
1346--     ÙÚÛÜùúûü     ùúûüùúûü     ÙÚÛÜÙÚÛÜ     UUUUuuuu
1347--     Ýýÿ          ýýÿ          ÝÝŸ          Yyy
1348--     ß            ß            SS           ss
1349--     Ţţ           ţţ           ŢŢ           Tt
1350-- ]]
1351--
1352-- local lower  = characters.lower   print(lower(str))
1353-- local upper  = characters.upper   print(upper(str))
1354-- local shaped = characters.shaped  print(shaped(str))
1355--
1356-- local c, n = os.clock(), 10000
1357-- for i=1,n do lower(str) upper(str) shaped(str) end -- 2.08 => 0.77
1358-- print(os.clock()-c,n*#str*3)
1359
1360-- maybe: (twice as fast when much ascii)
1361--
1362-- local tolower  = lpeg.patterns.tolower
1363-- local lower    = string.lower
1364--
1365-- local allascii = R("\000\127")^1 * P(-1)
1366--
1367-- function characters.checkedlower(str)
1368--     return lpegmatch(allascii,str) and lower(str) or lpegmatch(tolower,str) or str
1369-- end
1370
-- Returns str with only its letters kept. With spacing set, a run of spacing
-- characters between two kept letters is replaced by a single space; spacing
-- before the first letter and after the last one is dropped.
--
-- The test for emitting a space used to be n > 1 with the pending flag only
-- being reset when a space was actually emitted. That dropped the space after
-- a single leading letter ("a b" became "ab") and let leading spacing pop up
-- later on (" abc" became "ab c").

function characters.lettered(str,spacing)
    local new, n  = { }, 0
    local pending = false -- spacing seen since the last kept letter
    for u in utfvalues(str) do
        local d = data[u]
        local c = d and d.category
        if c and is_letter[c] then
            if pending then
                if n > 0 then -- no space before the first letter
                    n = n + 1
                    new[n] = " "
                end
                pending = false
            end
            n = n + 1
            new[n] = utfchar(u)
        elseif spacing and c and is_spacing[c] then
            pending = true
        end
    end
    return concat(new)
end
1399
1400--[[ldx--
1401<p>Requesting lower and uppercase codes:</p>
1402--ldx]]--
1403
-- obsolete: plain lookups in the lazy code tables

function characters.uccode(n)
    return uccodes[n]
end

function characters.lccode(n)
    return lccodes[n]
end

-- Returns two values: for a list valued shcode its first and last entry,
-- otherwise the shcode (or n itself when the lookup gives nothing) and nil.

function characters.shape(n)
    local shcode = shcodes[n]
    if type(shcode) == "table" then
        return shcode[1], shcode[#shcode]
    end
    return shcode or n, nil
end
1417
1418-- -- some day we might go this route, but it does not really save that much
1419-- -- so not now (we can generate a lot using mtx-unicode that operates on the
1420-- -- database)
1421--
1422-- -- category cjkwd direction linebreak
1423--
1424-- -- adobename comment contextcommand contextname description fallback lccode
1425-- -- mathclass mathfiller mathname mathspec mathstretch mathsymbol mirror
1426-- -- range shcode specials uccode uccodes unicodeslot
1427--
1428-- local data = {
1429--     ['one']={
1430--         common = {
1431--             category="cc",
1432--             direction="bn",
1433--             linebreak="cm",
1434--         },
1435--         vector = {
1436--             [0x0000] = {
1437--                 description="NULL",
1438--                 group='one',
1439--                 unicodeslot=0x0000,
1440--             },
1441--             {
1442--                 description="START OF HEADING",
1443--                 group='one',
1444--                 unicodeslot=0x0001,
1445--             },
1446--         }
1447--     }
1448-- }
1449--
1450-- local chardata, groupdata = { }, { }
1451--
1452-- for group, gdata in next, data do
1453--     local common, vector = { __index = gdata.common }, gdata.vector
1454--     for character, cdata in next, vector do
1455--         chardata[character] = cdata
1456--         setmetatable(cdata,common)
1457--     end
1458--     groupdata[group] = gdata
1459-- end
1460
1461-- characters.data, characters.groups = chardata, groupdata
1462
1463--  [0xF0000]={
1464--   category="co",
1465--   cjkwd="a",
1466--   description="<Plane 0x000F Private Use, First>",
1467--   direction="l",
1468--   unicodeslot=0xF0000,
1469--  },
1470--  [0xFFFFD]={
1471--   category="co",
1472--   cjkwd="a",
1473--   description="<Plane 0x000F Private Use, Last>",
1474--   direction="l",
1475--   unicodeslot=0xFFFFD,
1476--  },
1477--  [0x100000]={
1478--   category="co",
1479--   cjkwd="a",
1480--   description="<Plane 0x0010 Private Use, First>",
1481--   direction="l",
1482--   unicodeslot=0x100000,
1483--  },
1484--  [0x10FFFD]={
1485--   category="co",
1486--   cjkwd="a",
1487--   description="<Plane 0x0010 Private Use, Last>",
1488--   direction="l",
1489--   unicodeslot=0x10FFFD,
1490--  },
1491
if not characters.superscripts then

    -- Derived from the specials: a super or sub with exactly one component
    -- maps onto that component, a fraction with at least one component maps
    -- onto the list of components. Other cases are skipped and only reported
    -- when tracing.

    local superscripts = allocate()   characters.superscripts = superscripts
    local subscripts   = allocate()   characters.subscripts   = subscripts
    local fractions    = allocate()   characters.fractions    = fractions

    -- skipping U+02120 (service mark) U+02122 (trademark)

    local singles = {
        super = { superscripts, "superscript" },
        sub   = { subscripts,   "subscript"   },
    }

    local function ignored(what,k,v)
        if trace_defining then
            report_defining("ignoring %s %a, char %c, description %a",what,ustring(k),k,v.description)
        end
    end

    for k, v in next, data do
        local specials = v.specials
        local what     = specials and specials[1]
        local single   = what and singles[what]
        if single then
            if #specials == 2 then
                single[1][k] = specials[2]
            else
                ignored(single[2],k,v)
            end
        elseif what == "fraction" then
            if #specials > 1 then
                fractions[k] = { unpack(specials,2) }
            else
                ignored("fraction",k,v)
            end
        end
    end

 -- print(table.serialize(superscripts, "superscripts", { hexify = true }))
 -- print(table.serialize(subscripts,   "subscripts",   { hexify = true }))
 -- print(table.serialize(fractions,    "fractions",    { hexify = true }))

    if storage then
        storage.register("characters/superscripts", superscripts, "characters.superscripts")
        storage.register("characters/subscripts",   subscripts,   "characters.subscripts")
        storage.register("characters/fractions",    fractions,    "characters.fractions")
    end

end
1537
-- Reports every entry of the table that utotable makes from str, one line
-- per entry, prefixed by its index.

function characters.showstring(str)
    local entries = utotable(str)
    local count   = #entries
    for index=1,count do
        report_defining("split % 3i : %C",index,entries[index])
    end
end
1544
1545do
1546
1547    -- There is no need to preload this table.
1548
1549    local any       = P(1)
1550    local special   = S([['".,:;-+()]])
1551                    + P('') + P('')
1552    local apostrofe = P("") + P("'")
1553
1554    local pattern = Cs ( (
1555        (P("medium light") / "medium-light" + P("medium dark")  / "medium-dark") * P(" skin tone")
1556        + (apostrofe * P("s"))/""
1557        + special/""
1558        + any
1559    )^1)
1560
1561    local function load()
1562        local name = resolvers.findfile("char-emj.lua")
1563        local data = name and name ~= "" and dofile(name) or { }
1564        local hash = { }
1565        for d, c in next, data do
1566            local k = lpegmatch(pattern,d) or d
1567            local u = { }
1568            for i=1,#c do
1569                u[i] = utfchar(c[i])
1570            end
1571            u = concat(u)
1572            hash[k] = u
1573        end
1574        return data, hash
1575    end
1576
1577    local data, hash = nil, nil
1578
1579    function characters.emojized(name)
1580        local t = lpegmatch(pattern,name)
1581        if t then
1582            return t
1583        else
1584            return { name }
1585        end
1586    end
1587
1588    local start     = P(" ")
1589    local finish    = P(-1) + P(" ")
1590    local skintone  = P("medium ")^0 * (P("light ") + P("dark "))^0 * P("skin tone")
1591    local gender    = P("woman") + P("man")
1592    local expanded  = (
1593                            P("m-l-")/"medium-light"
1594                          + P("m-d-")/"medium-dark"
1595                          + P("l-")  /"light"
1596                          + P("m-")  /"medium"
1597                          + P("d-")  /"dark"
1598                      )
1599                    * (P("s-t")/" skin tone")
1600    local compacted = (
1601                        (P("medium-")/"m-" * (P("light")/"l" + P("dark")/"d"))
1602                      + (P("medium")/"m"   +  P("light")/"l" + P("dark")/"d")
1603                      )
1604                    * (P(" skin tone")/"-s-t")
1605
1606    local pattern_0 = Cs((expanded + any)^1)
1607    local pattern_1 = Cs(((start * skintone + skintone * finish)/"" + any)^1)
1608    local pattern_2 = Cs(((start * gender   + gender   * finish)/"" + any)^1)
1609    local pattern_4 = Cs((compacted + any)^1)
1610
1611 -- print(lpegmatch(pattern_0,"kiss woman l-s-t man d-s-t"))
1612 -- print(lpegmatch(pattern_0,"something m-l-s-t"))
1613 -- print(lpegmatch(pattern_0,"something m-s-t"))
1614 -- print(lpegmatch(pattern_4,"something medium-light skin tone"))
1615 -- print(lpegmatch(pattern_4,"something medium skin tone"))
1616
1617    local skin =
1618        P("light skin tone")        / utfchar(0x1F3FB)
1619      + P("medium-light skin tone") / utfchar(0x1F3FC)
1620      + P("medium skin tone")       / utfchar(0x1F3FD)
1621      + P("medium-dark skin tone")  / utfchar(0x1F3FE)
1622      + P("dark skin tone")         / utfchar(0x1F3FF)
1623
1624    local parent =
1625        P("man")   / utfchar(0x1F468)
1626      + P("woman") / utfchar(0x1F469)
1627
1628    local child =
1629        P("baby")  / utfchar(0x1F476)
1630      + P("boy")   / utfchar(0x1F466)
1631      + P("girl")  / utfchar(0x1F467)
1632
1633    local zwj   = utfchar(0x200D)
1634    local heart = utfchar(0x2764) .. utfchar(0xFE0F) .. zwj
1635    local kiss  = utfchar(0x2764) .. utfchar(0xFE0F) .. utfchar(0x200D) .. utfchar(0x1F48B) .. zwj
1636
1637    ----- member = parent + child
1638
1639    local space = P(" ")
1640    local final = P(-1)
1641
1642    local p_done   = (space^1/zwj) + P(-1)
1643    local p_rest   = space/"" * (skin * p_done) + p_done
1644    local p_parent = parent * p_rest
1645    local p_child  = child  * p_rest
1646
1647    local p_family = Cs ( (P("family")            * space^1)/"" * p_parent^-2 * p_child^-2 )
1648    local p_couple = Cs ( (P("couple with heart") * space^1)/"" * p_parent * Cc(heart) * p_parent )
1649    local p_kiss   = Cs ( (P("kiss")              * space^1)/"" * p_parent * Cc(kiss)  * p_parent )
1650
1651    local p_special = p_family + p_couple + p_kiss
1652
1653 -- print(lpeg.match(p_special,"family man woman girl"))
1654 -- print(lpeg.match(p_special,"family man dark skin tone woman girl girl"))
1655
1656 -- local p_special = P { "all",
1657 --     all    = Cs (V("family") + V("couple") + V("kiss")),
1658 --     family = C("family")            * space^1 * V("parent")^-2 * V("child")^-2,
1659 --     couple = P("couple with heart") * space^1 * V("parent") * Cc(heart) * V("parent"),
1660 --     kiss   = P("kiss")              * space^1 * V("parent") * Cc(kiss) * V("parent"),
1661 --     parent = parent * V("rest"),
1662 --     child  = child  * V("rest"),
1663 --     rest   = (space * skin)^0/"" * ((space^1/zwj) + P(-1)),
1664 -- }
1665
1666    local emoji      = { }
1667    characters.emoji = emoji
1668
1669local cache = setmetatable({ }, { __mode = "k" } )
1670
1671    function emoji.resolve(name)
1672        if not hash then
1673            data, hash = load()
1674        end
1675        local h = hash[name]
1676        if h then
1677            return h
1678        end
1679        local h = cache[name]
1680        if h then
1681            return h
1682        elseif h == false then
1683            return
1684        end
1685        -- expand shortcuts
1686        local name = lpegmatch(pattern_0,name) or name
1687        -- expand some 25K variants
1688        local h = lpegmatch(p_special,name)
1689        if h then
1690            cache[name] = h
1691            return h
1692        end
1693        -- simplify
1694        local s = lpegmatch(pattern_1,name)
1695        local h = hash[s]
1696        if h then
1697            cache[name] = h
1698            return h
1699        end
1700        -- simplify
1701        local s = lpegmatch(pattern_2,name)
1702        local h = hash[s]
1703        if h then
1704            cache[name] = h
1705            return h
1706        end
1707        cache[name] = false
1708    end
1709
1710    function emoji.known()
1711        if not hash then
1712            data, hash = load()
1713        end
1714        return hash, data
1715    end
1716
1717    function emoji.compact(name)
1718        return lpegmatch(pattern_4,name) or name
1719    end
1720
1721end
1722
1723-- code moved to char-tex.lua
1724
1725return characters
1726