char-ini.lua /size: 98 Kb    last modification: 2023-12-21 09:44
1if not modules then modules = { } end modules ['char-ini'] = {
2    version   = 1.001,
3    comment   = "companion to char-ini.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9-- todo: make two files, one for format generation, one for format use
10-- todo: move some to char-utf
11
12-- we can remove the tag range starting at 0xE0000 (special applications)
13
14local utfchar, utfbyte, utfvalues, ustring, utotable = utf.char, utf.byte, utf.values, utf.ustring, utf.totable
15local concat, unpack, tohash, insert = table.concat, table.unpack, table.tohash, table.insert
16local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset
17local format, lower, gsub, find = string.format, string.lower, string.gsub, string.find
18local P, R, S, C, Cs, Ct, Cc, V = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cs, lpeg.Ct, lpeg.Cc, lpeg.V
19local formatters = string.formatters
20
21if not characters then require("char-def") end
22
23local lpegpatterns          = lpeg.patterns
24local lpegmatch             = lpeg.match
25local utf8byte              = lpegpatterns.utf8byte
26local utf8character         = lpegpatterns.utf8character
27
28local utfchartabletopattern = lpeg.utfchartabletopattern
29
30local allocate              = utilities.storage.allocate
31local mark                  = utilities.storage.mark
32
33local setmetatableindex     = table.setmetatableindex
34
35local trace_defining        = false  trackers.register("characters.defining", function(v) characters_defining = v end)
36
37local report_defining       = logs.reporter("characters")
38
39-- This module implements some methods and creates additional datastructured from
40-- the big character table that we use for all kind of purposes: 'char-def.lua'.
41--
42-- We assume that at this point 'characters.data' is already populated!
43--
44-- todo: in 'char-def.lua' assume defaults:
45--
46--   directions = l
47--   cjkwd      = a
48--   linebreak  = al
49
50characters       = characters or { }
51local characters = characters
52local data       = characters.data
53
54if data then
55    mark(data) -- why does this fail
56else
57    report_defining("fatal error: 'char-def.lua' is not loaded")
58    os.exit()
59end
60
61-- Extending the table.
62
63if context and CONTEXTLMTXMODE == 0 then
64
65    if not characters.private then
66
67        require("char-prv")
68
69        if storage then
70            storage.register("characters/private", characters.private, "characters.private")
71        end
72
73    end
74
75    for unicode, d in next, characters.private do
76        data[unicode] = d
77    end
78
79end
80
81-- This converts a string (if given) into a number.
82
83local pattern = (P("0x") + P("U+")) * ((R("09","AF")^1 * P(-1)) / function(s) return tonumber(s,16) end)
84
85lpegpatterns.chartonumber = pattern
86
87local function chartonumber(k)
88    if type(k) == "string" then
89        local u = lpegmatch(pattern,k)
90        if u then
91            return utfbyte(u)
92        else
93            return utfbyte(k) or 0
94        end
95    else
96        return k or 0
97    end
98end
99
100local function charfromnumber(k)
101    if type(k) == "number" then
102        return utfchar(k) or ""
103    else
104        local u = lpegmatch(pattern,k)
105        if u then
106            return utfchar(u)
107        else
108            return k
109        end
110    end
111end
112
113--~ print(chartonumber(97), chartonumber("a"), chartonumber("0x61"), chartonumber("U+61"))
114
115characters.tonumber   = chartonumber
116characters.fromnumber = charfromnumber
117
118local private = {
119    description = "PRIVATE SLOT",
120}
121
122local ranges      = allocate()
123characters.ranges = ranges
124
125setmetatableindex(data, function(t,k)
126    local tk = type(k)
127    if tk == "string" then
128        k = lpegmatch(pattern,k) or utfbyte(k)
129        if k then
130            local v = rawget(t,k)
131            if v then
132                return v
133            else
134                tk = "number" -- fall through to range
135            end
136        else
137            return private
138        end
139    end
140    if tk == "number" and k < 0xF0000 then
141        for r=1,#ranges do
142            local rr = ranges[r]
143            if k >= rr.first and k <= rr.last then
144                local extender = rr.extender
145                if extender then
146                    local v = extender(k)
147                    t[k] = v
148                    return v
149                end
150            end
151        end
152    end
153    return private -- handy for when we loop over characters in fonts and check for a property
154end)
155
156local variant_selector_metatable = {
157    category  = "mn",
158    cjkwd     = "a",
159    direction = "nsm",
160    linebreak = "cm",
161}
162
163-- This saves a bit of memory and also serves as example.
164
165local f_variant = string.formatters["VARIATION SELECTOR-0x%04X"]
166
167insert(characters.ranges,{
168    first    = 0xFE00,
169    last     = 0xFE0F,
170    name     = "variant selector",
171    extender = function(k)
172        local t = {
173            description = f_variant(k - 0xFE00 + 0x0001),
174            unicodeslot = k,
175        }
176        setmetatable(t,variant_selector_metatable)
177        return t
178    end,
179})
180
181insert(characters.ranges,{
182    first    = 0xE0100,
183    last     = 0xE01EF,
184    name     = "variant selector extension",
185    extender = function(k)
186        local t = {
187            description = f_variant(k - 0xE0100 + 0x0011),
188            unicodeslot = k,
189        }
190        setmetatable(t,variant_selector_metatable)
191        return t
192    end,
193})
194
195local blocks = allocate {
196    ["adlam"]                                       = { first = 0x1E900, last = 0x1E95F,              description = "Adlam" },
197    ["aegeannumbers"]                               = { first = 0x10100, last = 0x1013F,              description = "Aegean Numbers" },
198    ["ahom"]                                        = { first = 0x11700, last = 0x1174F,              description = "Ahom" },
199    ["alchemicalsymbols"]                           = { first = 0x1F700, last = 0x1F77F,              description = "Alchemical Symbols" },
200    ["alphabeticpresentationforms"]                 = { first = 0x0FB00, last = 0x0FB4F, otf="latn",  description = "Alphabetic Presentation Forms" },
201    ["anatolianhieroglyphs"]                        = { first = 0x14400, last = 0x1467F,              description = "Anatolian Hieroglyphs" },
202    ["ancientgreekmusicalnotation"]                 = { first = 0x1D200, last = 0x1D24F, otf="grek",  description = "Ancient Greek Musical Notation" },
203    ["ancientgreeknumbers"]                         = { first = 0x10140, last = 0x1018F, otf="grek",  description = "Ancient Greek Numbers" },
204    ["ancientsymbols"]                              = { first = 0x10190, last = 0x101CF, otf="grek",  description = "Ancient Symbols" },
205    ["arabic"]                                      = { first = 0x00600, last = 0x006FF, otf="arab",  description = "Arabic" },
206    ["arabicextendeda"]                             = { first = 0x008A0, last = 0x008FF,              description = "Arabic Extended-A" },
207    ["arabicextendedb"]                             = { first = 0x00870, last = 0x0089F,              description = "Arabic Extended-B" },
208    ["arabicextendedc"]                             = { first = 0x10EC0, last = 0x10EFF,              description = "Arabic Extended-C" },
209    ["arabicmathematicalalphabeticsymbols"]         = { first = 0x1EE00, last = 0x1EEFF,              description = "Arabic Mathematical Alphabetic Symbols" },
210    ["arabicpresentationformsa"]                    = { first = 0x0FB50, last = 0x0FDFF, otf="arab",  description = "Arabic Presentation Forms-A" },
211    ["arabicpresentationformsb"]                    = { first = 0x0FE70, last = 0x0FEFF, otf="arab",  description = "Arabic Presentation Forms-B" },
212    ["arabicsupplement"]                            = { first = 0x00750, last = 0x0077F, otf="arab",  description = "Arabic Supplement" },
213    ["armenian"]                                    = { first = 0x00530, last = 0x0058F, otf="armn",  description = "Armenian" },
214    ["arrows"]                                      = { first = 0x02190, last = 0x021FF,              description = "Arrows" },
215    ["avestan"]                                     = { first = 0x10B00, last = 0x10B3F,              description = "Avestan" },
216    ["balinese"]                                    = { first = 0x01B00, last = 0x01B7F, otf="bali",  description = "Balinese" },
217    ["bamum"]                                       = { first = 0x0A6A0, last = 0x0A6FF,              description = "Bamum" },
218    ["bamumsupplement"]                             = { first = 0x16800, last = 0x16A3F,              description = "Bamum Supplement" },
219    ["basiclatin"]                                  = { first = 0x00000, last = 0x0007F, otf="latn",  description = "Basic Latin" },
220    ["bassavah"]                                    = { first = 0x16AD0, last = 0x16AFF,              description = "Bassa Vah" },
221    ["batak"]                                       = { first = 0x01BC0, last = 0x01BFF,              description = "Batak" },
222    ["bengali"]                                     = { first = 0x00980, last = 0x009FF, otf="beng",  description = "Bengali" },
223    ["bhaiksuki"]                                   = { first = 0x11C00, last = 0x11C6F,              description = "Bhaiksuki" },
224    ["blockelements"]                               = { first = 0x02580, last = 0x0259F, otf="bopo",  description = "Block Elements" },
225    ["bopomofo"]                                    = { first = 0x03100, last = 0x0312F, otf="bopo",  description = "Bopomofo" },
226    ["bopomofoextended"]                            = { first = 0x031A0, last = 0x031BF, otf="bopo",  description = "Bopomofo Extended" },
227    ["boxdrawing"]                                  = { first = 0x02500, last = 0x0257F,              description = "Box Drawing" },
228    ["brahmi"]                                      = { first = 0x11000, last = 0x1107F,              description = "Brahmi" },
229    ["braillepatterns"]                             = { first = 0x02800, last = 0x028FF, otf="brai",  description = "Braille Patterns" },
230    ["buginese"]                                    = { first = 0x01A00, last = 0x01A1F, otf="bugi",  description = "Buginese" },
231    ["buhid"]                                       = { first = 0x01740, last = 0x0175F, otf="buhd",  description = "Buhid" },
232    ["byzantinemusicalsymbols"]                     = { first = 0x1D000, last = 0x1D0FF, otf="byzm",  description = "Byzantine Musical Symbols" },
233    ["carian"]                                      = { first = 0x102A0, last = 0x102DF,              description = "Carian" },
234    ["caucasianalbanian"]                           = { first = 0x10530, last = 0x1056F,              description = "Caucasian Albanian" },
235    ["chakma"]                                      = { first = 0x11100, last = 0x1114F,              description = "Chakma" },
236    ["cham"]                                        = { first = 0x0AA00, last = 0x0AA5F,              description = "Cham" },
237    ["cherokee"]                                    = { first = 0x013A0, last = 0x013FF, otf="cher",  description = "Cherokee" },
238    ["cherokeesupplement"]                          = { first = 0x0AB70, last = 0x0ABBF,              description = "Cherokee Supplement" },
239    ["chesssymbols"]                                = { first = 0x1FA00, last = 0x1FA6F,              description = "Chess Symbols" },
240    ["chorasmian"]                                  = { first = 0x10FB0, last = 0x10FDF,              description = "Chorasmian" },
241    ["cjkcompatibility"]                            = { first = 0x03300, last = 0x033FF, otf="hang",  description = "CJK Compatibility" },
242    ["cjkcompatibilityforms"]                       = { first = 0x0FE30, last = 0x0FE4F, otf="hang",  description = "CJK Compatibility Forms" },
243    ["cjkcompatibilityideographs"]                  = { first = 0x0F900, last = 0x0FAFF, otf="hang",  description = "CJK Compatibility Ideographs" },
244    ["cjkcompatibilityideographssupplement"]        = { first = 0x2F800, last = 0x2FA1F, otf="hang",  description = "CJK Compatibility Ideographs Supplement" },
245    ["cjkradicalssupplement"]                       = { first = 0x02E80, last = 0x02EFF, otf="hang",  description = "CJK Radicals Supplement" },
246    ["cjkstrokes"]                                  = { first = 0x031C0, last = 0x031EF, otf="hang",  description = "CJK Strokes" },
247    ["cjksymbolsandpunctuation"]                    = { first = 0x03000, last = 0x0303F, otf="hang",  description = "CJK Symbols and Punctuation" },
248    ["cjkunifiedideographs"]                        = { first = 0x04E00, last = 0x09FFF, otf="hang",  description = "CJK Unified Ideographs", catcode = "letter" },
249    ["cjkunifiedideographsextensiona"]              = { first = 0x03400, last = 0x04DBF, otf="hang",  description = "CJK Unified Ideographs Extension A" },
250    ["cjkunifiedideographsextensionb"]              = { first = 0x20000, last = 0x2A6DF, otf="hang",  description = "CJK Unified Ideographs Extension B" },
251    ["cjkunifiedideographsextensionc"]              = { first = 0x2A700, last = 0x2B73F,              description = "CJK Unified Ideographs Extension C" },
252    ["cjkunifiedideographsextensiond"]              = { first = 0x2B740, last = 0x2B81F,              description = "CJK Unified Ideographs Extension D" },
253    ["cjkunifiedideographsextensione"]              = { first = 0x2B820, last = 0x2CEAF,              description = "CJK Unified Ideographs Extension E" },
254    ["cjkunifiedideographsextensionf"]              = { first = 0x2CEB0, last = 0x2EBEF,              description = "CJK Unified Ideographs Extension F" },
255    ["cjkunifiedideographsextensiong"]              = { first = 0x30000, last = 0x3134F,              description = "CJK Unified Ideographs Extension G" },
256    ["cjkunifiedideographsextensionh"]              = { first = 0x31350, last = 0x323AF,              description = "CJK Unified Ideographs Extension H" },
257    ["cjkunifiedideographsextensioni"]              = { first = 0x2EBF0, last = 0x2EE5F,              description = "CJK Unified Ideographs Extension I" },
258    ["combiningdiacriticalmarks"]                   = { first = 0x00300, last = 0x0036F,              description = "Combining Diacritical Marks" },
259    ["combiningdiacriticalmarksextended"]           = { first = 0x01AB0, last = 0x01AFF,              description = "Combining Diacritical Marks Extended" },
260    ["combiningdiacriticalmarksforsymbols"]         = { first = 0x020D0, last = 0x020FF,              description = "Combining Diacritical Marks for Symbols" },
261    ["combiningdiacriticalmarkssupplement"]         = { first = 0x01DC0, last = 0x01DFF,              description = "Combining Diacritical Marks Supplement" },
262    ["combininghalfmarks"]                          = { first = 0x0FE20, last = 0x0FE2F,              description = "Combining Half Marks" },
263    ["commonindicnumberforms"]                      = { first = 0x0A830, last = 0x0A83F,              description = "Common Indic Number Forms" },
264    ["controlpictures"]                             = { first = 0x02400, last = 0x0243F,              description = "Control Pictures" },
265    ["coptic"]                                      = { first = 0x02C80, last = 0x02CFF, otf="copt",  description = "Coptic" },
266    ["copticepactnumbers"]                          = { first = 0x102E0, last = 0x102FF,              description = "Coptic Epact Numbers" },
267    ["countingrodnumerals"]                         = { first = 0x1D360, last = 0x1D37F,              description = "Counting Rod Numerals" },
268    ["cuneiform"]                                   = { first = 0x12000, last = 0x123FF, otf="xsux",  description = "Cuneiform" },
269    ["cuneiformnumbersandpunctuation"]              = { first = 0x12400, last = 0x1247F, otf="xsux",  description = "Cuneiform Numbers and Punctuation" },
270    ["currencysymbols"]                             = { first = 0x020A0, last = 0x020CF,              description = "Currency Symbols" },
271    ["cypriotsyllabary"]                            = { first = 0x10800, last = 0x1083F, otf="cprt",  description = "Cypriot Syllabary" },
272    ["cyprominoan"]                                 = { first = 0x12F90, last = 0x12FFF,              description = "Cypro-Minoan" },
273    ["cyrillic"]                                    = { first = 0x00400, last = 0x004FF, otf="cyrl",  description = "Cyrillic" },
274    ["cyrillicextendeda"]                           = { first = 0x02DE0, last = 0x02DFF, otf="cyrl",  description = "Cyrillic Extended-A" },
275    ["cyrillicextendedb"]                           = { first = 0x0A640, last = 0x0A69F, otf="cyrl",  description = "Cyrillic Extended-B" },
276    ["cyrillicextendedc"]                           = { first = 0x01C80, last = 0x01C8F,              description = "Cyrillic Extended-C" },
277    ["cyrillicextendedd"]                           = { first = 0x1E030, last = 0x1E08F,              description = "Cyrillic Extended-D" },
278    ["cyrillicsupplement"]                          = { first = 0x00500, last = 0x0052F, otf="cyrl",  description = "Cyrillic Supplement" },
279    ["deseret"]                                     = { first = 0x10400, last = 0x1044F, otf="dsrt",  description = "Deseret" },
280    ["devanagari"]                                  = { first = 0x00900, last = 0x0097F, otf="deva",  description = "Devanagari" },
281    ["devanagariextended"]                          = { first = 0x0A8E0, last = 0x0A8FF,              description = "Devanagari Extended" },
282    ["devanagariextendeda"]                         = { first = 0x11B00, last = 0x11B5F,              description = "Devanagari Extended-A" },
283    ["digitsarabicindic"]                           = { first = 0x00660, last = 0x00669, math = true },
284 -- ["digitsbengali"]                               = { first = 0x009E6, last = 0x009EF, math = true },
285    ["digitsbold"]                                  = { first = 0x1D7CE, last = 0x1D7D7, math = true },
286 -- ["digitsdevanagari"]                            = { first = 0x00966, last = 0x0096F, math = true },
287    ["digitsdoublestruck"]                          = { first = 0x1D7D8, last = 0x1D7E1, math = true },
288 -- ["digitsethiopic"]                              = { first = 0x01369, last = 0x01371, math = true },
289    ["digitsextendedarabicindic"]                   = { first = 0x006F0, last = 0x006F9, math = true },
290 -- ["digitsgujarati"]                              = { first = 0x00AE6, last = 0x00AEF, math = true },
291 -- ["digitsgurmukhi"]                              = { first = 0x00A66, last = 0x00A6F, math = true },
292 -- ["digitskannada"]                               = { first = 0x00CE6, last = 0x00CEF, math = true },
293 -- ["digitskhmer"]                                 = { first = 0x017E0, last = 0x017E9, math = true },
294 -- ["digitslao"]                                   = { first = 0x00ED0, last = 0x00ED9, math = true },
295    ["digitslatin"]                                 = { first = 0x00030, last = 0x00039, math = true },
296 -- ["digitsmalayalam"]                             = { first = 0x00D66, last = 0x00D6F, math = true },
297 -- ["digitsmongolian"]                             = { first = 0x01810, last = 0x01809, math = true },
298    ["digitsmonospace"]                             = { first = 0x1D7F6, last = 0x1D7FF, math = true },
299 -- ["digitsmyanmar"]                               = { first = 0x01040, last = 0x01049, math = true },
300    ["digitsnormal"]                                = { first = 0x00030, last = 0x00039, math = true },
301 -- ["digitsoriya"]                                 = { first = 0x00B66, last = 0x00B6F, math = true },
302    ["digitssansserifbold"]                         = { first = 0x1D7EC, last = 0x1D7F5, math = true },
303    ["digitssansserifnormal"]                       = { first = 0x1D7E2, last = 0x1D7EB, math = true },
304 -- ["digitstamil"]                                 = { first = 0x00030, last = 0x00039, math = true }, -- no zero
305 -- ["digitstelugu"]                                = { first = 0x00C66, last = 0x00C6F, math = true },
306 -- ["digitsthai"]                                  = { first = 0x00E50, last = 0x00E59, math = true },
307 -- ["digitstibetan"]                               = { first = 0x00F20, last = 0x00F29, math = true },
308    ["dingbats"]                                    = { first = 0x02700, last = 0x027BF,              description = "Dingbats" },
309    ["divesakuru"]                                  = { first = 0x11900, last = 0x1195F,              description = "Dives Akuru" },
310    ["dogra"]                                       = { first = 0x11800, last = 0x1184F,              description = "Dogra" },
311    ["dominotiles"]                                 = { first = 0x1F030, last = 0x1F09F,              description = "Domino Tiles" },
312    ["duployan"]                                    = { first = 0x1BC00, last = 0x1BC9F,              description = "Duployan" },
313    ["earlydynasticcuneiform"]                      = { first = 0x12480, last = 0x1254F,              description = "Early Dynastic Cuneiform" },
314    ["egyptianhieroglyphformatcontrols"]            = { first = 0x13430, last = 0x1345F,              description = "Egyptian Hieroglyph Format Controls" },
315    ["egyptianhieroglyphs"]                         = { first = 0x13000, last = 0x1342F,              description = "Egyptian Hieroglyphs" },
316    ["elbasan"]                                     = { first = 0x10500, last = 0x1052F,              description = "Elbasan" },
317    ["elymaic"]                                     = { first = 0x10FE0, last = 0x10FFF,              description = "Elymaic" },
318    ["emoticons"]                                   = { first = 0x1F600, last = 0x1F64F,              description = "Emoticons" },
319    ["enclosedalphanumerics"]                       = { first = 0x02460, last = 0x024FF,              description = "Enclosed Alphanumerics" },
320    ["enclosedalphanumericsupplement"]              = { first = 0x1F100, last = 0x1F1FF,              description = "Enclosed Alphanumeric Supplement" },
321    ["enclosedcjklettersandmonths"]                 = { first = 0x03200, last = 0x032FF,              description = "Enclosed CJK Letters and Months" },
322    ["enclosedideographicsupplement"]               = { first = 0x1F200, last = 0x1F2FF,              description = "Enclosed Ideographic Supplement" },
323    ["ethiopic"]                                    = { first = 0x01200, last = 0x0137F, otf="ethi",  description = "Ethiopic" },
324    ["ethiopicextended"]                            = { first = 0x02D80, last = 0x02DDF, otf="ethi",  description = "Ethiopic Extended" },
325    ["ethiopicextendeda"]                           = { first = 0x0AB00, last = 0x0AB2F,              description = "Ethiopic Extended-A" },
326    ["ethiopicextendedb"]                           = { first = 0x1E7E0, last = 0x1E7FF,              description = "Ethiopic Extended-B" },
327    ["ethiopicsupplement"]                          = { first = 0x01380, last = 0x0139F, otf="ethi",  description = "Ethiopic Supplement" },
328    ["generalpunctuation"]                          = { first = 0x02000, last = 0x0206F,              description = "General Punctuation" },
329    ["geometricshapes"]                             = { first = 0x025A0, last = 0x025FF, math = true, description = "Geometric Shapes" },
330    ["geometricshapesextended"]                     = { first = 0x1F780, last = 0x1F7FF,              description = "Geometric Shapes Extended" },
331    ["georgian"]                                    = { first = 0x010A0, last = 0x010FF, otf="geor",  description = "Georgian" },
332    ["georgianextended"]                            = { first = 0x01C90, last = 0x01CBF,              description = "Georgian Extended" },
333    ["georgiansupplement"]                          = { first = 0x02D00, last = 0x02D2F, otf="geor",  description = "Georgian Supplement" },
334    ["glagolitic"]                                  = { first = 0x02C00, last = 0x02C5F, otf="glag",  description = "Glagolitic" },
335    ["glagoliticsupplement"]                        = { first = 0x1E000, last = 0x1E02F,              description = "Glagolitic Supplement" },
336    ["gothic"]                                      = { first = 0x10330, last = 0x1034F, otf="goth",  description = "Gothic" },
337    ["grantha"]                                     = { first = 0x11300, last = 0x1137F,              description = "Grantha" },
338    ["greekandcoptic"]                              = { first = 0x00370, last = 0x003FF, otf="grek",  description = "Greek and Coptic" },
339    ["greekextended"]                               = { first = 0x01F00, last = 0x01FFF, otf="grek",  description = "Greek Extended" },
340    ["gujarati"]                                    = { first = 0x00A80, last = 0x00AFF, otf="gujr",  description = "Gujarati" },
341    ["gunjalagondi"]                                = { first = 0x11D60, last = 0x11DAF,              description = "Gunjala Gondi" },
342    ["gurmukhi"]                                    = { first = 0x00A00, last = 0x00A7F, otf="guru",  description = "Gurmukhi" },
343    ["halfwidthandfullwidthforms"]                  = { first = 0x0FF00, last = 0x0FFEF,              description = "Halfwidth and Fullwidth Forms" },
344    ["hangulcompatibilityjamo"]                     = { first = 0x03130, last = 0x0318F, otf="jamo",  description = "Hangul Compatibility Jamo" },
345    ["hanguljamo"]                                  = { first = 0x01100, last = 0x011FF, otf="jamo",  description = "Hangul Jamo" },
346    ["hanguljamoextendeda"]                         = { first = 0x0A960, last = 0x0A97F,              description = "Hangul Jamo Extended-A" },
347    ["hanguljamoextendedb"]                         = { first = 0x0D7B0, last = 0x0D7FF,              description = "Hangul Jamo Extended-B" },
348    ["hangulsyllables"]                             = { first = 0x0AC00, last = 0x0D7AF, otf="hang",  description = "Hangul Syllables" },
349    ["hanifirohingya"]                              = { first = 0x10D00, last = 0x10D3F,              description = "Hanifi Rohingya" },
350    ["hanunoo"]                                     = { first = 0x01720, last = 0x0173F, otf="hano",  description = "Hanunoo" },
351    ["hatran"]                                      = { first = 0x108E0, last = 0x108FF,              description = "Hatran" },
352    ["hebrew"]                                      = { first = 0x00590, last = 0x005FF, otf="hebr",  description = "Hebrew" },
353    ["highprivateusesurrogates"]                    = { first = 0x0DB80, last = 0x0DBFF,              description = "High Private Use Surrogates" },
354    ["highsurrogates"]                              = { first = 0x0D800, last = 0x0DB7F,              description = "High Surrogates" },
355    ["hiragana"]                                    = { first = 0x03040, last = 0x0309F, otf="kana",  description = "Hiragana" },
356    ["ideographicdescriptioncharacters"]            = { first = 0x02FF0, last = 0x02FFF,              description = "Ideographic Description Characters" },
357    ["ideographicsymbolsandpunctuation"]            = { first = 0x16FE0, last = 0x16FFF,              description = "Ideographic Symbols and Punctuation" },
358    ["imperialaramaic"]                             = { first = 0x10840, last = 0x1085F,              description = "Imperial Aramaic" },
359    ["indicsiyaqnumbers"]                           = { first = 0x1EC70, last = 0x1ECBF,              description = "Indic Siyaq Numbers" },
360    ["inscriptionalpahlavi"]                        = { first = 0x10B60, last = 0x10B7F,              description = "Inscriptional Pahlavi" },
361    ["inscriptionalparthian"]                       = { first = 0x10B40, last = 0x10B5F,              description = "Inscriptional Parthian" },
362    ["ipaextensions"]                               = { first = 0x00250, last = 0x002AF,              description = "IPA Extensions" },
363    ["javanese"]                                    = { first = 0x0A980, last = 0x0A9DF,              description = "Javanese" },
364    ["kaithi"]                                      = { first = 0x11080, last = 0x110CF,              description = "Kaithi" },
365    ["kaktoviknumerals"]                            = { first = 0x1D2C0, last = 0x1D2DF,              description = "Kaktovik Numerals" },
366    ["kanaextendeda"]                               = { first = 0x1B100, last = 0x1B12F,              description = "Kana Extended-A" },
367    ["kanaextendedb"]                               = { first = 0x1AFF0, last = 0x1AFFF,              description = "Kana Extended-B" },
368    ["kanasupplement"]                              = { first = 0x1B000, last = 0x1B0FF,              description = "Kana Supplement" },
369    ["kanbun"]                                      = { first = 0x03190, last = 0x0319F,              description = "Kanbun" },
370    ["kangxiradicals"]                              = { first = 0x02F00, last = 0x02FDF,              description = "Kangxi Radicals" },
371    ["kannada"]                                     = { first = 0x00C80, last = 0x00CFF, otf="knda",  description = "Kannada" },
372    ["katakana"]                                    = { first = 0x030A0, last = 0x030FF, otf="kana",  description = "Katakana" },
373    ["katakanaphoneticextensions"]                  = { first = 0x031F0, last = 0x031FF, otf="kana",  description = "Katakana Phonetic Extensions" },
374    ["kayahli"]                                     = { first = 0x0A900, last = 0x0A92F,              description = "Kayah Li" },
375    ["kawi"]                                        = { first = 0x11F00, last = 0x11F5F,              description = "Kawi" },
376    ["kharoshthi"]                                  = { first = 0x10A00, last = 0x10A5F, otf="khar",  description = "Kharoshthi" },
377    ["khitansmallscript"]                           = { first = 0x18B00, last = 0x18CFF,              description = "Khitan Small Script" },
378    ["khmer"]                                       = { first = 0x01780, last = 0x017FF, otf="khmr",  description = "Khmer" },
379    ["khmersymbols"]                                = { first = 0x019E0, last = 0x019FF, otf="khmr",  description = "Khmer Symbols" },
380    ["khojki"]                                      = { first = 0x11200, last = 0x1124F,              description = "Khojki" },
381    ["khudawadi"]                                   = { first = 0x112B0, last = 0x112FF,              description = "Khudawadi" },
382    ["lao"]                                         = { first = 0x00E80, last = 0x00EFF, otf="lao",   description = "Lao" },
383    ["latinextendeda"]                              = { first = 0x00100, last = 0x0017F, otf="latn",  description = "Latin Extended-A" },
384    ["latinextendedadditional"]                     = { first = 0x01E00, last = 0x01EFF, otf="latn",  description = "Latin Extended Additional" },
385    ["latinextendedb"]                              = { first = 0x00180, last = 0x0024F, otf="latn",  description = "Latin Extended-B" },
386    ["latinextendedc"]                              = { first = 0x02C60, last = 0x02C7F, otf="latn",  description = "Latin Extended-C" },
387    ["latinextendedd"]                              = { first = 0x0A720, last = 0x0A7FF, otf="latn",  description = "Latin Extended-D" },
388    ["latinextendede"]                              = { first = 0x0AB30, last = 0x0AB6F,              description = "Latin Extended-E" },
389    ["latinextendedf"]                              = { first = 0x10780, last = 0x107BF,              description = "Latin Extended-F" },
390    ["latinextendedg"]                              = { first = 0x1DF00, last = 0x1DFFF,              description = "Latin Extended-G" },
391    ["latinsupplement"]                             = { first = 0x00080, last = 0x000FF, otf="latn",  description = "Latin-1 Supplement" },
392    ["lepcha"]                                      = { first = 0x01C00, last = 0x01C4F,              description = "Lepcha" },
393    ["letterlikesymbols"]                           = { first = 0x02100, last = 0x0214F, math = true, description = "Letterlike Symbols" },
394    ["limbu"]                                       = { first = 0x01900, last = 0x0194F, otf="limb",  description = "Limbu" },
395    ["lineara"]                                     = { first = 0x10600, last = 0x1077F,              description = "Linear A" },
396    ["linearbideograms"]                            = { first = 0x10080, last = 0x100FF, otf="linb",  description = "Linear B Ideograms" },
397    ["linearbsyllabary"]                            = { first = 0x10000, last = 0x1007F, otf="linb",  description = "Linear B Syllabary" },
398    ["lisu"]                                        = { first = 0x0A4D0, last = 0x0A4FF,              description = "Lisu" },
399    ["lisusupplement"]                              = { first = 0x11FB0, last = 0x11FBF,              description = "Lisu Supplement" },
400    ["lowercasebold"]                               = { first = 0x1D41A, last = 0x1D433, math = true },
401    ["lowercaseboldfraktur"]                        = { first = 0x1D586, last = 0x1D59F, math = true },
402    ["lowercasebolditalic"]                         = { first = 0x1D482, last = 0x1D49B, math = true, italic = true },
403    ["lowercaseboldscript"]                         = { first = 0x1D4EA, last = 0x1D503, math = true, italic = true },
404    ["lowercasedoublestruck"]                       = { first = 0x1D552, last = 0x1D56B, math = true },
405    ["lowercasefraktur"]                            = { first = 0x1D51E, last = 0x1D537, math = true },
406    ["lowercasegreekbold"]                          = { first = 0x1D6C2, last = 0x1D6DB, math = true },
407    ["lowercasegreekbolditalic"]                    = { first = 0x1D736, last = 0x1D74F, math = true, italic = true },
408    ["lowercasegreekitalic"]                        = { first = 0x1D6FC, last = 0x1D715, math = true, italic = true },
409    ["lowercasegreeknormal"]                        = { first = 0x003B1, last = 0x003C9, math = true },
410    ["lowercasegreeksansserifbold"]                 = { first = 0x1D770, last = 0x1D789, math = true },
411    ["lowercasegreeksansserifbolditalic"]           = { first = 0x1D7AA, last = 0x1D7C3, math = true, italic = true },
412    ["lowercaseitalic"]                             = { first = 0x1D44E, last = 0x1D467, math = true, italic = true },
413    ["lowercasemonospace"]                          = { first = 0x1D68A, last = 0x1D6A3, math = true },
414    ["lowercasenormal"]                             = { first = 0x00061, last = 0x0007A, math = true },
415    ["lowercasesansserifbold"]                      = { first = 0x1D5EE, last = 0x1D607, math = true },
416    ["lowercasesansserifbolditalic"]                = { first = 0x1D656, last = 0x1D66F, math = true, italic = true },
417    ["lowercasesansserifitalic"]                    = { first = 0x1D622, last = 0x1D63B, math = true, italic = true },
418    ["lowercasesansserifnormal"]                    = { first = 0x1D5BA, last = 0x1D5D3, math = true },
419    ["lowercasescript"]                             = { first = 0x1D4B6, last = 0x1D4CF, math = true, italic = true },
420    ["lowsurrogates"]                               = { first = 0x0DC00, last = 0x0DFFF,              description = "Low Surrogates" },
421    ["lycian"]                                      = { first = 0x10280, last = 0x1029F,              description = "Lycian" },
422    ["lydian"]                                      = { first = 0x10920, last = 0x1093F,              description = "Lydian" },
423    ["mahajani"]                                    = { first = 0x11150, last = 0x1117F,              description = "Mahajani" },
424    ["mahjongtiles"]                                = { first = 0x1F000, last = 0x1F02F,              description = "Mahjong Tiles" },
425    ["makasar"]                                     = { first = 0x11EE0, last = 0x11EFF,              description = "Makasar" },
426    ["malayalam"]                                   = { first = 0x00D00, last = 0x00D7F, otf="mlym",  description = "Malayalam" },
427    ["mandaic"]                                     = { first = 0x00840, last = 0x0085F, otf="mand",  description = "Mandaic" },
428    ["manichaean"]                                  = { first = 0x10AC0, last = 0x10AFF,              description = "Manichaean" },
429    ["marchen"]                                     = { first = 0x11C70, last = 0x11CBF,              description = "Marchen" },
430    ["masaramgondi"]                                = { first = 0x11D00, last = 0x11D5F,              description = "Masaram Gondi" },
431    ["mathematicalalphanumericsymbols"]             = { first = 0x1D400, last = 0x1D7FF, math = true, description = "Mathematical Alphanumeric Symbols" },
432    ["mathematicaloperators"]                       = { first = 0x02200, last = 0x022FF, math = true, description = "Mathematical Operators" },
433    ["mayannumerals"]                               = { first = 0x1D2E0, last = 0x1D2FF,              description = "Mayan Numerals" },
434    ["medefaidrin"]                                 = { first = 0x16E40, last = 0x16E9F,              description = "Medefaidrin" },
435    ["meeteimayek"]                                 = { first = 0x0ABC0, last = 0x0ABFF,              description = "Meetei Mayek" },
436    ["meeteimayekextensions"]                       = { first = 0x0AAE0, last = 0x0AAFF,              description = "Meetei Mayek Extensions" },
437    ["mendekikakui"]                                = { first = 0x1E800, last = 0x1E8DF,              description = "Mende Kikakui" },
438    ["meroiticcursive"]                             = { first = 0x109A0, last = 0x109FF,              description = "Meroitic Cursive" },
439    ["meroitichieroglyphs"]                         = { first = 0x10980, last = 0x1099F,              description = "Meroitic Hieroglyphs" },
440    ["miao"]                                        = { first = 0x16F00, last = 0x16F9F,              description = "Miao" },
441    ["miscellaneousmathematicalsymbolsa"]           = { first = 0x027C0, last = 0x027EF, math = true, description = "Miscellaneous Mathematical Symbols-A" },
442    ["miscellaneousmathematicalsymbolsb"]           = { first = 0x02980, last = 0x029FF, math = true, description = "Miscellaneous Mathematical Symbols-B" },
443    ["miscellaneoussymbols"]                        = { first = 0x02600, last = 0x026FF, math = true, description = "Miscellaneous Symbols" },
444    ["miscellaneoussymbolsandarrows"]               = { first = 0x02B00, last = 0x02BFF, math = true, description = "Miscellaneous Symbols and Arrows" },
445    ["miscellaneoussymbolsandpictographs"]          = { first = 0x1F300, last = 0x1F5FF,              description = "Miscellaneous Symbols and Pictographs" },
446    ["miscellaneoustechnical"]                      = { first = 0x02300, last = 0x023FF, math = true, description = "Miscellaneous Technical" },
447    ["modi"]                                        = { first = 0x11600, last = 0x1165F,              description = "Modi" },
448    ["modifiertoneletters"]                         = { first = 0x0A700, last = 0x0A71F,              description = "Modifier Tone Letters" },
449    ["mongolian"]                                   = { first = 0x01800, last = 0x018AF, otf="mong",  description = "Mongolian" },
450    ["mongoliansupplement"]                         = { first = 0x11660, last = 0x1167F,              description = "Mongolian Supplement" },
451    ["mro"]                                         = { first = 0x16A40, last = 0x16A6F,              description = "Mro" },
452    ["multani"]                                     = { first = 0x11280, last = 0x112AF,              description = "Multani" },
453    ["musicalsymbols"]                              = { first = 0x1D100, last = 0x1D1FF, otf="musc",  description = "Musical Symbols" },
454    ["myanmar"]                                     = { first = 0x01000, last = 0x0109F, otf="mymr",  description = "Myanmar" },
455    ["myanmarextendeda"]                            = { first = 0x0AA60, last = 0x0AA7F,              description = "Myanmar Extended-A" },
456    ["myanmarextendedb"]                            = { first = 0x0A9E0, last = 0x0A9FF,              description = "Myanmar Extended-B" },
457    ["nabataean"]                                   = { first = 0x10880, last = 0x108AF,              description = "Nabataean" },
458    ["nagmundari"]                                  = { first = 0x1E4D0, last = 0x1E4FF,              description = "Nag Mundari" },
459    ["nandinagari"]                                 = { first = 0x119A0, last = 0x119FF,              description = "Nandinagari" },
460    ["newa"]                                        = { first = 0x11400, last = 0x1147F,              description = "Newa" },
461    ["newtailue"]                                   = { first = 0x01980, last = 0x019DF,              description = "New Tai Lue" },
462    ["nko"]                                         = { first = 0x007C0, last = 0x007FF, otf="nko",   description = "NKo" },
463    ["numberforms"]                                 = { first = 0x02150, last = 0x0218F,              description = "Number Forms" },
464    ["nushu"]                                       = { first = 0x1B170, last = 0x1B2FF,              description = "Nushu" },
465    ["nyiakengpuachuehmong"]                        = { first = 0x1E100, last = 0x1E14F,              description = "Nyiakeng Puachue Hmong" },
466    ["ogham"]                                       = { first = 0x01680, last = 0x0169F, otf="ogam",  description = "Ogham" },
467    ["olchiki"]                                     = { first = 0x01C50, last = 0x01C7F,              description = "Ol Chiki" },
468    ["oldhungarian"]                                = { first = 0x10C80, last = 0x10CFF,              description = "Old Hungarian" },
469    ["olditalic"]                                   = { first = 0x10300, last = 0x1032F, otf="ital",  description = "Old Italic" },
470    ["oldnortharabian"]                             = { first = 0x10A80, last = 0x10A9F,              description = "Old North Arabian" },
471    ["oldpermic"]                                   = { first = 0x10350, last = 0x1037F,              description = "Old Permic" },
472    ["oldpersian"]                                  = { first = 0x103A0, last = 0x103DF, otf="xpeo",  description = "Old Persian" },
473    ["oldsogdian"]                                  = { first = 0x10F00, last = 0x10F2F,              description = "Old Sogdian" },
474    ["oldsoutharabian"]                             = { first = 0x10A60, last = 0x10A7F,              description = "Old South Arabian" },
475    ["oldturkic"]                                   = { first = 0x10C00, last = 0x10C4F,              description = "Old Turkic" },
476    ["olduyghur"]                                   = { first = 0x10F70, last = 0x10FAF,              description = "Old Uyghur" },
477    ["opticalcharacterrecognition"]                 = { first = 0x02440, last = 0x0245F,              description = "Optical Character Recognition" },
478    ["oriya"]                                       = { first = 0x00B00, last = 0x00B7F, otf="orya",  description = "Oriya" },
479    ["ornamentaldingbats"]                          = { first = 0x1F650, last = 0x1F67F,              description = "Ornamental Dingbats" },
480    ["osage"]                                       = { first = 0x104B0, last = 0x104FF,              description = "Osage" },
481    ["osmanya"]                                     = { first = 0x10480, last = 0x104AF, otf="osma",  description = "Osmanya" },
482    ["ottomansiyaqnumbers"]                         = { first = 0x1ED00, last = 0x1ED4F,              description = "Ottoman Siyaq Numbers" },
483    ["pahawhhmong"]                                 = { first = 0x16B00, last = 0x16B8F,              description = "Pahawh Hmong" },
484    ["palmyrene"]                                   = { first = 0x10860, last = 0x1087F,              description = "Palmyrene" },
485    ["paucinhau"]                                   = { first = 0x11AC0, last = 0x11AFF,              description = "Pau Cin Hau" },
486    ["phagspa"]                                     = { first = 0x0A840, last = 0x0A87F, otf="phag",  description = "Phags-pa" },
487    ["phaistosdisc"]                                = { first = 0x101D0, last = 0x101FF,              description = "Phaistos Disc" },
488    ["phoenician"]                                  = { first = 0x10900, last = 0x1091F, otf="phnx",  description = "Phoenician" },
489    ["phoneticextensions"]                          = { first = 0x01D00, last = 0x01D7F,              description = "Phonetic Extensions" },
490    ["phoneticextensionssupplement"]                = { first = 0x01D80, last = 0x01DBF,              description = "Phonetic Extensions Supplement" },
491    ["playingcards"]                                = { first = 0x1F0A0, last = 0x1F0FF,              description = "Playing Cards" },
492    ["privateusearea"]                              = { first = 0x0E000, last = 0x0F8FF,              description = "Private Use Area" },
493    ["psalterpahlavi"]                              = { first = 0x10B80, last = 0x10BAF,              description = "Psalter Pahlavi" },
494    ["rejang"]                                      = { first = 0x0A930, last = 0x0A95F,              description = "Rejang" },
495    ["ruminumeralsymbols"]                          = { first = 0x10E60, last = 0x10E7F,              description = "Rumi Numeral Symbols" },
496    ["runic"]                                       = { first = 0x016A0, last = 0x016FF, otf="runr",  description = "Runic" },
497    ["samaritan"]                                   = { first = 0x00800, last = 0x0083F,              description = "Samaritan" },
498    ["saurashtra"]                                  = { first = 0x0A880, last = 0x0A8DF,              description = "Saurashtra" },
499    ["sharada"]                                     = { first = 0x11180, last = 0x111DF,              description = "Sharada" },
500    ["shavian"]                                     = { first = 0x10450, last = 0x1047F, otf="shaw",  description = "Shavian" },
501    ["shorthandformatcontrols"]                     = { first = 0x1BCA0, last = 0x1BCAF,              description = "Shorthand Format Controls" },
502    ["siddham"]                                     = { first = 0x11580, last = 0x115FF,              description = "Siddham" },
503    ["sinhala"]                                     = { first = 0x00D80, last = 0x00DFF, otf="sinh",  description = "Sinhala" },
504    ["sinhalaarchaicnumbers"]                       = { first = 0x111E0, last = 0x111FF,              description = "Sinhala Archaic Numbers" },
505    ["smallformvariants"]                           = { first = 0x0FE50, last = 0x0FE6F,              description = "Small Form Variants" },
506    ["smallkanaextension"]                          = { first = 0x1B130, last = 0x1B16F,              description = "Small Kana Extension" },
507    ["sogdian"]                                     = { first = 0x10F30, last = 0x10F6F,              description = "Sogdian" },
508    ["sorasompeng"]                                 = { first = 0x110D0, last = 0x110FF,              description = "Sora Sompeng" },
509    ["soyombo"]                                     = { first = 0x11A50, last = 0x11AAF,              description = "Soyombo" },
510    ["spacingmodifierletters"]                      = { first = 0x002B0, last = 0x002FF,              description = "Spacing Modifier Letters" },
511    ["specials"]                                    = { first = 0x0FFF0, last = 0x0FFFF,              description = "Specials" },
512    ["sundanese"]                                   = { first = 0x01B80, last = 0x01BBF,              description = "Sundanese" },
513    ["sundanesesupplement"]                         = { first = 0x01CC0, last = 0x01CCF,              description = "Sundanese Supplement" },
514    ["superscriptsandsubscripts"]                   = { first = 0x02070, last = 0x0209F,              description = "Superscripts and Subscripts" },
515    ["supplementalarrowsa"]                         = { first = 0x027F0, last = 0x027FF, math = true, description = "Supplemental Arrows-A" },
516    ["supplementalarrowsb"]                         = { first = 0x02900, last = 0x0297F, math = true, description = "Supplemental Arrows-B" },
517    ["supplementalarrowsc"]                         = { first = 0x1F800, last = 0x1F8FF, math = true, description = "Supplemental Arrows-C" },
518    ["supplementalmathematicaloperators"]           = { first = 0x02A00, last = 0x02AFF, math = true, description = "Supplemental Mathematical Operators" },
519    ["supplementalpunctuation"]                     = { first = 0x02E00, last = 0x02E7F,              description = "Supplemental Punctuation" },
520    ["supplementalsymbolsandpictographs"]           = { first = 0x1F900, last = 0x1F9FF,              description = "Supplemental Symbols and Pictographs" },
521    ["supplementaryprivateuseareaa"]                = { first = 0xF0000, last = 0xFFFFF,              description = "Supplementary Private Use Area-A" },
522    ["supplementaryprivateuseareab"]                = { first = 0x100000,last = 0x10FFFF,             description = "Supplementary Private Use Area-B" },
523    ["suttonsignwriting"]                           = { first = 0x1D800, last = 0x1DAAF,              description = "Sutton SignWriting" },
524    ["sylotinagri"]                                 = { first = 0x0A800, last = 0x0A82F, otf="sylo",  description = "Syloti Nagri" },
525    ["symbolsandpictographsextendeda"]              = { first = 0x1FA70, last = 0x1FAFF,              description = "Symbols and Pictographs Extended-A" },
526    ["symbolsforlegacycomputing"]                   = { first = 0x1FB00, last = 0x1FBFF,              description = "Symbols for Legacy Computing" },
527    ["syriac"]                                      = { first = 0x00700, last = 0x0074F, otf="syrc",  description = "Syriac" },
528    ["syriacsupplement"]                            = { first = 0x00860, last = 0x0086F,              description = "Syriac Supplement" },
529    ["tagalog"]                                     = { first = 0x01700, last = 0x0171F, otf="tglg",  description = "Tagalog" },
530    ["tagbanwa"]                                    = { first = 0x01760, last = 0x0177F, otf="tagb",  description = "Tagbanwa" },
531    ["tags"]                                        = { first = 0xE0000, last = 0xE007F,              description = "Tags" },
532    ["taile"]                                       = { first = 0x01950, last = 0x0197F, otf="tale",  description = "Tai Le" },
533    ["taitham"]                                     = { first = 0x01A20, last = 0x01AAF,              description = "Tai Tham" },
534    ["taiviet"]                                     = { first = 0x0AA80, last = 0x0AADF,              description = "Tai Viet" },
535    ["taixuanjingsymbols"]                          = { first = 0x1D300, last = 0x1D35F,              description = "Tai Xuan Jing Symbols" },
536    ["takri"]                                       = { first = 0x11680, last = 0x116CF,              description = "Takri" },
537    ["tamil"]                                       = { first = 0x00B80, last = 0x00BFF, otf="taml",  description = "Tamil" },
538    ["tamilsupplement"]                             = { first = 0x11FC0, last = 0x11FFF,              description = "Tamil Supplement" },
539    ["tangut"]                                      = { first = 0x17000, last = 0x187FF,              description = "Tangut" },
540    ["tangutsupplement"]                            = { first = 0x18D00, last = 0x18D7F,              description = "Tangut Supplement" },
541    ["tangutcomponents"]                            = { first = 0x18800, last = 0x18AFF,              description = "Tangut Components" },
542    ["tangsa"]                                      = { first = 0x16A70, last = 0x16ACF,              description = "Tangsa" },
543    ["telugu"]                                      = { first = 0x00C00, last = 0x00C7F, otf="telu",  description = "Telugu" },
544    ["thaana"]                                      = { first = 0x00780, last = 0x007BF, otf="thaa",  description = "Thaana" },
545    ["thai"]                                        = { first = 0x00E00, last = 0x00E7F, otf="thai",  description = "Thai" },
546    ["tibetan"]                                     = { first = 0x00F00, last = 0x00FFF, otf="tibt",  description = "Tibetan" },
547    ["tifinagh"]                                    = { first = 0x02D30, last = 0x02D7F, otf="tfng",  description = "Tifinagh" },
548    ["tirhuta"]                                     = { first = 0x11480, last = 0x114DF,              description = "Tirhuta" },
549    ["toto"]                                        = { first = 0x1E290, last = 0x1E2BF,              description = "Toto" },
550    ["transportandmapsymbols"]                      = { first = 0x1F680, last = 0x1F6FF,              description = "Transport and Map Symbols" },
551    ["ugaritic"]                                    = { first = 0x10380, last = 0x1039F, otf="ugar",  description = "Ugaritic" },
552    ["unifiedcanadianaboriginalsyllabics"]          = { first = 0x01400, last = 0x0167F, otf="cans",  description = "Unified Canadian Aboriginal Syllabics" },
553    ["unifiedcanadianaboriginalsyllabicsextended"]  = { first = 0x018B0, last = 0x018FF,              description = "Unified Canadian Aboriginal Syllabics Extended" },
554    ["unifiedcanadianaboriginalsyllabicsextendeda"] = { first = 0x11AB0, last = 0x11ABF,              description = "Unified Canadian Aboriginal Syllabics Extended-A" },
555    ["uppercasebold"]                               = { first = 0x1D400, last = 0x1D419, math = true },
556    ["uppercaseboldfraktur"]                        = { first = 0x1D56C, last = 0x1D585, math = true },
557    ["uppercasebolditalic"]                         = { first = 0x1D468, last = 0x1D481, math = true, italic = true },
558    ["uppercaseboldscript"]                         = { first = 0x1D4D0, last = 0x1D4E9, math = true, italic = true },
559    ["uppercasedoublestruck"]                       = { first = 0x1D538, last = 0x1D551, math = true }, -- gaps are filled in elsewhere
560    ["uppercasefraktur"]                            = { first = 0x1D504, last = 0x1D51D, math = true },
561    ["uppercasegreekbold"]                          = { first = 0x1D6A8, last = 0x1D6C1, math = true },
562    ["uppercasegreekbolditalic"]                    = { first = 0x1D71C, last = 0x1D735, math = true, italic = true },
563    ["uppercasegreekitalic"]                        = { first = 0x1D6E2, last = 0x1D6FB, math = true, italic = true },
564    ["uppercasegreeknormal"]                        = { first = 0x00391, last = 0x003AA, math = true },
565    ["uppercasegreeksansserifbold"]                 = { first = 0x1D756, last = 0x1D76F, math = true },
566    ["uppercasegreeksansserifbolditalic"]           = { first = 0x1D790, last = 0x1D7A9, math = true, italic = true },
567    ["uppercaseitalic"]                             = { first = 0x1D434, last = 0x1D44D, math = true, italic = true },
568    ["uppercasemonospace"]                          = { first = 0x1D670, last = 0x1D689, math = true },
569    ["uppercasenormal"]                             = { first = 0x00041, last = 0x0005A, math = true },
570    ["uppercasesansserifbold"]                      = { first = 0x1D5D4, last = 0x1D5ED, math = true },
571    ["uppercasesansserifbolditalic"]                = { first = 0x1D63C, last = 0x1D655, math = true, italic = true },
572    ["uppercasesansserifitalic"]                    = { first = 0x1D608, last = 0x1D621, math = true, italic = true },
573    ["uppercasesansserifnormal"]                    = { first = 0x1D5A0, last = 0x1D5B9, math = true },
574    ["uppercasescript"]                             = { first = 0x1D49C, last = 0x1D4B5, math = true, italic = true },
575    ["vai"]                                         = { first = 0x0A500, last = 0x0A63F,              description = "Vai" },
576    ["variationselectors"]                          = { first = 0x0FE00, last = 0x0FE0F,              description = "Variation Selectors" },
577    ["variationselectorssupplement"]                = { first = 0xE0100, last = 0xE01EF,              description = "Variation Selectors Supplement" },
578    ["vedicextensions"]                             = { first = 0x01CD0, last = 0x01CFF,              description = "Vedic Extensions" },
579    ["verticalforms"]                               = { first = 0x0FE10, last = 0x0FE1F,              description = "Vertical Forms" },
580    ["vithkuqi"]                                    = { first = 0x10570, last = 0x105BF,              description = "Vithkuqi" },
581    ["wancho"]                                      = { first = 0x1E2C0, last = 0x1E2FF,              description = "Wancho" },
582    ["warangciti"]                                  = { first = 0x118A0, last = 0x118FF,              description = "Warang Citi" },
583    ["yezidi"]                                      = { first = 0x10E80, last = 0x10EBF,              description = "Yezidi" },
584    ["yijinghexagramsymbols"]                       = { first = 0x04DC0, last = 0x04DFF, otf="yi",    description = "Yijing Hexagram Symbols" },
585    ["yiradicals"]                                  = { first = 0x0A490, last = 0x0A4CF, otf="yi",    description = "Yi Radicals" },
586    ["yisyllables"]                                 = { first = 0x0A000, last = 0x0A48F, otf="yi",    description = "Yi Syllables" },
587    ["zanabazarsquare"]                             = { first = 0x11A00, last = 0x11A4F,              description = "Zanabazar Square" },
588    ["znamennymusicalnotation"]                     = { first = 0x1CF00, last = 0x1CFCF,              description = "Znamenny Musical Notation" },
589
590    -- The calligraphic shapes are different from script shapes but don't have a dedicated
591    -- range so we make one. An example of a font that has them is Lucida but we also drop
592    -- them into other fonts.
593
594    ["lowercasecalligraphic"]     = { first = 0x100000, last = 0x100019, math = true },
595    ["uppercasecalligraphic"]     = { first = 0x100020, last = 0x100039, math = true },
596    ["lowercaseboldcalligraphic"] = { first = 0x100040, last = 0x100059, math = true },
597    ["uppercaseboldcalligraphic"] = { first = 0x100060, last = 0x100079, math = true },
598
599    -- At the same time we reserve(d) some extra greek alphabets and, surprise, a font like
600    -- stixtwo actually has them, so we enable them now.
601
602    ["lowercasesansgreek"]        = { first = 0x100080, last = 0x100099, math = true },
603    ["uppercasesansgreek"]        = { first = 0x1000A0, last = 0x1000B9, math = true },
604    ["lowercaseitalicsansgreek"]  = { first = 0x1000C0, last = 0x1000D9, math = true },
605    ["uppercaseitalicsansgreek"]  = { first = 0x1000E0, last = 0x1000F9, math = true },
606
607    -- Maybe this one also makes sense, although the fact that all these extra alphabets
608    -- were not made part of unicode math (combined with the holes in alphabets) indicates
609    -- that there was no perceived need for them.
610
611    ["lowercaseblackboarditalic"] = { first = 0x100100, last = 0x100119, math = true },
612    ["uppercaseblackboarditalic"] = { first = 0x100120, last = 0x100139, math = true },
613
614    -- Anyway, all permutations at some point might show up, but it might take decades
615    -- before the tex math dev community catches on. In the end it is up to microsoft to
616    -- take the lead, just as with other unicode math and fonts.
617    --
618    -- Also, it would be a bit pathetic to add more alphabets natively to unicode on
619    -- the one hand while accepting these gaps in existing alphabets and not having a
620    -- native upright greek math alphabet either in order to distinguish from greek
621    -- text. Either we go semantic or we go shapes, but a mix is only confusing.
622
623}
624
625-- moved from math-act.lua to here:
626
627-- operators    : 0x02200
628-- symbolsa     : 0x02701
629-- symbolsb     : 0x02901
630-- supplemental : 0x02A00
631
632blocks.lowercaseitalic.gaps = {
633    [0x1D455] = 0x0210E, -- ℎ h
634}
635
636blocks.uppercasescript.gaps = {
637    [0x1D49D] = 0x0212C, -- ℬ script B
638    [0x1D4A0] = 0x02130, -- ℰ script E
639    [0x1D4A1] = 0x02131, -- ℱ script F
640    [0x1D4A3] = 0x0210B, -- ℋ script H
641    [0x1D4A4] = 0x02110, -- ℐ script I
642    [0x1D4A7] = 0x02112, -- ℒ script L
643    [0x1D4A8] = 0x02133, -- ℳ script M
644    [0x1D4AD] = 0x0211B, -- ℛ script R
645}
646
647blocks.lowercasescript.gaps = {
648    [0x1D4BA] = 0x0212F, -- ℯ script e
649    [0x1D4BC] = 0x0210A, -- ℊ script g
650    [0x1D4C4] = 0x02134, -- ℴ script o
651}
652
653blocks.uppercasefraktur.gaps = {
654    [0x1D506] = 0x0212D, -- ℭ fraktur C
655    [0x1D50B] = 0x0210C, -- ℌ fraktur H
656    [0x1D50C] = 0x02111, -- ℑ fraktur I
657    [0x1D515] = 0x0211C, -- ℜ fraktur R
658    [0x1D51D] = 0x02128, -- ℨ fraktur Z
659}
660
661blocks.uppercasedoublestruck.gaps = {
662    [0x1D53A] = 0x02102, -- ℂ bb C
663    [0x1D53F] = 0x0210D, -- ℍ bb H
664    [0x1D545] = 0x02115, -- ℕ bb N
665    [0x1D547] = 0x02119, -- ℙ bb P
666    [0x1D548] = 0x0211A, -- ℚ bb Q
667    [0x1D549] = 0x0211D, -- ℝ bb R
668    [0x1D551] = 0x02124, -- ℤ bb Z
669}
670
671characters.blocks = blocks
672
673function characters.blockrange(name)
674    local b = blocks[name]
675    if b then
676        return b.first, b.last
677    else
678        return 0, 0
679    end
680end
681
682setmetatableindex(blocks, function(t,k) -- we could use an intermediate table if called often
683    return k and rawget(t,lower(gsub(k,"[^a-zA-Z]","")))
684end)
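-- The index metamethod above makes block lookups forgiving: case and any non-letter
-- characters in the name are ignored. A small illustration, kept commented like the
-- other examples in this file (the values follow from the table above):
--
-- print(characters.blockrange("Thai"))                                  -- 0x0E00  0x0E7F
-- print(characters.blocks["Unified Canadian Aboriginal Syllabics"].otf) -- cans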
685
686local otfscripts      = utilities.storage.allocate()
687characters.otfscripts = otfscripts
688
689setmetatableindex(otfscripts,function(t,unicode)
690    for k, v in next, blocks do
691        local first = v.first
692        local last  = v.last
693        if unicode >= first and unicode <= last then
694            local script = v.otf or "dflt"
695            for u=first,last do
696                t[u] = script
697            end
698            return script
699        end
700    end
701    -- pretty slow when we're here
702    t[unicode] = "dflt"
703    return "dflt"
704end)
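-- The reverse mapping is lazy: the first lookup of a code point scans the blocks once
-- and then caches the otf script tag for every slot in that range. For instance
-- (commented, based on the block definitions above):
--
-- print(characters.otfscripts[0x0E01])  -- thai
-- print(characters.otfscripts[0x10380]) -- ugar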
705
706local splitter1 = lpeg.splitat(S(":-"))
707local splitter2 = lpeg.splitat(S(" +-"),true)
708
709function characters.getrange(name,expression) -- used in font fallback definitions (name or range)
710    local range = rawget(blocks,lower(gsub(name,"[^a-zA-Z0-9]","")))
711    if range then
712        return range.first, range.last, range.description, range.gaps
713    end
714    name = gsub(name,'"',"0x") -- goodie: tex hex notation
715    local start, stop
716    if expression then
717        local n = tonumber(name)
718        if n then
719            return n, n, nil
720        else
721            local first, rest = lpegmatch(splitter2,name)
722            local range = rawget(blocks,lower(gsub(first,"[^a-zA-Z0-9]","")))
723            if range then
724                local s = loadstring("return 0 " .. rest)
725                if type(s) == "function" then
726                    local d = s()
727                    if type(d) == "number" then
728                        return range.first + d, range.last + d, nil
729                    end
730                end
731            end
732        end
733    end
734    local start, stop = lpegmatch(splitter1,name)
735    if start and stop then
736        start = tonumber(start,16) or tonumber(start)
737        stop  = tonumber(stop, 16) or tonumber(stop)
738        if start and stop then
739            return start, stop, nil
740        end
741    end
742    local slot = tonumber(name,16) or tonumber(name)
743    return slot, slot, nil
744end
745
746-- print(characters.getrange("lowercaseitalic + 123",true))
747-- print(characters.getrange("lowercaseitalic + 124",true))
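-- A few more illustrations of what getrange accepts (commented; the expected results
-- follow from the block table and the splitters above):
--
-- print(characters.getrange("thai"))        -- 0x0E00  0x0E7F  Thai
-- print(characters.getrange("0E00-0E7F"))   -- 0x0E00  0x0E7F
-- print(characters.getrange('"0E00-"0E7F')) -- 0x0E00  0x0E7F  (tex hex notation)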
748
749local categorytags = allocate {
750    lu = "Letter Uppercase",
751    ll = "Letter Lowercase",
752    lt = "Letter Titlecase",
753    lm = "Letter Modifier",
754    lo = "Letter Other",
755    mn = "Mark Nonspacing",
756    mc = "Mark Spacing Combining",
757    me = "Mark Enclosing",
758    nd = "Number Decimal Digit",
759    nl = "Number Letter",
760    no = "Number Other",
761    pc = "Punctuation Connector",
762    pd = "Punctuation Dash",
763    ps = "Punctuation Open",
764    pe = "Punctuation Close",
765    pi = "Punctuation Initial Quote",
766    pf = "Punctuation Final Quote",
767    po = "Punctuation Other",
768    sm = "Symbol Math",
769    sc = "Symbol Currency",
770    sk = "Symbol Modifier",
771    so = "Symbol Other",
772    zs = "Separator Space",
773    zl = "Separator Line",
774    zp = "Separator Paragraph",
775    cc = "Other Control",
776    cf = "Other Format",
777    cs = "Other Surrogate",
778    co = "Other Private Use",
779    cn = "Other Not Assigned",
780}
781
782local detailtags = allocate {
783    sl = "small letter",
784    bl = "big letter",
785    im = "iteration mark",
786    pm = "prolonged sound mark"
787}
788
789characters.categorytags = categorytags
790characters.detailtags   = detailtags
791
792-- sounds : voiced unvoiced semivoiced
793
794--~ special   : cf (softhyphen) zs (emspace)
795--~ characters: ll lm lo lt lu mn nl no pc pd pe pf pi po ps sc sk sm so
796
797local is_character = allocate ( tohash {
798    "lu","ll","lt","lm","lo",
799    "nd","nl","no",
800    "mn",
801    "nl","no",
802    "pc","pd","ps","pe","pi","pf","po",
803    "sm","sc","sk","so"
804} )
805
806local is_letter = allocate ( tohash {
807    "ll","lm","lo","lt","lu"
808} )
809
810local is_command = allocate ( tohash {
811    "cf","zs"
812} )
813
814local is_spacing = allocate ( tohash {
815    "zs", "zl","zp",
816} )
817
818local is_mark = allocate ( tohash {
819    "mn", "ms", -- "mn", "mc",
820} )
821
822local is_punctuation = allocate ( tohash {
823    "pc", "pd", "ps", "pe", "pi", "pf", "po",
824} )
825
826local is_hyphenator = allocate ( tohash {
827    "pd",
828} )
829
830local is_symbol = allocate ( tohash {
831    "sm", "sc", "sk", "so",
832} )
833
834local can_have_space = allocate ( tohash {
835    "lu", "ll", "lt", "lm", "lo", -- letters
836 -- "mn", "mc", "me",             -- marks
837    "nd", "nl", "no",             -- numbers
838    "ps", "pi",                   -- initial
839 -- "pe", "pf",                   -- final
840 -- "pc", "pd", "po",             -- punctuation
841    "sm", "sc", "sk", "so",       -- symbols
842 -- "zs", "zl", "zp",             -- separators
843 -- "cc", "cf", "cs", "co", "cn", -- others
844} )
845
846
847-- to be redone: store checked characters
848
849characters.is_character   = is_character
850characters.is_letter      = is_letter
851characters.is_command     = is_command
852characters.is_spacing     = is_spacing
853characters.is_mark        = is_mark
854characters.is_punctuation = is_punctuation
855characters.is_hyphenator  = is_hyphenator
856characters.is_symbol      = is_symbol
857characters.can_have_space = can_have_space
858
859local mti = function(t,k)
860    if type(k) == "number" then
861        local c = data[k].category
862        return c and rawget(t,c)
863    else
864        -- avoid auto conversion in data.characters lookups
865    end
866end
867
868setmetatableindex(characters.is_character,   mti)
869setmetatableindex(characters.is_letter,      mti)
870setmetatableindex(characters.is_command,     mti)
871setmetatableindex(characters.is_spacing,     mti)
872setmetatableindex(characters.is_punctuation, mti)
873setmetatableindex(characters.is_hyphenator,  mti)
874setmetatableindex(characters.is_symbol,      mti)
875setmetatableindex(characters.can_have_space, mti)
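-- Thanks to the shared metamethod these hashes accept character codes as well as
-- category tags, for instance (commented illustration):
--
-- print(characters.is_letter["lu"])   -- true
-- print(characters.is_letter[0x0041]) -- true  (U+0041 has category "lu")
-- print(characters.is_symbol[0x002B]) -- true  (PLUS SIGN has category "sm")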
876
877-- todo: also define callers for the above
878
879-- linebreak: todo: hash
880--
881-- normative   : BK CR LF CM SG GL CB SP ZW NL WJ JL JV JT H2 H3
882-- informative : XX OP CL CP QU NS EX SY IS PR PO NU AL ID IN HY BB BA SA AI B2 HL CJ RI
883--
884-- U+03400..U+04DBF ID
885-- U+04E00..U+09FFF ID
886-- U+0F900..U+0FAFF ID
887-- U+20000..U+2FFFD ID
888-- U+30000..U+3FFFD ID
889-- U+1F000..U+1FAFF ID
890-- U+1FC00..U+1FFFD ID
891-- U+020A0..U+020CF PR
892
893characters.linebreaks = allocate {
894
895    -- non-tailorable line breaking classes
896
897    ["bk"]  = "mandatory break",                             -- nl, ps : cause a line break (after)
898    ["cr"]  = "carriage return",                             -- cr : cause a line break (after), except between cr and lf
899    ["lf"]  = "line feed",                                   -- lf : cause a line break (after)
900    ["cm"]  = "combining mark",                              -- combining marks, control codes : prohibit a line break between the character and the preceding character
901    ["nl"]  = "next line",                                   -- nel : cause a line break (after)
902    ["sg"]  = "surrogate",                                   -- surrogates :do not occur in well-formed text
903    ["wj"]  = "word joiner",                                 -- wj : prohibit line breaks before and after
904    ["zw"]  = "zero width space",                            -- zwsp : provide a break opportunity
905    ["gl"]  = "non-breaking (glue)",                         -- cgj, nbsp, zwnbsp : prohibit line breaks before and after
906    ["sp"]  = "space",                                       -- space : enable indirect line breaks
907    ["zwj"] = "zero width joiner",                           -- prohibit line breaks within joiner sequences
908
909    -- break opportunities
910
911    ["b2"] = "break opportunity before and after",           -- em dash : provide a line break opportunity before and after the character
912    ["ba"] = "break after",                                  -- spaces, hyphens : generally provide a line break opportunity after the character
913    ["bb"] = "break before",                                 -- punctuation used in dictionaries : generally provide a line break opportunity before the character
914    ["hy"] = "hyphen",                                       -- hyphen-minus : provide a line break opportunity after the character, except in numeric context
915    ["cb"] = "contingent break opportunity",                 -- inline objects : provide a line break opportunity contingent on additional information
916
917    -- characters prohibiting certain breaks
918
919    ["cl"] = "close punctuation",                            -- “}”, “❳”, “⟫” etc. : prohibit line breaks before
920    ["cp"] = "close parenthesis",                            -- “)”, “]” : prohibit line breaks before
921    ["ex"] = "exclamation/interrogation",                    -- “!”, “?”, etc. : prohibit line breaks before
922    ["in"] = "inseparable",                                  -- leaders : allow only indirect line breaks between pairs
923    ["ns"] = "nonstarter",                                   -- “‼”, “‽”, “⁇”, “⁉”, etc. : allow only indirect line breaks before
924    ["op"] = "open punctuation",                             -- “(“, “[“, “{“, etc. : prohibit line breaks after
925    ["qu"] = "quotation",                                    -- quotation marks : act like they are both opening and closing
926
927    -- numeric context
928
929    ["is"] = "infix numeric separator",                      -- . , : prevent breaks after any and before numeric
930    ["nu"] = "numeric",                                      -- digits : form numeric expressions for line breaking purposes
931    ["po"] = "postfix numeric",                              -- %, ¢ : do not break following a numeric expression
932    ["pr"] = "prefix numeric",                               -- $, £, ¥, etc. : do not break in front of a numeric expression
933    ["sy"] = "symbols allowing break after",                 -- / : prevent a break before, and allow a break after
934
935    -- other characters
936
937    ["ai"] = "ambiguous (alphabetic or ideographic)",        -- characters with ambiguous east asian width : act like al when the resolved eaw is n; otherwise, act as id
938    ["ak"] = "aksara",                                       -- Consonants
939    ["al"] = "alphabetic",                                   -- alphabets and regular symbols : are alphabetic characters or symbols that are used with alphabetic characters
940    ["ap"] = "aksara pre-pase",                              -- pre-base repha
941    ["as"] = "ksara start",                                  -- independent vowels
942    ["cj"] = "conditional japanese starter",                 -- small kana : treat as ns or id for strict or normal breaking.
943    ["eb"] = "emoji base",                                   -- all emoji allowing modifiers, do not break from following emoji modifier
944    ["em"] = "emoji modifier",                               -- skin tone modifiers, do not break from preceding emoji base
945    ["h2"] = "hangul lv syllable",                           -- hangul : form korean syllable blocks
946    ["h3"] = "hangul lvt syllable",                          -- hangul : form korean syllable blocks
947    ["hl"] = "hebrew letter",                                -- hebrew : do not break around a following hyphen; otherwise act as alphabetic
948    ["id"] = "ideographic",                                  -- ideographs : break before or after, except in some numeric context
949    ["jl"] = "hangul l jamo",                                -- conjoining jamo : form korean syllable blocks
950    ["jt"] = "hangul t jamo",                                -- conjoining jamo : form korean syllable blocks
951    ["jv"] = "hangul v jamo",                                -- conjoining jamo : form korean syllable blocks
952    ["ri"] = "regional indicator",                           -- regional indicator symbol letter a .. z : keep together, break before and after from others
953    ["sa"] = "complex context dependent (south east asian)", -- south east asian: thai, lao, khmer : provide a line break opportunity contingent on additional, language-specific context analysis
954    ["vf"] = "virama final",                                 -- Viramas for final consonants
955    ["vi"] = "virama",                                       -- Conjoining viramas
956    ["xx"] = "unknown",                                      -- most unassigned, private-use : have as yet unknown line breaking behavior or unassigned code positions
957
958}
959
960-- east asian width:
961--
962-- N A H W F Na
963
964characters.bidi = allocate {
965    l   = "Left-to-Right",
966    lre = "Left-to-Right Embedding",
967    lro = "Left-to-Right Override",
968    r   = "Right-to-Left",
969    al  = "Right-to-Left Arabic",
970    rle = "Right-to-Left Embedding",
971    rlo = "Right-to-Left Override",
972    pdf = "Pop Directional Format",
973    en  = "European Number",
974    es  = "European Number Separator",
975    et  = "European Number Terminator",
976    an  = "Arabic Number",
977    cs  = "Common Number Separator",
978    nsm = "Non-Spacing Mark",
979    bn  = "Boundary Neutral",
980    b   = "Paragraph Separator",
981    s   = "Segment Separator",
982    ws  = "Whitespace",
983    on  = "Other Neutrals",
984}
985
986-- At this point we assume that the big data table is loaded. From this table we
987-- derive a few more.
988
989if not characters.fallbacks then
990
991    characters.fallbacks = allocate {
992        [0x0308] = 0x00A8, [0x00A8] = 0x0308, -- dieresiscmb      dieresis
993        [0x0304] = 0x00AF, [0x00AF] = 0x0304, -- macroncmb        macron
994        [0x0301] = 0x00B4, [0x00B4] = 0x0301, -- acutecomb        acute
995        [0x0327] = 0x00B8, [0x00B8] = 0x0327, -- cedillacmb       cedilla
996        [0x0302] = 0x02C6, [0x02C6] = 0x0302, -- circumflexcmb    circumflex
997        [0x030C] = 0x02C7, [0x02C7] = 0x030C, -- caroncmb         caron
998        [0x0306] = 0x02D8, [0x02D8] = 0x0306, -- brevecmb         breve
999        [0x0307] = 0x02D9, [0x02D9] = 0x0307, -- dotaccentcmb     dotaccent
1000        [0x030A] = 0x02DA, [0x02DA] = 0x030A, -- ringcmb          ring
1001        [0x0328] = 0x02DB, [0x02DB] = 0x0328, -- ogonekcmb        ogonek
1002        [0x0303] = 0x02DC, [0x02DC] = 0x0303, -- tildecomb        tilde
1003        [0x030B] = 0x02DD, [0x02DD] = 0x030B, -- hungarumlautcmb  hungarumlaut
1004        [0x0305] = 0x203E, [0x203E] = 0x0305, -- overlinecmb      overline
1005        [0x0300] = 0x0060, [0x0060] = 0x0300, -- gravecomb        grave
1006    }
1007
1008    -- not done (would mess up mapping):
1009    --
1010    -- 0X0301/0X0384 0X0314/0X1FFE 0X0313/0X1FBD 0X0313/0X1FBF 0X0342/0X1FC0
1011    -- 0X3099/0X309B 0X309A/0X309C 0X0333/0X2017 0X0345/0X037A
1012
1013end
1014
1015if storage then -- in case we extend
1016    storage.register("characters/fallbacks", characters.fallbacks, "characters.fallbacks") -- accents and such
1017end
1018
1019characters.directions  = { }
1020
1021setmetatableindex(characters.directions,function(t,k)
1022    local d = data[k]
1023    if d then
1024        local v = d.direction
1025        if v then
1026            t[k] = v
1027            return v
1028        end
1029    end
1030    t[k] = false -- maybe 'l'
1031    return false
1032end)
1033
1034characters.mirrors  = { }
1035
1036setmetatableindex(characters.mirrors,function(t,k)
1037    local d = data[k]
1038    if d then
1039        local v = d.mirror
1040        if v then
1041            t[k] = v
1042            return v
1043        end
1044    end
1045    t[k] = false
1046    return false
1047end)
1048
1049characters.textclasses  = { }
1050
1051setmetatableindex(characters.textclasses,function(t,k)
1052    local d = data[k]
1053    if d then
1054        local v = d.textclass
1055        if v then
1056            t[k] = v
1057            return v
1058        end
1059    end
1060    t[k] = false
1061    return false
1062end)
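-- These three are simple lazy caches over the main data table; a missing field
-- resolves to false. A commented illustration (assuming the usual char-def.lua
-- entries for these slots):
--
-- print(characters.directions[0x0041]) -- l
-- print(characters.mirrors[0x0028])    -- 0x0029 (the mirror of the left parenthesis)
-- print(characters.mirrors[0x0041])    -- false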
1063
1064-- Next comes a whole series of helper methods. These are (will be) part of the
1065-- official API.
1066
1067-- we could make them virtual: characters.contextnames[n]
1068
1069function characters.contextname(n) return data[n] and data[n].contextname or "" end
1070function characters.adobename  (n) return data[n] and data[n].adobename   or "" end
1071function characters.description(n) return data[n] and data[n].description or "" end
1072-------- characters.category   (n) return data[n] and data[n].category    or "" end
1073
1074function characters.category(n,verbose)
1075    local c = data[n].category
1076    if not c then
1077        return ""
1078    elseif verbose then
1079        return categorytags[c]
1080    else
1081        return c
1082    end
1083end
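-- For instance (commented):
--
-- print(characters.category(0x0041))      -- lu
-- print(characters.category(0x0041,true)) -- Letter Uppercase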
1084
1085-- -- some day we will make a table .. not that many calls to utfchar
1086--
1087-- local utfchar = utf.char
1088-- local utfbyte = utf.byte
1089-- local utfbytes = { }
1090-- local utfchars = { }
1091--
1092-- table.setmetatableindex(utfbytes,function(t,k) local v = utfchar(k) t[k] = v return v end)
1093-- table.setmetatableindex(utfchars,function(t,k) local v = utfbyte(k) t[k] = v return v end)
1094
1095local function toutfstring(s)
1096    if type(s) == "table" then
1097        return utfchar(unpack(s)) -- concat { utfchar( unpack(s) ) }
1098    else
1099        return utfchar(s)
1100    end
1101end
1102
1103utf.tostring = toutfstring
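-- The helper accepts a single slot or a table of slots, e.g. (commented):
--
-- print(utf.tostring(0x0041))            -- A
-- print(utf.tostring { 0x0048, 0x0069 }) -- Hi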
1104
1105local categories = allocate()  characters.categories = categories -- lazy table
1106
1107setmetatableindex(categories, function(t,u) if u then local c = data[u] c = c and c.category or u t[u] = c return c end end)
1108
1109-- todo: overloads (these register directly in the tables as number and string) e.g. for greek
1110-- todo: for string do a numeric lookup in the table itself
1111
1112local lccodes = allocate()  characters.lccodes = lccodes -- lazy table
1113local uccodes = allocate()  characters.uccodes = uccodes -- lazy table
1114local shcodes = allocate()  characters.shcodes = shcodes -- lazy table
1115local fscodes = allocate()  characters.fscodes = fscodes -- lazy table
1116
1117setmetatableindex(lccodes, function(t,u) if u then local c = data[u] c = c and c.lccode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end)
1118setmetatableindex(uccodes, function(t,u) if u then local c = data[u] c = c and c.uccode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end)
1119setmetatableindex(shcodes, function(t,u) if u then local c = data[u] c = c and c.shcode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end)
1120setmetatableindex(fscodes, function(t,u) if u then local c = data[u] c = c and c.fscode or (type(u) == "string" and utfbyte(u)) or u t[u] = c return c end end)
1121
1122local lcchars = allocate()  characters.lcchars = lcchars -- lazy table
1123local ucchars = allocate()  characters.ucchars = ucchars -- lazy table
1124local shchars = allocate()  characters.shchars = shchars -- lazy table
1125local fschars = allocate()  characters.fschars = fschars -- lazy table
1126
1127setmetatableindex(lcchars, function(t,u) if u then local c = data[u] c = c and c.lccode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end)
1128setmetatableindex(ucchars, function(t,u) if u then local c = data[u] c = c and c.uccode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end)
1129setmetatableindex(shchars, function(t,u) if u then local c = data[u] c = c and c.shcode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end)
1130setmetatableindex(fschars, function(t,u) if u then local c = data[u] c = c and c.fscode c = c and toutfstring(c) or (type(u) == "number" and utfchar(u)) or u t[u] = c return c end end)
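-- A few commented lookups (lazy, cached on first access; the results assume the
-- usual case mappings in char-def.lua):
--
-- print(characters.uccodes[0x0061]) -- 0x0041
-- print(characters.ucchars[0x0061]) -- A
-- print(characters.lcchars[0x0041]) -- a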
1131
1132local decomposed = allocate()  characters.decomposed = decomposed   -- lazy table
1133local specials   = allocate()  characters.specials   = specials     -- lazy table
1134
1135setmetatableindex(decomposed, function(t,u) -- either a table or false
1136    if u then
1137        local c = data[u]
1138        local s = c and c.decomposed or false -- could fall back to specials
1139        t[u] = s
1140        return s
1141    end
1142end)
1143
1144setmetatableindex(specials, function(t,u) -- either a table or false
1145    if u then
1146        local c = data[u]
1147        local s = c and c.specials or false
1148        t[u] = s
1149        return s
1150    end
1151end)
1152
1153local specialchars = allocate()  characters.specialchars = specialchars -- lazy table
1154local descriptions = allocate()  characters.descriptions = descriptions -- lazy table
1155local synonyms     = allocate()  characters.synonyms     = synonyms     -- lazy table
1156
1157setmetatableindex(specialchars, function(t,u)
1158    if u then
1159        local c = data[u]
1160        local s = c and c.specials
1161        if s then
1162            local tt  = { }
1163            local ttn = 0
1164            for i=2,#s do
1165                local si = s[i]
1166                local c = data[si]
1167                if is_letter[c.category] then
1168                    ttn = ttn + 1
1169                    tt[ttn] = utfchar(si)
1170                end
1171            end
1172            c = concat(tt)
1173            t[u] = c
1174            return c
1175        else
1176            if type(u) == "number" then
1177                u = utfchar(u)
1178            end
1179            t[u] = u
1180            return u
1181        end
1182    end
1183end)
1184
1185setmetatableindex(descriptions, function(t,k)
1186    -- 0.05 - 0.10 sec
1187    for u, c in next, data do
1188        local d = c.description
1189        if d then
1190            if find(d," ",1,true) then
1191                d = gsub(d," ","")
1192            end
1193            d = lower(d)
1194            t[d] = u
1195        end
1196    end
1197    local d = rawget(t,k)
1198    if not d then
1199        t[k] = k
1200    end
1201    return d
1202end)
1203
1204setmetatableindex(synonyms, function(t,k)
1205    for u, c in next, data do
1206        local s = c.synonyms
1207        if s then
1208            if find(s," ",1,true) then
1209                s = gsub(s," ","")
1210            end
1211         -- s = lower(s) -- is already lowercase
1212            t[s] = u
1213        end
1214    end
1215    local s = rawget(t,k)
1216    if not s then
1217        t[k] = k
1218    end
1219    return s
1220end)
1221
1222function characters.unicodechar(asked)
1223    local n = tonumber(asked)
1224    if n then
1225        return n
1226    elseif type(asked) == "string" then
1227        return descriptions[asked] or descriptions[gsub(asked," ","")]
1228    end
1229end
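-- Descriptions are matched in their lowercased, space-less form, so both of these
-- should resolve (commented illustration):
--
-- print(characters.unicodechar("latin small letter a")) -- 0x0061
-- print(characters.unicodechar("0x0061"))               -- 0x0061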
1230
1231-- function characters.lower(str)
1232--     local new, n = { }, 0
1233--     for u in utfvalues(str) do
1234--         n = n + 1
1235--         new[n] = lcchars[u]
1236--     end
1237--     return concat(new)
1238-- end
1239--
1240-- function characters.upper(str)
1241--     local new, n = { }, 0
1242--     for u in utfvalues(str) do
1243--         n = n + 1
1244--         new[n] = ucchars[u]
1245--     end
1246--     return concat(new)
1247-- end
1248--
1249-- function characters.shaped(str)
1250--     local new, n = { }, 0
1251--     for u in utfvalues(str) do
1252--         n = n + 1
1253--         new[n] = shchars[u]
1254--     end
1255--     return concat(new)
1256-- end
1257
1258----- tolower = Cs((utf8byte/lcchars)^0)
1259----- toupper = Cs((utf8byte/ucchars)^0)
1260----- toshape = Cs((utf8byte/shchars)^0)
1261
1262local tolower = Cs((utf8character/lcchars)^0) -- no need to check spacing
1263local toupper = Cs((utf8character/ucchars)^0) -- no need to check spacing
1264local toshape = Cs((utf8character/shchars)^0) -- no need to check spacing
1265
1266lpegpatterns.tolower = tolower -- old ones ... will be overloaded
1267lpegpatterns.toupper = toupper -- old ones ... will be overloaded
1268lpegpatterns.toshape = toshape -- old ones ... will be overloaded
1269
1270-- function characters.lower (str) return lpegmatch(tolower,str) end
1271-- function characters.upper (str) return lpegmatch(toupper,str) end
1272-- function characters.shaped(str) return lpegmatch(toshape,str) end
1273
1274--     local superscripts = allocate()   characters.superscripts = superscripts
1275--     local subscripts   = allocate()   characters.subscripts   = subscripts
1276
1277--     if storage then
1278--         storage.register("characters/superscripts", superscripts, "characters.superscripts")
1279--         storage.register("characters/subscripts",   subscripts,   "characters.subscripts")
1280--     end
1281
1282-- end
1283
1284if not characters.splits then
1285
1286    local char   = allocate()
1287    local compat = allocate()
1288
1289    local splits = {
1290        char   = char,
1291        compat = compat,
1292    }
1293
1294    characters.splits = splits
1295
1296    -- [0x013F] = { 0x004C, 0x00B7 }
1297    -- [0x0140] = { 0x006C, 0x00B7 }
1298
1299    for unicode, data in next, characters.data do
1300        local specials = data.specials
1301        if specials and #specials > 2 then
1302            local kind = specials[1]
1303            if kind == "compat" then
1304                compat[unicode] = { unpack(specials,2) }
1305            elseif kind == "char" or kind == "with" then -- width added
1306                char  [unicode] = { unpack(specials,2) }
1307            end
1308        end
1309    end
1310
1311    if storage then
1312        storage.register("characters/splits", splits, "characters.splits")
1313    end
1314
1315end
1316
1317if not characters.lhash then
1318
1319    local lhash = allocate()   characters.lhash = lhash -- nil if no conversion
1320    local uhash = allocate()   characters.uhash = uhash -- nil if no conversion
1321    local shash = allocate()   characters.shash = shash -- nil if no conversion
1322
1323    for k, v in next, characters.data do
1324     -- if k < 0x11000 then
1325            local l = v.lccode
1326            if l then
1327                -- we have an uppercase
1328                if type(l) == "number" then
1329                    lhash[utfchar(k)] = utfchar(l)
1330                elseif #l == 2 then
1331                    lhash[utfchar(k)] = utfchar(l[1]) .. utfchar(l[2])
1332             -- else
1333             --     inspect(v)
1334                end
1335            else
1336                local u = v.uccode
1337                if u then
1338                    -- we have an lowercase
1339                    if type(u) == "number" then
1340                        uhash[utfchar(k)] = utfchar(u)
1341                    elseif #u == 2 then
1342                        uhash[utfchar(k)] = utfchar(u[1]) .. utfchar(u[2])
1343                 -- else
1344                 --     inspect(v)
1345                    end
1346                end
1347            end
1348            local s = v.shcode
1349            if s then
1350                if type(s) == "number" then
1351                    shash[utfchar(k)] = utfchar(s)
1352                elseif #s == 2 then
1353                    shash[utfchar(k)] = utfchar(s[1]) .. utfchar(s[2])
1354             -- else
1355             --     inspect(v)
1356                end
1357            end
1358     -- end
1359    end
1360
1361    if storage then
1362        storage.register("characters/lhash", lhash, "characters.lhash")
1363        storage.register("characters/uhash", uhash, "characters.uhash")
1364        storage.register("characters/shash", shash, "characters.shash")
1365    end
1366
1367end
1368
1369local lhash = characters.lhash mark(lhash)
1370local uhash = characters.uhash mark(uhash)
1371local shash = characters.shash mark(shash)
1372
1373local utf8lowercharacter = utfchartabletopattern(lhash) / lhash
1374local utf8uppercharacter = utfchartabletopattern(uhash) / uhash
1375local utf8shapecharacter = utfchartabletopattern(shash) / shash
1376
1377local utf8lower = Cs((utf8lowercharacter + utf8character)^0)
1378local utf8upper = Cs((utf8uppercharacter + utf8character)^0)
1379local utf8shape = Cs((utf8shapecharacter + utf8character)^0)
1380
1381lpegpatterns.utf8lowercharacter = utf8lowercharacter -- one character
1382lpegpatterns.utf8uppercharacter = utf8uppercharacter -- one character
1383lpegpatterns.utf8shapecharacter = utf8shapecharacter -- one character
1384
1385lpegpatterns.utf8lower = utf8lower -- string
1386lpegpatterns.utf8upper = utf8upper -- string
1387lpegpatterns.utf8shape = utf8shape -- string
1388
1389function characters.lower (str) return str and lpegmatch(utf8lower,str) or "" end
1390function characters.upper (str) return str and lpegmatch(utf8upper,str) or "" end
1391function characters.shaped(str) return str and lpegmatch(utf8shape,str) or "" end
1392
1393lpeg.setutfcasers(characters.lower,characters.upper)
1394
1395-- local str = [[
1396--     ÀÁÂÃÄÅàáâãäå àáâãäåàáâãäå ÀÁÂÃÄÅÀÁÂÃÄÅ AAAAAAaaaaaa
1397--     ÆÇæç         æçæç         ÆÇÆÇ         AECaec
1398--     ÈÉÊËèéêë     èéêëèéêë     ÈÉÊËÈÉÊË     EEEEeeee
1399--     ÌÍÎÏÞìíîïþ   ìíîïþìíîïþ   ÌÍÎÏÞÌÍÎÏÞ   IIIIÞiiiiþ
1400--     Ðð           ðð           ÐÐ           Ðð
1401--     Ññ           ññ           ÑÑ           Nn
1402--     ÒÓÔÕÖòóôõö   òóôõöòóôõö   ÒÓÔÕÖÒÓÔÕÖ   OOOOOooooo
1403--     Øø           øø           ØØ           Oo
1404--     ÙÚÛÜùúûü     ùúûüùúûü     ÙÚÛÜÙÚÛÜ     UUUUuuuu
1405--     Ýýÿ          ýýÿ          ÝÝŸ          Yyy
1406--     ß            ß            SS           ss
1407--     Ţţ           ţţ           ŢŢ           Tt
1408-- ]]
1409--
1410-- local lower  = characters.lower   print(lower(str))
1411-- local upper  = characters.upper   print(upper(str))
1412-- local shaped = characters.shaped  print(shaped(str))
1413--
1414-- local c, n = os.clock(), 10000
1415-- for i=1,n do lower(str) upper(str) shaped(str) end -- 2.08 => 0.77
1416-- print(os.clock()-c,n*#str*3)
1417
1418-- maybe: (twice as fast when much ascii)
1419--
1420-- local tolower  = lpeg.patterns.tolower
1421-- local lower    = string.lower
1422--
1423-- local allascii = R("\000\127")^1 * P(-1)
1424--
1425-- function characters.checkedlower(str)
1426--     return lpegmatch(allascii,str) and lower(str) or lpegmatch(tolower,str) or str
1427-- end
1428
1429function characters.lettered(str,spacing)
1430    local new, n = { }, 0
1431    if spacing then
1432        local done = false
1433        for u in utfvalues(str) do
1434            local c = data[u].category
1435            if is_letter[c] then
1436                if done and n > 1 then
1437                    n = n + 1
1438                    new[n] = " "
1439                    done = false
1440                end
1441                n = n + 1
1442                new[n] = utfchar(u)
1443            elseif spacing and is_spacing[c] then
1444                done = true
1445            end
1446        end
1447    else
1448        for u in utfvalues(str) do
1449            if is_letter[data[u].category] then
1450                n = n + 1
1451                new[n] = utfchar(u)
1452            end
1453        end
1454    end
1455    return concat(new)
1456end
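-- For instance (commented):
--
-- print(characters.lettered("Hello, World!"))      -- HelloWorld
-- print(characters.lettered("Hello, World!",true)) -- Hello World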
1457
1458-- Requesting lower and uppercase codes:
1459
1460function characters.uccode(n) return uccodes[n] end -- obsolete
1461function characters.lccode(n) return lccodes[n] end -- obsolete
1462
1463function characters.shape(n)
1464    local shcode = shcodes[n]
1465    if not shcode then
1466        return n, nil
1467    elseif type(shcode) == "table" then
1468        return shcode[1], shcode[#shcode]
1469    else
1470        return shcode, nil
1471    end
1472end
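-- A commented example (assuming the usual shcode for this slot in char-def.lua):
--
-- print(characters.shape(0x00C0)) -- 0x0041  nil  (À shapes to A)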
1473
1474-- -- some day we might go this route, but it does not really save that much
1475-- -- so not now (we can generate a lot using mtx-unicode that operates on the
1476-- -- database)
1477--
1478-- -- category cjkwd direction linebreak
1479--
1480-- -- adobename comment contextcommand contextname description fallback lccode
1481-- -- mathclass mathfiller mathname mathspec mathstretch mathsymbol mirror
1482-- -- range shcode specials uccode uccodes unicodeslot
1483--
1484-- local data = {
1485--     ['one']={
1486--         common = {
1487--             category="cc",
1488--             direction="bn",
1489--             linebreak="cm",
1490--         },
1491--         vector = {
1492--             [0x0000] = {
1493--                 description="NULL",
1494--                 group='one',
1495--                 unicodeslot=0x0000,
1496--             },
1497--             {
1498--                 description="START OF HEADING",
1499--                 group='one',
1500--                 unicodeslot=0x0001,
1501--             },
1502--         }
1503--     }
1504-- }
1505--
1506-- local chardata, groupdata = { }, { }
1507--
1508-- for group, gdata in next, data do
1509--     local common, vector = { __index = gdata.common }, gdata.vector
1510--     for character, cdata in next, vector do
1511--         chardata[character] = cdata
1512--         setmetatable(cdata,common)
1513--     end
1514--     groupdata[group] = gdata
1515-- end
1516
1517-- characters.data, characters.groups = chardata, groupdata
1518
1519--  [0xF0000]={
1520--   category="co",
1521--   cjkwd="a",
1522--   description="<Plane 0x000F Private Use, First>",
1523--   direction="l",
1524--   unicodeslot=0xF0000,
1525--  },
1526--  [0xFFFFD]={
1527--   category="co",
1528--   cjkwd="a",
1529--   description="<Plane 0x000F Private Use, Last>",
1530--   direction="l",
1531--   unicodeslot=0xFFFFD,
1532--  },
1533--  [0x100000]={
1534--   category="co",
1535--   cjkwd="a",
1536--   description="<Plane 0x0010 Private Use, First>",
1537--   direction="l",
1538--   unicodeslot=0x100000,
1539--  },
1540--  [0x10FFFD]={
1541--   category="co",
1542--   cjkwd="a",
1543--   description="<Plane 0x0010 Private Use, Last>",
1544--   direction="l",
1545--   unicodeslot=0x10FFFD,
1546--  },
1547
1548if not characters.superscripts then
1549
1550    local superscripts = allocate()   characters.superscripts = superscripts
1551    local subscripts   = allocate()   characters.subscripts   = subscripts
1552    local fractions    = allocate()   characters.fractions    = fractions
1553
1554    -- skipping U+02120 (service mark) U+02122 (trademark)
1555
1556    for k, v in next, data do
1557        local specials = v.specials
1558        if specials then
1559            local what = specials[1]
1560            if what == "super" then
1561                if #specials == 2 then
1562                    superscripts[k] = specials[2]
1563                elseif trace_defining then
1564                    report_defining("ignoring %s %a, char %c, description %a","superscript",ustring(k),k,v.description)
1565                end
1566            elseif what == "sub" then
1567                if #specials == 2 then
1568                    subscripts[k] = specials[2]
1569                elseif trace_defining then
1570                    report_defining("ignoring %s %a, char %c, description %a","subscript",ustring(k),k,v.description)
1571                end
1572            elseif what == "fraction" then
1573                if #specials > 1 then
1574                    fractions[k] = { unpack(specials,2) }
1575                elseif trace_defining then
1576                    report_defining("ignoring %s %a, char %c, description %a","fraction",ustring(k),k,v.description)
1577                end
1578            end
1579        end
1580    end
1581
1582 -- print(table.serialize(superscripts, "superscripts", { hexify = true }))
1583 -- print(table.serialize(subscripts,   "subscripts",   { hexify = true }))
1584 -- print(table.serialize(fractions,    "fractions",    { hexify = true }))
1585
1586    if storage then
1587        storage.register("characters/superscripts", superscripts, "characters.superscripts")
1588        storage.register("characters/subscripts",   subscripts,   "characters.subscripts")
1589        storage.register("characters/fractions",    fractions,    "characters.fractions")
1590    end
1591
1592end
1593
1594function characters.showstring(str)
1595    local list = utotable(str)
1596    for i=1,#list do
1597        report_defining("split % 3i : %C",i,list[i])
1598    end
1599end
1600
1601do
1602
1603    -- There is no need to preload this table.
1604
1605    local any       = P(1)
1606    local special   = S([['".,:;-+()]])
1607                    + P('“') + P('”')
1608    local apostrofe = P("’") + P("'")
1609
1610    local pattern = Cs ( (
1611        (P("medium light") / "medium-light" + P("medium dark")  / "medium-dark") * P(" skin tone")
1612        + (apostrofe * P("s"))/""
1613        + special/""
1614        + any
1615    )^1)
1616
1617    local function load()
1618        local name = resolvers.findfile("char-emj.lua")
1619        local data = name and name ~= "" and dofile(name) or { }
1620        local hash = { }
1621        for d, c in next, data do
1622            local k = lpegmatch(pattern,d) or d
1623            local u = { }
1624            for i=1,#c do
1625                u[i] = utfchar(c[i])
1626            end
1627            u = concat(u)
1628            hash[k] = u
1629        end
1630        return data, hash
1631    end
1632
1633    local data, hash = nil, nil
1634
1635    function characters.emojized(name)
1636        local t = lpegmatch(pattern,name)
1637        if t then
1638            return t
1639        else
1640            return { name }
1641        end
1642    end
1643
1644    local start     = P(" ")
1645    local finish    = P(-1) + P(" ")
1646    local skintone  = P("medium ")^0 * (P("light ") + P("dark "))^0 * P("skin tone")
1647    local gender    = P("woman") + P("man")
1648    local expanded  = (
1649                            P("m-l-")/"medium-light"
1650                          + P("m-d-")/"medium-dark"
1651                          + P("l-")  /"light"
1652                          + P("m-")  /"medium"
1653                          + P("d-")  /"dark"
1654                      )
1655                    * (P("s-t")/" skin tone")
1656    local compacted = (
1657                        (P("medium-")/"m-" * (P("light")/"l" + P("dark")/"d"))
1658                      + (P("medium")/"m"   +  P("light")/"l" + P("dark")/"d")
1659                      )
1660                    * (P(" skin tone")/"-s-t")
1661
1662    local pattern_0 = Cs((expanded + any)^1)
1663    local pattern_1 = Cs(((start * skintone + skintone * finish)/"" + any)^1)
1664    local pattern_2 = Cs(((start * gender   + gender   * finish)/"" + any)^1)
1665    local pattern_4 = Cs((compacted + any)^1)
1666
1667 -- print(lpegmatch(pattern_0,"kiss woman l-s-t man d-s-t"))
1668 -- print(lpegmatch(pattern_0,"something m-l-s-t"))
1669 -- print(lpegmatch(pattern_0,"something m-s-t"))
1670 -- print(lpegmatch(pattern_4,"something medium-light skin tone"))
1671 -- print(lpegmatch(pattern_4,"something medium skin tone"))
1672
1673    local skin =
1674        P("light skin tone")        / utfchar(0x1F3FB)
1675      + P("medium-light skin tone") / utfchar(0x1F3FC)
1676      + P("medium skin tone")       / utfchar(0x1F3FD)
1677      + P("medium-dark skin tone")  / utfchar(0x1F3FE)
1678      + P("dark skin tone")         / utfchar(0x1F3FF)
1679
1680    local parent =
1681        P("man")   / utfchar(0x1F468)
1682      + P("woman") / utfchar(0x1F469)
1683
1684    local child =
1685        P("baby")  / utfchar(0x1F476)
1686      + P("boy")   / utfchar(0x1F466)
1687      + P("girl")  / utfchar(0x1F467)
1688
1689    local zwj   = utfchar(0x200D)
1690    local heart = utfchar(0x2764) .. utfchar(0xFE0F) .. zwj
1691    local kiss  = utfchar(0x2764) .. utfchar(0xFE0F) .. utfchar(0x200D) .. utfchar(0x1F48B) .. zwj
1692
1693    ----- member = parent + child
1694
1695    local space = P(" ")
1696    local final = P(-1)
1697
1698    local p_done   = (space^1/zwj) + P(-1)
1699    local p_rest   = space/"" * (skin * p_done) + p_done
1700    local p_parent = parent * p_rest
1701    local p_child  = child  * p_rest
1702
1703    local p_family = Cs ( (P("family")            * space^1)/"" * p_parent^-2 * p_child^-2 )
1704    local p_couple = Cs ( (P("couple with heart") * space^1)/"" * p_parent * Cc(heart) * p_parent )
1705    local p_kiss   = Cs ( (P("kiss")              * space^1)/"" * p_parent * Cc(kiss)  * p_parent )
1706
1707    local p_special = p_family + p_couple + p_kiss
1708
1709 -- print(lpeg.match(p_special,"family man woman girl"))
1710 -- print(lpeg.match(p_special,"family man dark skin tone woman girl girl"))
1711
1712 -- local p_special = P { "all",
1713 --     all    = Cs (V("family") + V("couple") + V("kiss")),
1714 --     family = C("family")            * space^1 * V("parent")^-2 * V("child")^-2,
1715 --     couple = P("couple with heart") * space^1 * V("parent") * Cc(heart) * V("parent"),
1716 --     kiss   = P("kiss")              * space^1 * V("parent") * Cc(kiss) * V("parent"),
1717 --     parent = parent * V("rest"),
1718 --     child  = child  * V("rest"),
1719 --     rest   = (space * skin)^0/"" * ((space^1/zwj) + P(-1)),
1720 -- }
1721
1722    local emoji      = { }
1723    characters.emoji = emoji
1724
1725    local cache = setmetatable({ }, { __mode = "k" } )
1726
1727    function emoji.resolve(name)
1728        if not hash then
1729            data, hash = load()
1730        end
1731        local h = hash[name]
1732        if h then
1733            return h
1734        end
1735        local h = cache[name]
1736        if h then
1737            return h
1738        elseif h == false then
1739            return
1740        end
1741        -- expand shortcuts
1742        local name = lpegmatch(pattern_0,name) or name
1743        -- expand some 25K variants
1744        local h = lpegmatch(p_special,name)
1745        if h then
1746            cache[name] = h
1747            return h
1748        end
1749        -- simplify
1750        local s = lpegmatch(pattern_1,name)
1751        local h = hash[s]
1752        if h then
1753            cache[name] = h
1754            return h
1755        end
1756        -- simplify
1757        local s = lpegmatch(pattern_2,name)
1758        local h = hash[s]
1759        if h then
1760            cache[name] = h
1761            return h
1762        end
1763        cache[name] = false
1764    end
1765
1766    function emoji.known()
1767        if not hash then
1768            data, hash = load()
1769        end
1770        return hash, data
1771    end
1772
1773    function emoji.compact(name)
1774        return lpegmatch(pattern_4,name) or name
1775    end
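    -- Two more commented examples in the spirit of the tests above; what resolve
    -- returns ultimately depends on char-emj.lua and on the special patterns:
    --
 -- print(characters.emoji.compact("woman light skin tone")) -- woman l-s-t
 -- print(characters.emoji.resolve("kiss woman man"))        -- woman zwj heart fe0f zwj kiss zwj man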
1776
1777end
1778
1779-- code moved to char-tex.lua
1780
1781return characters
1782