SourceBrowser

char-ini.lua /size: 99 Kb last modification: 2025-02-21 11:03
1if not modules then modules = { } end modules ['char-ini'] = {
2    version   = 1.001,
3    comment   = "companion to char-ini.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9-- todo: make two files, one for format generation, one for format use
10-- todo: move some to char-utf
11
12-- we can remove the tag range starting at 0xE0000 (special applications)
13
14local utfchar, utfbyte, utfvalues, ustring, utotable = utf.char, utf.byte, utf.values, utf.ustring, utf.totable
15local concat, unpack, tohash, insert = table.concat, table.unpack, table.tohash, table.insert
16local next, tonumber, type, rawget, rawset = next, tonumber, type, rawget, rawset
17local format, lower, gsub, find = string.format, string.lower, string.gsub, string.find
18local P, R, S, C, Cs, Ct, Cc, V = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cs, lpeg.Ct, lpeg.Cc, lpeg.V
19local formatters = string.formatters
20
21if not characters then require("char-def") end
22
23local lpegpatterns          = lpeg.patterns
24local lpegmatch             = lpeg.match
25local utf8byte              = lpegpatterns.utf8byte
26local utf8character         = lpegpatterns.utf8character
27
28local utfchartabletopattern = lpeg.utfchartabletopattern
29
30local allocate              = utilities.storage.allocate
31local mark                  = utilities.storage.mark
32
33local setmetatableindex     = table.setmetatableindex
34
-- Tracing flag for character definitions. The tracker callback has to assign to
-- this local; assigning to any other name would create a stray global and leave
-- the flag permanently false.
local trace_defining        = false  trackers.register("characters.defining", function(v) trace_defining = v end)
36
37local report_defining       = logs.reporter("characters")
38
-- This module implements some methods and creates additional data structures from
-- the big character table that we use for all kinds of purposes: 'char-def.lua'.
41--
42-- We assume that at this point 'characters.data' is already populated!
43--
44-- todo: in 'char-def.lua' assume defaults:
45--
46--   directions = l
47--   cjkwd      = a
48--   linebreak  = al
49
-- Make sure the global namespace exists and keep local shortcuts to it and to
-- the character table (expected to be provided by 'char-def.lua').
characters       = characters or { }
local characters = characters
local data       = characters.data

-- Nothing below can work without the character data, so a missing table is
-- reported and the run is ended; otherwise the table is handed to the storage
-- marker.
if not data then
    report_defining("fatal error: 'char-def.lua' is not loaded")
    os.exit()
else
    mark(data) -- original note: why does this fail
end
60
61-- Extending the table.
62
-- In MkIV mode (CONTEXTLMTXMODE equals 0) the private slots are merged into the
-- main character table. The private table is loaded on demand from 'char-prv'
-- and, when a storage mechanism is present, registered with it.
if context and CONTEXTLMTXMODE == 0 then

    local privatedata = characters.private

    if not privatedata then

        require("char-prv")

        privatedata = characters.private

        if storage then
            storage.register("characters/private", privatedata, "characters.private")
        end

    end

    -- copy every private entry into the main table
    for unicode, d in next, privatedata do
        data[unicode] = d
    end

end
80
81-- This converts a string (if given) into a number.
82
-- Matches a complete string that starts with "0x" or "U+" and is followed by
-- one or more uppercase hexadecimal digits up to the end of the string; the
-- digits are converted into a number. The helpers live in a do-block so that no
-- extra file-level locals are introduced.
local pattern do

    local prefix    = P("0x") + P("U+")
    local hexdigits = R("09","AF")^1 * P(-1)

    local function hextonumber(s)
        return tonumber(s,16)
    end

    pattern = prefix * (hexdigits / hextonumber)

end

lpegpatterns.chartonumber = pattern
86
-- Converts a character specification into a number.
--
--   k : a string of the form "0x61" or "U+61" (converted by 'pattern'), any
--       other string (converted by utfbyte), or a non-string value
--
-- Returns the number for a string (0 when utfbyte yields nothing); a non-string
-- value is returned unchanged, with nil and false mapped onto 0.
local function chartonumber(k)
    if type(k) ~= "string" then
        return k or 0
    end
    local u = lpegmatch(pattern,k)
    if u then
        -- the pattern already delivers a number; feeding it to utfbyte would
        -- turn it back into a string and return the code of its first digit
        return u
    end
    return utfbyte(k) or 0
end
99
-- Converts a number, or a "0x.." / "U+.." string accepted by 'pattern', into
-- the string produced by utfchar. A string that the pattern does not match is
-- returned unchanged; a number for which utfchar yields nothing gives "".
local function charfromnumber(k)
    if type(k) ~= "number" then
        local u = lpegmatch(pattern,k)
        if not u then
            return k
        end
        return utfchar(u)
    end
    return utfchar(k) or ""
end
112
113--~ print(chartonumber(97), chartonumber("a"), chartonumber("0x61"), chartonumber("U+61"))
114
-- Make both converters available in the characters namespace.
characters.fromnumber = charfromnumber
characters.tonumber   = chartonumber
117
-- Shared fallback entry, returned whenever a lookup cannot be resolved.
local private = {
    description = "PRIVATE SLOT",
}

-- List of range specifications (first, last, optional extender) that is
-- consulted by the index function below.
local ranges      = allocate()
characters.ranges = ranges

-- Index function for the character table. A string key is first converted into
-- a number (via 'pattern' or else utfbyte); when that number has a real entry,
-- the entry is returned. A number below 0xF0000 without an entry is looked up
-- in the ranges: the first matching range that has an extender creates the
-- entry, which is then stored in the table. Everything else gets 'private',
-- which is handy when we loop over characters in fonts and check for a
-- property.
setmetatableindex(data, function(t,k)
    local kind = type(k)
    if kind == "string" then
        local slot = lpegmatch(pattern,k) or utfbyte(k)
        if not slot then
            return private
        end
        local known = rawget(t,slot)
        if known then
            return known
        end
        -- no direct entry: continue with the numeric slot
        k, kind = slot, "number"
    end
    if kind ~= "number" or not (k < 0xF0000) then
        return private
    end
    for r=1,#ranges do
        local range = ranges[r]
        if k >= range.first and k <= range.last then
            local extender = range.extender
            if extender then
                local entry = extender(k)
                t[k] = entry -- stored, so the next lookup is a direct hit
                return entry
            end
        end
    end
    return private
end)
155
-- Properties shared by all variant selector entries. The table is installed
-- with setmetatable on each generated entry, so it needs an __index field:
-- without one, none of the fields below can be reached through an entry.
-- Pointing __index at the table itself keeps the fields directly accessible on
-- the table as before.
local variant_selector_metatable = {
    category  = "mn",
    cjkwd     = "a",
    direction = "nsm",
    linebreak = "cm",
}

variant_selector_metatable.__index = variant_selector_metatable
162
163-- This saves a bit of memory and also serves as example.
164
-- Formatter for the descriptions of generated variant selector entries.
local f_variant = string.formatters["VARIATION SELECTOR-0x%04X"]

-- Range 0xFE00 .. 0xFE0F: entries are created by the extender; the number in
-- the description starts at 1 for the first slot. Each entry gets
-- variant_selector_metatable as its metatable.
insert(characters.ranges,{
    name     = "variant selector",
    first    = 0xFE00,
    last     = 0xFE0F,
    extender = function(k)
        return setmetatable({
            unicodeslot = k,
            description = f_variant(k - 0xFE00 + 0x0001),
        },variant_selector_metatable)
    end,
})
180
-- Range 0xE0100 .. 0xE01EF: entries are created by the extender; the number in
-- the description starts at 0x11 for the first slot. Each entry gets
-- variant_selector_metatable as its metatable.
insert(characters.ranges,{
    name     = "variant selector extension",
    first    = 0xE0100,
    last     = 0xE01EF,
    extender = function(k)
        return setmetatable({
            unicodeslot = k,
            description = f_variant(k - 0xE0100 + 0x0011),
        },variant_selector_metatable)
    end,
})
194
195local blocks = allocate {
196    ["adlam"]                                       = { first = 0x1E900, last = 0x1E95F,              description = "Adlam" },
197    ["aegeannumbers"]                               = { first = 0x10100, last = 0x1013F,              description = "Aegean Numbers" },
198    ["ahom"]                                        = { first = 0x11700, last = 0x1174F,              description = "Ahom" },
199    ["alchemicalsymbols"]                           = { first = 0x1F700, last = 0x1F77F,              description = "Alchemical Symbols" },
200    ["alphabeticpresentationforms"]                 = { first = 0x0FB00, last = 0x0FB4F, otf="latn",  description = "Alphabetic Presentation Forms" },
201    ["anatolianhieroglyphs"]                        = { first = 0x14400, last = 0x1467F,              description = "Anatolian Hieroglyphs" },
202    ["ancientgreekmusicalnotation"]                 = { first = 0x1D200, last = 0x1D24F, otf="grek",  description = "Ancient Greek Musical Notation" },
203    ["ancientgreeknumbers"]                         = { first = 0x10140, last = 0x1018F, otf="grek",  description = "Ancient Greek Numbers" },
204    ["ancientsymbols"]                              = { first = 0x10190, last = 0x101CF, otf="grek",  description = "Ancient Symbols" },
205    ["arabic"]                                      = { first = 0x00600, last = 0x006FF, otf="arab",  description = "Arabic" },
206    ["arabicextendeda"]                             = { first = 0x008A0, last = 0x008FF,              description = "Arabic Extended-A" },
207    ["arabicextendedb"]                             = { first = 0x00870, last = 0x0089F,              description = "Arabic Extended-B" },
208    ["arabicextendedc"]                             = { first = 0x10EC0, last = 0x10EFF,              description = "Arabic Extended-C" },
209    ["arabicmathematicalalphabeticsymbols"]         = { first = 0x1EE00, last = 0x1EEFF,              description = "Arabic Mathematical Alphabetic Symbols" },
210    ["arabicpresentationformsa"]                    = { first = 0x0FB50, last = 0x0FDFF, otf="arab",  description = "Arabic Presentation Forms-A" },
211    ["arabicpresentationformsb"]                    = { first = 0x0FE70, last = 0x0FEFF, otf="arab",  description = "Arabic Presentation Forms-B" },
212    ["arabicsupplement"]                            = { first = 0x00750, last = 0x0077F, otf="arab",  description = "Arabic Supplement" },
213    ["armenian"]                                    = { first = 0x00530, last = 0x0058F, otf="armn",  description = "Armenian" },
214    ["arrows"]                                      = { first = 0x02190, last = 0x021FF,              description = "Arrows" },
215    ["avestan"]                                     = { first = 0x10B00, last = 0x10B3F,              description = "Avestan" },
216    ["balinese"]                                    = { first = 0x01B00, last = 0x01B7F, otf="bali",  description = "Balinese" },
217    ["bamum"]                                       = { first = 0x0A6A0, last = 0x0A6FF,              description = "Bamum" },
218    ["bamumsupplement"]                             = { first = 0x16800, last = 0x16A3F,              description = "Bamum Supplement" },
219    ["basiclatin"]                                  = { first = 0x00000, last = 0x0007F, otf="latn",  description = "Basic Latin" },
220    ["bassavah"]                                    = { first = 0x16AD0, last = 0x16AFF,              description = "Bassa Vah" },
221    ["batak"]                                       = { first = 0x01BC0, last = 0x01BFF,              description = "Batak" },
222    ["bengali"]                                     = { first = 0x00980, last = 0x009FF, otf="beng",  description = "Bengali" },
223    ["bhaiksuki"]                                   = { first = 0x11C00, last = 0x11C6F,              description = "Bhaiksuki" },
224    ["blockelements"]                               = { first = 0x02580, last = 0x0259F, otf="bopo",  description = "Block Elements" },
225    ["bopomofo"]                                    = { first = 0x03100, last = 0x0312F, otf="bopo",  description = "Bopomofo" },
226    ["bopomofoextended"]                            = { first = 0x031A0, last = 0x031BF, otf="bopo",  description = "Bopomofo Extended" },
227    ["boxdrawing"]                                  = { first = 0x02500, last = 0x0257F,              description = "Box Drawing" },
228    ["brahmi"]                                      = { first = 0x11000, last = 0x1107F,              description = "Brahmi" },
229    ["braillepatterns"]                             = { first = 0x02800, last = 0x028FF, otf="brai",  description = "Braille Patterns" },
230    ["buginese"]                                    = { first = 0x01A00, last = 0x01A1F, otf="bugi",  description = "Buginese" },
231    ["buhid"]                                       = { first = 0x01740, last = 0x0175F, otf="buhd",  description = "Buhid" },
232    ["byzantinemusicalsymbols"]                     = { first = 0x1D000, last = 0x1D0FF, otf="byzm",  description = "Byzantine Musical Symbols" },
233    ["carian"]                                      = { first = 0x102A0, last = 0x102DF,              description = "Carian" },
234    ["caucasianalbanian"]                           = { first = 0x10530, last = 0x1056F,              description = "Caucasian Albanian" },
235    ["chakma"]                                      = { first = 0x11100, last = 0x1114F,              description = "Chakma" },
236    ["cham"]                                        = { first = 0x0AA00, last = 0x0AA5F,              description = "Cham" },
237    ["cherokee"]                                    = { first = 0x013A0, last = 0x013FF, otf="cher",  description = "Cherokee" },
238    ["cherokeesupplement"]                          = { first = 0x0AB70, last = 0x0ABBF,              description = "Cherokee Supplement" },
239    ["chesssymbols"]                                = { first = 0x1FA00, last = 0x1FA6F,              description = "Chess Symbols" },
240    ["chorasmian"]                                  = { first = 0x10FB0, last = 0x10FDF,              description = "Chorasmian" },
241    ["cjkcompatibility"]                            = { first = 0x03300, last = 0x033FF, otf="hang",  description = "CJK Compatibility" },
242    ["cjkcompatibilityforms"]                       = { first = 0x0FE30, last = 0x0FE4F, otf="hang",  description = "CJK Compatibility Forms" },
243    ["cjkcompatibilityideographs"]                  = { first = 0x0F900, last = 0x0FAFF, otf="hang",  description = "CJK Compatibility Ideographs" },
244    ["cjkcompatibilityideographssupplement"]        = { first = 0x2F800, last = 0x2FA1F, otf="hang",  description = "CJK Compatibility Ideographs Supplement" },
245    ["cjkradicalssupplement"]                       = { first = 0x02E80, last = 0x02EFF, otf="hang",  description = "CJK Radicals Supplement" },
246    ["cjkstrokes"]                                  = { first = 0x031C0, last = 0x031EF, otf="hang",  description = "CJK Strokes" },
247    ["cjksymbolsandpunctuation"]                    = { first = 0x03000, last = 0x0303F, otf="hang",  description = "CJK Symbols and Punctuation" },
248    ["cjkunifiedideographs"]                        = { first = 0x04E00, last = 0x09FFF, otf="hang",  description = "CJK Unified Ideographs", catcode = "letter" },
249    ["cjkunifiedideographsextensiona"]              = { first = 0x03400, last = 0x04DBF, otf="hang",  description = "CJK Unified Ideographs Extension A" },
250    ["cjkunifiedideographsextensionb"]              = { first = 0x20000, last = 0x2A6DF, otf="hang",  description = "CJK Unified Ideographs Extension B" },
251    ["cjkunifiedideographsextensionc"]              = { first = 0x2A700, last = 0x2B73F,              description = "CJK Unified Ideographs Extension C" },
252    ["cjkunifiedideographsextensiond"]              = { first = 0x2B740, last = 0x2B81F,              description = "CJK Unified Ideographs Extension D" },
253    ["cjkunifiedideographsextensione"]              = { first = 0x2B820, last = 0x2CEAF,              description = "CJK Unified Ideographs Extension E" },
254    ["cjkunifiedideographsextensionf"]              = { first = 0x2CEB0, last = 0x2EBEF,              description = "CJK Unified Ideographs Extension F" },
255    ["cjkunifiedideographsextensiong"]              = { first = 0x30000, last = 0x3134F,              description = "CJK Unified Ideographs Extension G" },
256    ["cjkunifiedideographsextensionh"]              = { first = 0x31350, last = 0x323AF,              description = "CJK Unified Ideographs Extension H" },
257    ["cjkunifiedideographsextensioni"]              = { first = 0x2EBF0, last = 0x2EE5F,              description = "CJK Unified Ideographs Extension I" },
258    ["combiningdiacriticalmarks"]                   = { first = 0x00300, last = 0x0036F,              description = "Combining Diacritical Marks" },
259    ["combiningdiacriticalmarksextended"]           = { first = 0x01AB0, last = 0x01AFF,              description = "Combining Diacritical Marks Extended" },
260    ["combiningdiacriticalmarksforsymbols"]         = { first = 0x020D0, last = 0x020FF,              description = "Combining Diacritical Marks for Symbols" },
261    ["combiningdiacriticalmarkssupplement"]         = { first = 0x01DC0, last = 0x01DFF,              description = "Combining Diacritical Marks Supplement" },
262    ["combininghalfmarks"]                          = { first = 0x0FE20, last = 0x0FE2F,              description = "Combining Half Marks" },
263    ["commonindicnumberforms"]                      = { first = 0x0A830, last = 0x0A83F,              description = "Common Indic Number Forms" },
264    ["controlpictures"]                             = { first = 0x02400, last = 0x0243F,              description = "Control Pictures" },
265    ["coptic"]                                      = { first = 0x02C80, last = 0x02CFF, otf="copt",  description = "Coptic" },
266    ["copticepactnumbers"]                          = { first = 0x102E0, last = 0x102FF,              description = "Coptic Epact Numbers" },
267    ["countingrodnumerals"]                         = { first = 0x1D360, last = 0x1D37F,              description = "Counting Rod Numerals" },
268    ["cuneiform"]                                   = { first = 0x12000, last = 0x123FF, otf="xsux",  description = "Cuneiform" },
269    ["cuneiformnumbersandpunctuation"]              = { first = 0x12400, last = 0x1247F, otf="xsux",  description = "Cuneiform Numbers and Punctuation" },
270    ["currencysymbols"]                             = { first = 0x020A0, last = 0x020CF,              description = "Currency Symbols" },
271    ["cypriotsyllabary"]                            = { first = 0x10800, last = 0x1083F, otf="cprt",  description = "Cypriot Syllabary" },
272    ["cyprominoan"]                                 = { first = 0x12F90, last = 0x12FFF,              description = "Cypro-Minoan" },
273    ["cyrillic"]                                    = { first = 0x00400, last = 0x004FF, otf="cyrl",  description = "Cyrillic" },
274    ["cyrillicextendeda"]                           = { first = 0x02DE0, last = 0x02DFF, otf="cyrl",  description = "Cyrillic Extended-A" },
275    ["cyrillicextendedb"]                           = { first = 0x0A640, last = 0x0A69F, otf="cyrl",  description = "Cyrillic Extended-B" },
276    ["cyrillicextendedc"]                           = { first = 0x01C80, last = 0x01C8F,              description = "Cyrillic Extended-C" },
277    ["cyrillicextendedd"]                           = { first = 0x1E030, last = 0x1E08F,              description = "Cyrillic Extended-D" },
278    ["cyrillicsupplement"]                          = { first = 0x00500, last = 0x0052F, otf="cyrl",  description = "Cyrillic Supplement" },
279    ["deseret"]                                     = { first = 0x10400, last = 0x1044F, otf="dsrt",  description = "Deseret" },
280    ["devanagari"]                                  = { first = 0x00900, last = 0x0097F, otf="deva",  description = "Devanagari" },
281    ["devanagariextended"]                          = { first = 0x0A8E0, last = 0x0A8FF,              description = "Devanagari Extended" },
282    ["devanagariextendeda"]                         = { first = 0x11B00, last = 0x11B5F,              description = "Devanagari Extended-A" },
283    ["digitsarabicindic"]                           = { first = 0x00660, last = 0x00669, math = true },
284 -- ["digitsbengali"]                               = { first = 0x009E6, last = 0x009EF, math = true },
285    ["digitsbold"]                                  = { first = 0x1D7CE, last = 0x1D7D7, math = true },
286 -- ["digitsdevanagari"]                            = { first = 0x00966, last = 0x0096F, math = true },
287    ["digitsdoublestruck"]                          = { first = 0x1D7D8, last = 0x1D7E1, math = true },
288 -- ["digitsethiopic"]                              = { first = 0x01369, last = 0x01371, math = true },
289    ["digitsextendedarabicindic"]                   = { first = 0x006F0, last = 0x006F9, math = true },
290 -- ["digitsgujarati"]                              = { first = 0x00AE6, last = 0x00AEF, math = true },
291 -- ["digitsgurmukhi"]                              = { first = 0x00A66, last = 0x00A6F, math = true },
292 -- ["digitskannada"]                               = { first = 0x00CE6, last = 0x00CEF, math = true },
293 -- ["digitskhmer"]                                 = { first = 0x017E0, last = 0x017E9, math = true },
294 -- ["digitslao"]                                   = { first = 0x00ED0, last = 0x00ED9, math = true },
295    ["digitslatin"]                                 = { first = 0x00030, last = 0x00039, math = true },
296 -- ["digitsmalayalam"]                             = { first = 0x00D66, last = 0x00D6F, math = true },
297 -- ["digitsmongolian"]                             = { first = 0x01810, last = 0x01809, math = true },
298    ["digitsmonospace"]                             = { first = 0x1D7F6, last = 0x1D7FF, math = true },
299 -- ["digitsmyanmar"]                               = { first = 0x01040, last = 0x01049, math = true },
300    ["digitsnormal"]                                = { first = 0x00030, last = 0x00039, math = true },
301 -- ["digitsoriya"]                                 = { first = 0x00B66, last = 0x00B6F, math = true },
302    ["digitssansserifbold"]                         = { first = 0x1D7EC, last = 0x1D7F5, math = true },
303    ["digitssansserifnormal"]                       = { first = 0x1D7E2, last = 0x1D7EB, math = true },
304 -- ["digitstamil"]                                 = { first = 0x00030, last = 0x00039, math = true }, -- no zero
305 -- ["digitstelugu"]                                = { first = 0x00C66, last = 0x00C6F, math = true },
306 -- ["digitsthai"]                                  = { first = 0x00E50, last = 0x00E59, math = true },
307 -- ["digitstibetan"]                               = { first = 0x00F20, last = 0x00F29, math = true },
308    ["dingbats"]                                    = { first = 0x02700, last = 0x027BF,              description = "Dingbats" },
309    ["divesakuru"]                                  = { first = 0x11900, last = 0x1195F,              description = "Dives Akuru" },
310    ["dogra"]                                       = { first = 0x11800, last = 0x1184F,              description = "Dogra" },
311    ["dominotiles"]                                 = { first = 0x1F030, last = 0x1F09F,              description = "Domino Tiles" },
312    ["duployan"]                                    = { first = 0x1BC00, last = 0x1BC9F,              description = "Duployan" },
313    ["earlydynasticcuneiform"]                      = { first = 0x12480, last = 0x1254F,              description = "Early Dynastic Cuneiform" },
314    ["egyptianhieroglyphformatcontrols"]            = { first = 0x13430, last = 0x1345F,              description = "Egyptian Hieroglyph Format Controls" },
315    ["egyptianhieroglyphs"]                         = { first = 0x13000, last = 0x1342F,              description = "Egyptian Hieroglyphs" },
316    ["egyptianhieroglyphsextendeda"]                = { first = 0x13460, last = 0x143FF,              description = "Egyptian Hieroglyphs Extended-A" },
317    ["elbasan"]                                     = { first = 0x10500, last = 0x1052F,              description = "Elbasan" },
318    ["elymaic"]                                     = { first = 0x10FE0, last = 0x10FFF,              description = "Elymaic" },
319    ["emoticons"]                                   = { first = 0x1F600, last = 0x1F64F,              description = "Emoticons" },
320    ["enclosedalphanumerics"]                       = { first = 0x02460, last = 0x024FF,              description = "Enclosed Alphanumerics" },
321    ["enclosedalphanumericsupplement"]              = { first = 0x1F100, last = 0x1F1FF,              description = "Enclosed Alphanumeric Supplement" },
322    ["enclosedcjklettersandmonths"]                 = { first = 0x03200, last = 0x032FF,              description = "Enclosed CJK Letters and Months" },
323    ["enclosedideographicsupplement"]               = { first = 0x1F200, last = 0x1F2FF,              description = "Enclosed Ideographic Supplement" },
324    ["ethiopic"]                                    = { first = 0x01200, last = 0x0137F, otf="ethi",  description = "Ethiopic" },
325    ["ethiopicextended"]                            = { first = 0x02D80, last = 0x02DDF, otf="ethi",  description = "Ethiopic Extended" },
326    ["ethiopicextendeda"]                           = { first = 0x0AB00, last = 0x0AB2F,              description = "Ethiopic Extended-A" },
327    ["ethiopicextendedb"]                           = { first = 0x1E7E0, last = 0x1E7FF,              description = "Ethiopic Extended-B" },
328    ["ethiopicsupplement"]                          = { first = 0x01380, last = 0x0139F, otf="ethi",  description = "Ethiopic Supplement" },
329    ["garay"]                                       = { first = 0x10D40, last = 0x10D8F,              description = "Garay" },
330    ["generalpunctuation"]                          = { first = 0x02000, last = 0x0206F,              description = "General Punctuation" },
331    ["geometricshapes"]                             = { first = 0x025A0, last = 0x025FF, math = true, description = "Geometric Shapes" },
332    ["geometricshapesextended"]                     = { first = 0x1F780, last = 0x1F7FF,              description = "Geometric Shapes Extended" },
333    ["georgian"]                                    = { first = 0x010A0, last = 0x010FF, otf="geor",  description = "Georgian" },
334    ["georgianextended"]                            = { first = 0x01C90, last = 0x01CBF,              description = "Georgian Extended" },
335    ["georgiansupplement"]                          = { first = 0x02D00, last = 0x02D2F, otf="geor",  description = "Georgian Supplement" },
336    ["glagolitic"]                                  = { first = 0x02C00, last = 0x02C5F, otf="glag",  description = "Glagolitic" },
337    ["glagoliticsupplement"]                        = { first = 0x1E000, last = 0x1E02F,              description = "Glagolitic Supplement" },
338    ["gothic"]                                      = { first = 0x10330, last = 0x1034F, otf="goth",  description = "Gothic" },
339    ["grantha"]                                     = { first = 0x11300, last = 0x1137F,              description = "Grantha" },
340    ["greekandcoptic"]                              = { first = 0x00370, last = 0x003FF, otf="grek",  description = "Greek and Coptic" },
341    ["greekextended"]                               = { first = 0x01F00, last = 0x01FFF, otf="grek",  description = "Greek Extended" },
342    ["gujarati"]                                    = { first = 0x00A80, last = 0x00AFF, otf="gujr",  description = "Gujarati" },
343    ["gunjalagondi"]                                = { first = 0x11D60, last = 0x11DAF,              description = "Gunjala Gondi" },
344    ["gurmukhi"]                                    = { first = 0x00A00, last = 0x00A7F, otf="guru",  description = "Gurmukhi" },
345    ["gurungkhema"]                                 = { first = 0x16100, last = 0x1613F,              description = "Gurung Khema" },
346    ["halfwidthandfullwidthforms"]                  = { first = 0x0FF00, last = 0x0FFEF,              description = "Halfwidth and Fullwidth Forms" },
347    ["hangulcompatibilityjamo"]                     = { first = 0x03130, last = 0x0318F, otf="jamo",  description = "Hangul Compatibility Jamo" },
348    ["hanguljamo"]                                  = { first = 0x01100, last = 0x011FF, otf="jamo",  description = "Hangul Jamo" },
349    ["hanguljamoextendeda"]                         = { first = 0x0A960, last = 0x0A97F,              description = "Hangul Jamo Extended-A" },
350    ["hanguljamoextendedb"]                         = { first = 0x0D7B0, last = 0x0D7FF,              description = "Hangul Jamo Extended-B" },
351    ["hangulsyllables"]                             = { first = 0x0AC00, last = 0x0D7AF, otf="hang",  description = "Hangul Syllables" },
352    ["hanifirohingya"]                              = { first = 0x10D00, last = 0x10D3F,              description = "Hanifi Rohingya" },
353    ["hanunoo"]                                     = { first = 0x01720, last = 0x0173F, otf="hano",  description = "Hanunoo" },
354    ["hatran"]                                      = { first = 0x108E0, last = 0x108FF,              description = "Hatran" },
355    ["hebrew"]                                      = { first = 0x00590, last = 0x005FF, otf="hebr",  description = "Hebrew" },
356    ["highprivateusesurrogates"]                    = { first = 0x0DB80, last = 0x0DBFF,              description = "High Private Use Surrogates" },
357    ["highsurrogates"]                              = { first = 0x0D800, last = 0x0DB7F,              description = "High Surrogates" },
358    ["hiragana"]                                    = { first = 0x03040, last = 0x0309F, otf="kana",  description = "Hiragana" },
359    ["ideographicdescriptioncharacters"]            = { first = 0x02FF0, last = 0x02FFF,              description = "Ideographic Description Characters" },
360    ["ideographicsymbolsandpunctuation"]            = { first = 0x16FE0, last = 0x16FFF,              description = "Ideographic Symbols and Punctuation" },
361    ["imperialaramaic"]                             = { first = 0x10840, last = 0x1085F,              description = "Imperial Aramaic" },
362    ["indicsiyaqnumbers"]                           = { first = 0x1EC70, last = 0x1ECBF,              description = "Indic Siyaq Numbers" },
363    ["inscriptionalpahlavi"]                        = { first = 0x10B60, last = 0x10B7F,              description = "Inscriptional Pahlavi" },
364    ["inscriptionalparthian"]                       = { first = 0x10B40, last = 0x10B5F,              description = "Inscriptional Parthian" },
365    ["ipaextensions"]                               = { first = 0x00250, last = 0x002AF,              description = "IPA Extensions" },
366    ["javanese"]                                    = { first = 0x0A980, last = 0x0A9DF,              description = "Javanese" },
367    ["kaithi"]                                      = { first = 0x11080, last = 0x110CF,              description = "Kaithi" },
368    ["kaktoviknumerals"]                            = { first = 0x1D2C0, last = 0x1D2DF,              description = "Kaktovik Numerals" },
369    ["kanaextendeda"]                               = { first = 0x1B100, last = 0x1B12F,              description = "Kana Extended-A" },
370    ["kanaextendedb"]                               = { first = 0x1AFF0, last = 0x1AFFF,              description = "Kana Extended-B" },
371    ["kanasupplement"]                              = { first = 0x1B000, last = 0x1B0FF,              description = "Kana Supplement" },
372    ["kanbun"]                                      = { first = 0x03190, last = 0x0319F,              description = "Kanbun" },
373    ["kangxiradicals"]                              = { first = 0x02F00, last = 0x02FDF,              description = "Kangxi Radicals" },
374    ["kannada"]                                     = { first = 0x00C80, last = 0x00CFF, otf="knda",  description = "Kannada" },
375    ["katakana"]                                    = { first = 0x030A0, last = 0x030FF, otf="kana",  description = "Katakana" },
376    ["katakanaphoneticextensions"]                  = { first = 0x031F0, last = 0x031FF, otf="kana",  description = "Katakana Phonetic Extensions" },
377    ["kayahli"]                                     = { first = 0x0A900, last = 0x0A92F,              description = "Kayah Li" },
378    ["kawi"]                                        = { first = 0x11F00, last = 0x11F5F,              description = "Kawi" },
379    ["kharoshthi"]                                  = { first = 0x10A00, last = 0x10A5F, otf="khar",  description = "Kharoshthi" },
380    ["khitansmallscript"]                           = { first = 0x18B00, last = 0x18CFF,              description = "Khitan Small Script" },
381    ["khmer"]                                       = { first = 0x01780, last = 0x017FF, otf="khmr",  description = "Khmer" },
382    ["khmersymbols"]                                = { first = 0x019E0, last = 0x019FF, otf="khmr",  description = "Khmer Symbols" },
383    ["khojki"]                                      = { first = 0x11200, last = 0x1124F,              description = "Khojki" },
384    ["khudawadi"]                                   = { first = 0x112B0, last = 0x112FF,              description = "Khudawadi" },
385    ["kiratrai"]                                    = { first = 0x16D40, last = 0x16D7F,              description = "Kirat Rai" },
386    ["lao"]                                         = { first = 0x00E80, last = 0x00EFF, otf="lao",   description = "Lao" },
387    ["latinextendeda"]                              = { first = 0x00100, last = 0x0017F, otf="latn",  description = "Latin Extended-A" },
388    ["latinextendedadditional"]                     = { first = 0x01E00, last = 0x01EFF, otf="latn",  description = "Latin Extended Additional" },
389    ["latinextendedb"]                              = { first = 0x00180, last = 0x0024F, otf="latn",  description = "Latin Extended-B" },
390    ["latinextendedc"]                              = { first = 0x02C60, last = 0x02C7F, otf="latn",  description = "Latin Extended-C" },
391    ["latinextendedd"]                              = { first = 0x0A720, last = 0x0A7FF, otf="latn",  description = "Latin Extended-D" },
392    ["latinextendede"]                              = { first = 0x0AB30, last = 0x0AB6F,              description = "Latin Extended-E" },
393    ["latinextendedf"]                              = { first = 0x10780, last = 0x107BF,              description = "Latin Extended-F" },
394    ["latinextendedg"]                              = { first = 0x1DF00, last = 0x1DFFF,              description = "Latin Extended-G" },
395    ["latinsupplement"]                             = { first = 0x00080, last = 0x000FF, otf="latn",  description = "Latin-1 Supplement" },
396    ["lepcha"]                                      = { first = 0x01C00, last = 0x01C4F,              description = "Lepcha" },
397    ["letterlikesymbols"]                           = { first = 0x02100, last = 0x0214F, math = true, description = "Letterlike Symbols" },
398    ["limbu"]                                       = { first = 0x01900, last = 0x0194F, otf="limb",  description = "Limbu" },
399    ["lineara"]                                     = { first = 0x10600, last = 0x1077F,              description = "Linear A" },
400    ["linearbideograms"]                            = { first = 0x10080, last = 0x100FF, otf="linb",  description = "Linear B Ideograms" },
401    ["linearbsyllabary"]                            = { first = 0x10000, last = 0x1007F, otf="linb",  description = "Linear B Syllabary" },
402    ["lisu"]                                        = { first = 0x0A4D0, last = 0x0A4FF,              description = "Lisu" },
403    ["lisusupplement"]                              = { first = 0x11FB0, last = 0x11FBF,              description = "Lisu Supplement" },
404    ["lowercasebold"]                               = { first = 0x1D41A, last = 0x1D433, math = true },
405    ["lowercaseboldfraktur"]                        = { first = 0x1D586, last = 0x1D59F, math = true },
406    ["lowercasebolditalic"]                         = { first = 0x1D482, last = 0x1D49B, math = true, italic = true },
407    ["lowercaseboldscript"]                         = { first = 0x1D4EA, last = 0x1D503, math = true, italic = true },
408    ["lowercasedoublestruck"]                       = { first = 0x1D552, last = 0x1D56B, math = true },
409    ["lowercasefraktur"]                            = { first = 0x1D51E, last = 0x1D537, math = true },
410    ["lowercasegreekbold"]                          = { first = 0x1D6C2, last = 0x1D6DB, math = true },
411    ["lowercasegreekbolditalic"]                    = { first = 0x1D736, last = 0x1D74F, math = true, italic = true },
412    ["lowercasegreekitalic"]                        = { first = 0x1D6FC, last = 0x1D715, math = true, italic = true },
413    ["lowercasegreeknormal"]                        = { first = 0x003B1, last = 0x003C9, math = true },
414    ["lowercasegreeksansserifbold"]                 = { first = 0x1D770, last = 0x1D789, math = true },
415    ["lowercasegreeksansserifbolditalic"]           = { first = 0x1D7AA, last = 0x1D7C3, math = true, italic = true },
416    ["lowercaseitalic"]                             = { first = 0x1D44E, last = 0x1D467, math = true, italic = true },
417    ["lowercasemonospace"]                          = { first = 0x1D68A, last = 0x1D6A3, math = true },
418    ["lowercasenormal"]                             = { first = 0x00061, last = 0x0007A, math = true },
419    ["lowercasesansserifbold"]                      = { first = 0x1D5EE, last = 0x1D607, math = true },
420    ["lowercasesansserifbolditalic"]                = { first = 0x1D656, last = 0x1D66F, math = true, italic = true },
421    ["lowercasesansserifitalic"]                    = { first = 0x1D622, last = 0x1D63B, math = true, italic = true },
422    ["lowercasesansserifnormal"]                    = { first = 0x1D5BA, last = 0x1D5D3, math = true },
423    ["lowercasescript"]                             = { first = 0x1D4B6, last = 0x1D4CF, math = true, italic = true },
424    ["lowsurrogates"]                               = { first = 0x0DC00, last = 0x0DFFF,              description = "Low Surrogates" },
425    ["lycian"]                                      = { first = 0x10280, last = 0x1029F,              description = "Lycian" },
426    ["lydian"]                                      = { first = 0x10920, last = 0x1093F,              description = "Lydian" },
427    ["mahajani"]                                    = { first = 0x11150, last = 0x1117F,              description = "Mahajani" },
428    ["mahjongtiles"]                                = { first = 0x1F000, last = 0x1F02F,              description = "Mahjong Tiles" },
429    ["makasar"]                                     = { first = 0x11EE0, last = 0x11EFF,              description = "Makasar" },
430    ["malayalam"]                                   = { first = 0x00D00, last = 0x00D7F, otf="mlym",  description = "Malayalam" },
431    ["mandaic"]                                     = { first = 0x00840, last = 0x0085F, otf="mand",  description = "Mandaic" },
432    ["manichaean"]                                  = { first = 0x10AC0, last = 0x10AFF,              description = "Manichaean" },
433    ["marchen"]                                     = { first = 0x11C70, last = 0x11CBF,              description = "Marchen" },
434    ["masaramgondi"]                                = { first = 0x11D00, last = 0x11D5F,              description = "Masaram Gondi" },
435    ["mathematicalalphanumericsymbols"]             = { first = 0x1D400, last = 0x1D7FF, math = true, description = "Mathematical Alphanumeric Symbols" },
436    ["mathematicaloperators"]                       = { first = 0x02200, last = 0x022FF, math = true, description = "Mathematical Operators" },
437    ["mayannumerals"]                               = { first = 0x1D2E0, last = 0x1D2FF,              description = "Mayan Numerals" },
438    ["medefaidrin"]                                 = { first = 0x16E40, last = 0x16E9F,              description = "Medefaidrin" },
439    ["meeteimayek"]                                 = { first = 0x0ABC0, last = 0x0ABFF,              description = "Meetei Mayek" },
440    ["meeteimayekextensions"]                       = { first = 0x0AAE0, last = 0x0AAFF,              description = "Meetei Mayek Extensions" },
441    ["mendekikakui"]                                = { first = 0x1E800, last = 0x1E8DF,              description = "Mende Kikakui" },
442    ["meroiticcursive"]                             = { first = 0x109A0, last = 0x109FF,              description = "Meroitic Cursive" },
443    ["meroitichieroglyphs"]                         = { first = 0x10980, last = 0x1099F,              description = "Meroitic Hieroglyphs" },
444    ["miao"]                                        = { first = 0x16F00, last = 0x16F9F,              description = "Miao" },
445    ["miscellaneousmathematicalsymbolsa"]           = { first = 0x027C0, last = 0x027EF, math = true, description = "Miscellaneous Mathematical Symbols-A" },
446    ["miscellaneousmathematicalsymbolsb"]           = { first = 0x02980, last = 0x029FF, math = true, description = "Miscellaneous Mathematical Symbols-B" },
447    ["miscellaneoussymbols"]                        = { first = 0x02600, last = 0x026FF, math = true, description = "Miscellaneous Symbols" },
448    ["miscellaneoussymbolsandarrows"]               = { first = 0x02B00, last = 0x02BFF, math = true, description = "Miscellaneous Symbols and Arrows" },
449    ["miscellaneoussymbolsandpictographs"]          = { first = 0x1F300, last = 0x1F5FF,              description = "Miscellaneous Symbols and Pictographs" },
450    ["miscellaneoustechnical"]                      = { first = 0x02300, last = 0x023FF, math = true, description = "Miscellaneous Technical" },
451    ["modi"]                                        = { first = 0x11600, last = 0x1165F,              description = "Modi" },
452    ["modifiertoneletters"]                         = { first = 0x0A700, last = 0x0A71F,              description = "Modifier Tone Letters" },
453    ["mongolian"]                                   = { first = 0x01800, last = 0x018AF, otf="mong",  description = "Mongolian" },
454    ["mongoliansupplement"]                         = { first = 0x11660, last = 0x1167F,              description = "Mongolian Supplement" },
455    ["mro"]                                         = { first = 0x16A40, last = 0x16A6F,              description = "Mro" },
456    ["multani"]                                     = { first = 0x11280, last = 0x112AF,              description = "Multani" },
457    ["musicalsymbols"]                              = { first = 0x1D100, last = 0x1D1FF, otf="musc",  description = "Musical Symbols" },
458    ["myanmar"]                                     = { first = 0x01000, last = 0x0109F, otf="mymr",  description = "Myanmar" },
459    ["myanmarextendeda"]                            = { first = 0x0AA60, last = 0x0AA7F,              description = "Myanmar Extended-A" },
460    ["myanmarextendedb"]                            = { first = 0x0A9E0, last = 0x0A9FF,              description = "Myanmar Extended-B" },
461    ["myanmarextendedc"]                            = { first = 0x116D0, last = 0x116FF,              description = "Myanmar Extended-C" },
462    ["nabataean"]                                   = { first = 0x10880, last = 0x108AF,              description = "Nabataean" },
463    ["nagmundari"]                                  = { first = 0x1E4D0, last = 0x1E4FF,              description = "Nag Mundari" },
464    ["nandinagari"]                                 = { first = 0x119A0, last = 0x119FF,              description = "Nandinagari" },
465    ["newa"]                                        = { first = 0x11400, last = 0x1147F,              description = "Newa" },
466    ["newtailue"]                                   = { first = 0x01980, last = 0x019DF,              description = "New Tai Lue" },
467    ["nko"]                                         = { first = 0x007C0, last = 0x007FF, otf="nko",   description = "NKo" },
468    ["numberforms"]                                 = { first = 0x02150, last = 0x0218F,              description = "Number Forms" },
469    ["nushu"]                                       = { first = 0x1B170, last = 0x1B2FF,              description = "Nushu" },
470    ["nyiakengpuachuehmong"]                        = { first = 0x1E100, last = 0x1E14F,              description = "Nyiakeng Puachue Hmong" },
471    ["ogham"]                                       = { first = 0x01680, last = 0x0169F, otf="ogam",  description = "Ogham" },
472    ["olchiki"]                                     = { first = 0x01C50, last = 0x01C7F,              description = "Ol Chiki" },
473    ["oldhungarian"]                                = { first = 0x10C80, last = 0x10CFF,              description = "Old Hungarian" },
474    ["olditalic"]                                   = { first = 0x10300, last = 0x1032F, otf="ital",  description = "Old Italic" },
475    ["oldnortharabian"]                             = { first = 0x10A80, last = 0x10A9F,              description = "Old North Arabian" },
476    ["oldpermic"]                                   = { first = 0x10350, last = 0x1037F,              description = "Old Permic" },
477    ["oldpersian"]                                  = { first = 0x103A0, last = 0x103DF, otf="xpeo",  description = "Old Persian" },
478    ["oldsogdian"]                                  = { first = 0x10F00, last = 0x10F2F,              description = "Old Sogdian" },
479    ["oldsoutharabian"]                             = { first = 0x10A60, last = 0x10A7F,              description = "Old South Arabian" },
480    ["oldturkic"]                                   = { first = 0x10C00, last = 0x10C4F,              description = "Old Turkic" },
481    ["olduyghur"]                                   = { first = 0x10F70, last = 0x10FAF,              description = "Old Uyghur" },
482    ["olonal"]                                      = { first = 0x1E5D0, last = 0x1E5FF,              description = "Ol Onal" },
483    ["opticalcharacterrecognition"]                 = { first = 0x02440, last = 0x0245F,              description = "Optical Character Recognition" },
484    ["oriya"]                                       = { first = 0x00B00, last = 0x00B7F, otf="orya",  description = "Oriya" },
485    ["ornamentaldingbats"]                          = { first = 0x1F650, last = 0x1F67F,              description = "Ornamental Dingbats" },
486    ["osage"]                                       = { first = 0x104B0, last = 0x104FF,              description = "Osage" },
487    ["osmanya"]                                     = { first = 0x10480, last = 0x104AF, otf="osma",  description = "Osmanya" },
488    ["ottomansiyaqnumbers"]                         = { first = 0x1ED00, last = 0x1ED4F,              description = "Ottoman Siyaq Numbers" },
489    ["pahawhhmong"]                                 = { first = 0x16B00, last = 0x16B8F,              description = "Pahawh Hmong" },
490    ["palmyrene"]                                   = { first = 0x10860, last = 0x1087F,              description = "Palmyrene" },
491    ["paucinhau"]                                   = { first = 0x11AC0, last = 0x11AFF,              description = "Pau Cin Hau" },
492    ["phagspa"]                                     = { first = 0x0A840, last = 0x0A87F, otf="phag",  description = "Phags-pa" },
493    ["phaistosdisc"]                                = { first = 0x101D0, last = 0x101FF,              description = "Phaistos Disc" },
494    ["phoenician"]                                  = { first = 0x10900, last = 0x1091F, otf="phnx",  description = "Phoenician" },
495    ["phoneticextensions"]                          = { first = 0x01D00, last = 0x01D7F,              description = "Phonetic Extensions" },
496    ["phoneticextensionssupplement"]                = { first = 0x01D80, last = 0x01DBF,              description = "Phonetic Extensions Supplement" },
497    ["playingcards"]                                = { first = 0x1F0A0, last = 0x1F0FF,              description = "Playing Cards" },
498    ["privateusearea"]                              = { first = 0x0E000, last = 0x0F8FF,              description = "Private Use Area" },
499    ["psalterpahlavi"]                              = { first = 0x10B80, last = 0x10BAF,              description = "Psalter Pahlavi" },
500    ["rejang"]                                      = { first = 0x0A930, last = 0x0A95F,              description = "Rejang" },
501    ["ruminumeralsymbols"]                          = { first = 0x10E60, last = 0x10E7F,              description = "Rumi Numeral Symbols" },
502    ["runic"]                                       = { first = 0x016A0, last = 0x016FF, otf="runr",  description = "Runic" },
503    ["samaritan"]                                   = { first = 0x00800, last = 0x0083F,              description = "Samaritan" },
504    ["saurashtra"]                                  = { first = 0x0A880, last = 0x0A8DF,              description = "Saurashtra" },
505    ["sharada"]                                     = { first = 0x11180, last = 0x111DF,              description = "Sharada" },
506    ["shavian"]                                     = { first = 0x10450, last = 0x1047F, otf="shaw",  description = "Shavian" },
507    ["shorthandformatcontrols"]                     = { first = 0x1BCA0, last = 0x1BCAF,              description = "Shorthand Format Controls" },
508    ["siddham"]                                     = { first = 0x11580, last = 0x115FF,              description = "Siddham" },
509    ["sinhala"]                                     = { first = 0x00D80, last = 0x00DFF, otf="sinh",  description = "Sinhala" },
510    ["sinhalaarchaicnumbers"]                       = { first = 0x111E0, last = 0x111FF,              description = "Sinhala Archaic Numbers" },
511    ["smallformvariants"]                           = { first = 0x0FE50, last = 0x0FE6F,              description = "Small Form Variants" },
512    ["smallkanaextension"]                          = { first = 0x1B130, last = 0x1B16F,              description = "Small Kana Extension" },
513    ["sogdian"]                                     = { first = 0x10F30, last = 0x10F6F,              description = "Sogdian" },
514    ["sorasompeng"]                                 = { first = 0x110D0, last = 0x110FF,              description = "Sora Sompeng" },
515    ["soyombo"]                                     = { first = 0x11A50, last = 0x11AAF,              description = "Soyombo" },
516    ["spacingmodifierletters"]                      = { first = 0x002B0, last = 0x002FF,              description = "Spacing Modifier Letters" },
517    ["specials"]                                    = { first = 0x0FFF0, last = 0x0FFFF,              description = "Specials" },
518    ["sundanese"]                                   = { first = 0x01B80, last = 0x01BBF,              description = "Sundanese" },
519    ["sundanesesupplement"]                         = { first = 0x01CC0, last = 0x01CCF,              description = "Sundanese Supplement" },
520    ["sunuwar"]                                     = { first = 0x11BC0, last = 0x11BFF,              description = "Sunuwar" },
521    ["superscriptsandsubscripts"]                   = { first = 0x02070, last = 0x0209F,              description = "Superscripts and Subscripts" },
522    ["supplementalarrowsa"]                         = { first = 0x027F0, last = 0x027FF, math = true, description = "Supplemental Arrows-A" },
523    ["supplementalarrowsb"]                         = { first = 0x02900, last = 0x0297F, math = true, description = "Supplemental Arrows-B" },
524    ["supplementalarrowsc"]                         = { first = 0x1F800, last = 0x1F8FF, math = true, description = "Supplemental Arrows-C" },
525    ["supplementalmathematicaloperators"]           = { first = 0x02A00, last = 0x02AFF, math = true, description = "Supplemental Mathematical Operators" },
526    ["supplementalpunctuation"]                     = { first = 0x02E00, last = 0x02E7F,              description = "Supplemental Punctuation" },
527    ["supplementalsymbolsandpictographs"]           = { first = 0x1F900, last = 0x1F9FF,              description = "Supplemental Symbols and Pictographs" },
528    ["supplementaryprivateuseareaa"]                = { first = 0xF0000, last = 0xFFFFF,              description = "Supplementary Private Use Area-A" },
529    ["supplementaryprivateuseareab"]                = { first = 0x100000,last = 0x10FFFF,             description = "Supplementary Private Use Area-B" },
530    ["suttonsignwriting"]                           = { first = 0x1D800, last = 0x1DAAF,              description = "Sutton SignWriting" },
531    ["sylotinagri"]                                 = { first = 0x0A800, last = 0x0A82F, otf="sylo",  description = "Syloti Nagri" },
532    ["symbolsandpictographsextendeda"]              = { first = 0x1FA70, last = 0x1FAFF,              description = "Symbols and Pictographs Extended-A" },
533    ["symbolsforlegacycomputing"]                   = { first = 0x1FB00, last = 0x1FBFF,              description = "Symbols for Legacy Computing" },
534    ["symbolsforlegacycomputingsupplement"]         = { first = 0x1CC00, last = 0x1CEBF,              description = "Symbols for Legacy Computing Supplement" },
535    ["syriac"]                                      = { first = 0x00700, last = 0x0074F, otf="syrc",  description = "Syriac" },
536    ["syriacsupplement"]                            = { first = 0x00860, last = 0x0086F,              description = "Syriac Supplement" },
537    ["tagalog"]                                     = { first = 0x01700, last = 0x0171F, otf="tglg",  description = "Tagalog" },
538    ["tagbanwa"]                                    = { first = 0x01760, last = 0x0177F, otf="tagb",  description = "Tagbanwa" },
539    ["tags"]                                        = { first = 0xE0000, last = 0xE007F,              description = "Tags" },
540    ["taile"]                                       = { first = 0x01950, last = 0x0197F, otf="tale",  description = "Tai Le" },
541    ["taitham"]                                     = { first = 0x01A20, last = 0x01AAF,              description = "Tai Tham" },
542    ["taiviet"]                                     = { first = 0x0AA80, last = 0x0AADF,              description = "Tai Viet" },
543    ["taixuanjingsymbols"]                          = { first = 0x1D300, last = 0x1D35F,              description = "Tai Xuan Jing Symbols" },
544    ["takri"]                                       = { first = 0x11680, last = 0x116CF,              description = "Takri" },
545    ["tamil"]                                       = { first = 0x00B80, last = 0x00BFF, otf="taml",  description = "Tamil" },
546    ["tamilsupplement"]                             = { first = 0x11FC0, last = 0x11FFF,              description = "Tamil Supplement" },
547    ["tangut"]                                      = { first = 0x17000, last = 0x187FF,              description = "Tangut" },
548    ["tangutsupplement"]                            = { first = 0x18D00, last = 0x18D7F,              description = "Tangut Supplement" },
549    ["tangutcomponents"]                            = { first = 0x18800, last = 0x18AFF,              description = "Tangut Components" },
550    ["tangsa"]                                      = { first = 0x16A70, last = 0x16ACF,              description = "Tangsa" },
551    ["telugu"]                                      = { first = 0x00C00, last = 0x00C7F, otf="telu",  description = "Telugu" },
552    ["thaana"]                                      = { first = 0x00780, last = 0x007BF, otf="thaa",  description = "Thaana" },
553    ["thai"]                                        = { first = 0x00E00, last = 0x00E7F, otf="thai",  description = "Thai" },
554    ["tibetan"]                                     = { first = 0x00F00, last = 0x00FFF, otf="tibt",  description = "Tibetan" },
555    ["tifinagh"]                                    = { first = 0x02D30, last = 0x02D7F, otf="tfng",  description = "Tifinagh" },
556    ["tirhuta"]                                     = { first = 0x11480, last = 0x114DF,              description = "Tirhuta" },
557    ["todhri"]                                      = { first = 0x105C0, last = 0x105FF,              description = "Todhri" },
558    ["toto"]                                        = { first = 0x1E290, last = 0x1E2BF,              description = "Toto" },
559    ["transportandmapsymbols"]                      = { first = 0x1F680, last = 0x1F6FF,              description = "Transport and Map Symbols" },
560    ["tulutigalari"]                                = { first = 0x11380, last = 0x113FF,              description = "Tulu-Tigalari" },
561    ["ugaritic"]                                    = { first = 0x10380, last = 0x1039F, otf="ugar",  description = "Ugaritic" },
562    ["unifiedcanadianaboriginalsyllabics"]          = { first = 0x01400, last = 0x0167F, otf="cans",  description = "Unified Canadian Aboriginal Syllabics" },
563    ["unifiedcanadianaboriginalsyllabicsextended"]  = { first = 0x018B0, last = 0x018FF,              description = "Unified Canadian Aboriginal Syllabics Extended" },
564    ["unifiedcanadianaboriginalsyllabicsextendeda"] = { first = 0x11AB0, last = 0x11ABF,              description = "Unified Canadian Aboriginal Syllabics Extended-A" },
565    ["uppercasebold"]                               = { first = 0x1D400, last = 0x1D419, math = true },
566    ["uppercaseboldfraktur"]                        = { first = 0x1D56C, last = 0x1D585, math = true },
567    ["uppercasebolditalic"]                         = { first = 0x1D468, last = 0x1D481, math = true, italic = true },
568    ["uppercaseboldscript"]                         = { first = 0x1D4D0, last = 0x1D4E9, math = true, italic = true },
569    ["uppercasedoublestruck"]                       = { first = 0x1D538, last = 0x1D551, math = true }, -- gaps are filled in elsewhere
570    ["uppercasefraktur"]                            = { first = 0x1D504, last = 0x1D51D, math = true },
571    ["uppercasegreekbold"]                          = { first = 0x1D6A8, last = 0x1D6C1, math = true },
572    ["uppercasegreekbolditalic"]                    = { first = 0x1D71C, last = 0x1D735, math = true, italic = true },
573    ["uppercasegreekitalic"]                        = { first = 0x1D6E2, last = 0x1D6FB, math = true, italic = true },
574    ["uppercasegreeknormal"]                        = { first = 0x00391, last = 0x003AA, math = true },
575    ["uppercasegreeksansserifbold"]                 = { first = 0x1D756, last = 0x1D76F, math = true },
576    ["uppercasegreeksansserifbolditalic"]           = { first = 0x1D790, last = 0x1D7A9, math = true, italic = true },
577    ["uppercaseitalic"]                             = { first = 0x1D434, last = 0x1D44D, math = true, italic = true },
578    ["uppercasemonospace"]                          = { first = 0x1D670, last = 0x1D689, math = true },
579    ["uppercasenormal"]                             = { first = 0x00041, last = 0x0005A, math = true },
580    ["uppercasesansserifbold"]                      = { first = 0x1D5D4, last = 0x1D5ED, math = true },
581    ["uppercasesansserifbolditalic"]                = { first = 0x1D63C, last = 0x1D655, math = true, italic = true },
582    ["uppercasesansserifitalic"]                    = { first = 0x1D608, last = 0x1D621, math = true, italic = true },
583    ["uppercasesansserifnormal"]                    = { first = 0x1D5A0, last = 0x1D5B9, math = true },
584    ["uppercasescript"]                             = { first = 0x1D49C, last = 0x1D4B5, math = true, italic = true },
585    ["vai"]                                         = { first = 0x0A500, last = 0x0A63F,              description = "Vai" },
586    ["variationselectors"]                          = { first = 0x0FE00, last = 0x0FE0F,              description = "Variation Selectors" },
587    ["variationselectorssupplement"]                = { first = 0xE0100, last = 0xE01EF,              description = "Variation Selectors Supplement" },
588    ["vedicextensions"]                             = { first = 0x01CD0, last = 0x01CFF,              description = "Vedic Extensions" },
589    ["verticalforms"]                               = { first = 0x0FE10, last = 0x0FE1F,              description = "Vertical Forms" },
590    ["vithkuqi"]                                    = { first = 0x10570, last = 0x105BF,              description = "Vithkuqi" },
591    ["wancho"]                                      = { first = 0x1E2C0, last = 0x1E2FF,              description = "Wancho" },
592    ["warangciti"]                                  = { first = 0x118A0, last = 0x118FF,              description = "Warang Citi" },
593    ["yezidi"]                                      = { first = 0x10E80, last = 0x10EBF,              description = "Yezidi" },
594    ["yijinghexagramsymbols"]                       = { first = 0x04DC0, last = 0x04DFF, otf="yi",    description = "Yijing Hexagram Symbols" },
595    ["yiradicals"]                                  = { first = 0x0A490, last = 0x0A4CF, otf="yi",    description = "Yi Radicals" },
596    ["yisyllables"]                                 = { first = 0x0A000, last = 0x0A48F, otf="yi",    description = "Yi Syllables" },
597    ["zanabazarsquare"]                             = { first = 0x11A00, last = 0x11A4F,              description = "Zanabazar Square" },
598    ["znamennymusicalnotation"]                     = { first = 0x1CF00, last = 0x1CFCF,              description = "Znamenny Musical Notation" },
599
600    -- The calligraphic shapes are different from script shapes but don't have a dedicated
601    -- range so we make one. An example of a font that has them is Lucida but we also drop
602    -- them into other fonts.
603
604    ["lowercasecalligraphic"]     = { first = 0x100000, last = 0x100019, math = true },
605    ["uppercasecalligraphic"]     = { first = 0x100020, last = 0x100039, math = true },
606    ["lowercaseboldcalligraphic"] = { first = 0x100040, last = 0x100059, math = true },
607    ["uppercaseboldcalligraphic"] = { first = 0x100060, last = 0x100079, math = true },
608
609    -- At the same time we reserve(d) some extra greek alphabets and surprise, a font like
610    -- stixtwo actually has them, so we enable them now.
611
612    ["lowercasesansgreek"]        = { first = 0x100080, last = 0x100099, math = true },
613    ["uppercasesansgreek"]        = { first = 0x1000A0, last = 0x1000B9, math = true },
614    ["lowercaseitalicsansgreek"]  = { first = 0x1000C0, last = 0x1000D9, math = true },
615    ["uppercaseitalicsansgreek"]  = { first = 0x1000E0, last = 0x1000F9, math = true },
616
617    -- Maybe this one also makes sense, although the fact that all these extra alphabets
618    -- were not made part of unicode math (combined with the holes in alphabets) indicates
619    -- that usage was not perceived.
620
621    ["lowercaseblackboarditalic"] = { first = 0x100100, last = 0x100119, math = true },
622    ["uppercaseblackboarditalic"] = { first = 0x100120, last = 0x100139, math = true },
623
624    -- Anyway, all permutations at some point might show up, but it might take decades
625    -- before the tex math dev community catches on. In the end it is up to Microsoft to
626    -- take the lead, just as with other unicode math and fonts.
627    --
628    -- Also, it would be a bit pathetic to add more alphabets natively to unicode on
629    -- the one hand while accepting these gaps in existing alphabets and not having a
630    -- native upright greek math alphabet either in order to distinguish from greek
631    -- text. Either we go semantic or go shapes, but a mix is only confusing.
632
633}
634
635-- moved from math-act.lua to here:
636
637-- operators    : 0x02200
638-- symbolsa     : 0x02701
639-- symbolsb     : 0x02901
640-- supplemental : 0x02A00
641
-- Some slots inside the math alphabet ranges are holes. Per alphabet we register a
-- 'gaps' table that maps such a slot onto the codepoint that has to be used instead
-- (all of them live in the letterlike symbols range 0x2100-0x214F).

do

    local gaps = {
        lowercaseitalic = {
            [0x1D455] = 0x0210E, -- ℎ h
        },
        uppercasescript = {
            [0x1D49D] = 0x0212C, -- ℬ script B
            [0x1D4A0] = 0x02130, -- ℰ script E
            [0x1D4A1] = 0x02131, -- ℱ script F
            [0x1D4A3] = 0x0210B, -- ℋ script H
            [0x1D4A4] = 0x02110, -- ℐ script I
            [0x1D4A7] = 0x02112, -- ℒ script L
            [0x1D4A8] = 0x02133, -- ℳ script M
            [0x1D4AD] = 0x0211B, -- ℛ script R
        },
        lowercasescript = {
            [0x1D4BA] = 0x0212F, -- ℯ script e
            [0x1D4BC] = 0x0210A, -- ℊ script g
            [0x1D4C4] = 0x02134, -- ℴ script o
        },
        uppercasefraktur = {
            [0x1D506] = 0x0212D, -- ℭ fraktur C
            [0x1D50B] = 0x0210C, -- ℌ fraktur H
            [0x1D50C] = 0x02111, -- ℑ fraktur I
            [0x1D515] = 0x0211C, -- ℜ fraktur R
            [0x1D51D] = 0x02128, -- ℨ fraktur Z
        },
        uppercasedoublestruck = {
            [0x1D53A] = 0x02102, -- ℂ bb C
            [0x1D53F] = 0x0210D, -- ℍ bb H
            [0x1D545] = 0x02115, -- ℕ bb N
            [0x1D547] = 0x02119, -- ℙ bb P
            [0x1D548] = 0x0211A, -- ℚ bb Q
            [0x1D549] = 0x0211D, -- ℝ bb R
            [0x1D551] = 0x02124, -- ℤ bb Z
        },
    }

    -- the assignments are independent so the traversal order doesn't matter

    for name, list in next, gaps do
        blocks[name].gaps = list
    end

end

characters.blocks = blocks
682
-- Returns the first and last slot of the block registered under 'name'. An
-- unknown name gives 0, 0 so that callers can loop over the result unconditionally.

function characters.blockrange(name)
    local block = blocks[name]
    if not block then
        return 0, 0
    end
    return block.first, block.last
end
691
-- Lenient block lookup: when a key is not found as-is we normalize it (drop
-- everything but letters and digits, lowercase) and try again. Digits are kept,
-- just like characters.getrange does, so that names carrying a number can be
-- resolved. When that fails we fall back on the older behaviour where digits were
-- stripped too. Only strings are normalized; other keys simply give nil.

setmetatableindex(blocks, function(t,k) -- we could use an intermediate table if called often
    if type(k) ~= "string" then
        return nil
    end
    local name  = lower((gsub(k,"[^a-zA-Z0-9]","")))
    local found = rawget(t,name)
    if found == nil then
        -- backward compatible: the previous normalizer removed digits as well
        found = rawget(t,(gsub(name,"%d","")))
    end
    return found
end)
695
local otfscripts      = utilities.storage.allocate()
characters.otfscripts = otfscripts

-- Lazy unicode -> otf script tag table. On a miss we look for the block that
-- holds the slot and then tag the whole block range in one go, so later lookups
-- in that block are plain table hits. Slots outside any block become "dflt".

setmetatableindex(otfscripts,function(t,unicode)
    for _, block in next, blocks do
        local first, last = block.first, block.last
        if first <= unicode and unicode <= last then
            local script = block.otf or "dflt"
            for slot=first,last do
                t[slot] = script
            end
            return script
        end
    end
    -- pretty slow when we're here: all blocks were checked
    t[unicode] = "dflt"
    return "dflt"
end)
715
local splitter1 = lpeg.splitat(S(":-"))
local splitter2 = lpeg.splitat(S(" +-"),true)

-- Resolves a block name, a range or a single slot. This is used in font fallback
-- definitions (name or range).
--
-- name       : block name ("lowercaseitalic"), range ("0x100-0x200", "100:200")
--              or slot; a double quote is accepted as tex hex prefix
-- expression : when true, also accept a number or "blockname <offset expression>"
--
-- Returns first, last, description, gaps for a known block, otherwise first,
-- last, nil. When nothing can be parsed the first two values are nil.

function characters.getrange(name,expression) -- used in font fallback definitions (name or range)
    local range = rawget(blocks,lower((gsub(name,"[^a-zA-Z0-9]",""))))
    if range then
        return range.first, range.last, range.description, range.gaps
    end
    name = gsub(name,'"',"0x") -- goodie: tex hex notation
    if expression then
        local n = tonumber(name)
        if n then
            return n, n, nil
        else
            local first, rest = lpegmatch(splitter2,name)
            -- without a separator there is no 'rest' and concatenating nil would
            -- raise an error, so we only continue when both parts are present
            if first and rest then
                local range = rawget(blocks,lower((gsub(first,"[^a-zA-Z0-9]",""))))
                if range then
                    -- NOTE(review): this evaluates the tail of the string as lua code, so
                    -- the definitions passed in here have to come from a trusted source
                    local s = loadstring("return 0 " .. rest)
                    if type(s) == "function" then
                        local d = s()
                        if type(d) == "number" then
                            return range.first + d, range.last + d, nil
                        end
                    end
                end
            end
        end
    end
    local start, stop = lpegmatch(splitter1,name)
    if start and stop then
        -- hex is tried first, so a bare "10" means 0x10
        start = tonumber(start,16) or tonumber(start)
        stop  = tonumber(stop, 16) or tonumber(stop)
        if start and stop then
            return start, stop, nil
        end
    end
    local slot = tonumber(name,16) or tonumber(name)
    return slot, slot, nil
end
755
756-- print(characters.getrange("lowercaseitalic + 123",true))
757-- print(characters.getrange("lowercaseitalic + 124",true))
758
-- Verbose names for the two letter (lowercase) category tags. These are what
-- characters.category returns when asked for the verbose variant.

local categorytags = allocate {
    lu = "Letter Uppercase",
    ll = "Letter Lowercase",
    lt = "Letter Titlecase",
    lm = "Letter Modifier",
    lo = "Letter Other",
    mn = "Mark Nonspacing",
    mc = "Mark Spacing Combining",
    me = "Mark Enclosing",
    nd = "Number Decimal Digit",
    nl = "Number Letter",
    no = "Number Other",
    pc = "Punctuation Connector",
    pd = "Punctuation Dash",
    ps = "Punctuation Open",
    pe = "Punctuation Close",
    pi = "Punctuation Initial Quote",
    pf = "Punctuation Final Quote",
    po = "Punctuation Other",
    sm = "Symbol Math",
    sc = "Symbol Currency",
    sk = "Symbol Modifier",
    so = "Symbol Other",
    zs = "Separator Space",
    zl = "Separator Line",
    zp = "Separator Paragraph",
    cc = "Other Control",
    cf = "Other Format",
    cs = "Other Surrogate",
    co = "Other Private Use",
    cn = "Other Not Assigned",
}

-- Verbose names for the detail tags (presumably a finer classification kept in the
-- character data; the field itself is not used in this part of the file).

local detailtags = allocate {
    sl = "small letter",
    bl = "big letter",
    im = "iteration mark",
    pm = "prolonged sound mark"
}

characters.categorytags = categorytags
characters.detailtags   = detailtags
801
802-- sounds : voiced unvoiced semivoiced
803
804--~ special   : cf (softhyphen) zs (emspace)
805--~ characters: ll lm lo lt lu mn nl no pc pd pe pf pi po ps sc sk sm so
806
-- Category sets: each table is a hash (tag -> true) built from a list of category
-- tags, so membership is tested with e.g. is_letter[category].

-- NOTE(review): "nl" and "no" are listed twice below; harmless because the list is
-- turned into a hash.

local is_character = allocate ( tohash {
    "lu","ll","lt","lm","lo",
    "nd","nl","no",
    "mn",
    "nl","no",
    "pc","pd","ps","pe","pi","pf","po",
    "sm","sc","sk","so"
} )

local is_letter = allocate ( tohash {
    "ll","lm","lo","lt","lu"
} )

local is_command = allocate ( tohash {
    "cf","zs"
} )

local is_spacing = allocate ( tohash {
    "zs", "zl","zp",
} )

-- NOTE(review): "ms" has no entry in categorytags while "mc" (Mark Spacing
-- Combining) has; confirm that "ms" is intended here.

local is_mark = allocate ( tohash {
    "mn", "ms", -- "mn", "mc",
} )

local is_punctuation = allocate ( tohash {
    "pc", "pd", "ps", "pe", "pi", "pf", "po",
} )

local is_hyphenator = allocate ( tohash {
    "pd",
} )

local is_symbol = allocate ( tohash {
    "sm", "sc", "sk", "so",
} )

-- punctuation and separators

local is_nothing = allocate ( tohash {
    "pc", "pd", "ps", "pe", "pi", "pf", "po",
    "zs", "zl","zp",
} )

local can_have_space = allocate ( tohash {
    "lu", "ll", "lt", "lm", "lo", -- letters
 -- "mn", "mc", "me",             -- marks
    "nd", "nl", "no",             -- numbers
    "ps", "pi",                   -- initial
 -- "pe", "pf",                   -- final
 -- "pc", "pd", "po",             -- punctuation
    "sm", "sc", "sk", "so",       -- symbols
 -- "zs", "zl", "zp",             -- separators
 -- "cc", "cf", "cs", "co", "cn", -- others
} )


-- to be redone: store checked characters

-- public access to the sets

characters.is_character   = is_character
characters.is_letter      = is_letter
characters.is_command     = is_command
characters.is_spacing     = is_spacing
characters.is_mark        = is_mark
characters.is_punctuation = is_punctuation
characters.is_hyphenator  = is_hyphenator
characters.is_symbol      = is_symbol
characters.is_nothing     = is_nothing
characters.can_have_space = can_have_space
874
-- Shared index method for the category sets: a numeric key is taken to be a
-- unicode slot, so we fetch its category from the character data and test that
-- tag in the set itself. Other keys give nothing, which avoids auto conversion in
-- data.characters lookups.

local mti = function(t,k)
    if type(k) ~= "number" then
        return
    end
    local category = data[k].category
    return category and rawget(t,category)
end

do

    -- the sets that accept a unicode slot as key (same ones and same order as before)

    local sets = {
        characters.is_character,
        characters.is_letter,
        characters.is_command,
        characters.is_spacing,
        characters.is_punctuation,
        characters.is_hyphenator,
        characters.is_symbol,
        characters.can_have_space,
    }

    for i=1,#sets do
        setmetatableindex(sets[i],mti)
    end

end
892
893-- todo: also define callers for the above
894
895-- linebreak: todo: hash
896--
897-- normative   : BK CR LF CM SG GL CB SP ZW NL WJ JL JV JT H2 H3
898-- informative : XX OP CL CP QU NS EX SY IS PR PO NU AL ID IN HY BB BA SA AI B2 HL CJ RI
899--
900-- U+03400..U+04DBF ID
901-- U+04E00..U+09FFF ID
902-- U+0F900..U+0FAFF ID
903-- U+20000..U+2FFFD ID
904-- U+30000..U+3FFFD ID
905-- U+1F000..U+1FAFF ID
906-- U+1FC00..U+1FFFD ID
907-- U+020A0..U+020CF PR
908
-- Verbose descriptions of the (lowercase) line break class tags. The trailing
-- comments summarize which characters belong to a class and how they behave.

characters.linebreaks = allocate {

    -- non-tailorable line breaking classes

    ["bk"]  = "mandatory break",                             -- nl, ps : cause a line break (after)
    ["cr"]  = "carriage return",                             -- cr : cause a line break (after), except between cr and lf
    ["lf"]  = "line feed",                                   -- lf : cause a line break (after)
    ["cm"]  = "combining mark",                              -- combining marks, control codes : prohibit a line break between the character and the preceding character
    ["nl"]  = "next line",                                   -- nel : cause a line break (after)
    ["sg"]  = "surrogate",                                   -- surrogates :do not occur in well-formed text
    ["wj"]  = "word joiner",                                 -- wj : prohibit line breaks before and after
    ["zw"]  = "zero width space",                            -- zwsp : provide a break opportunity
    ["gl"]  = "non-breaking (glue)",                         -- cgj, nbsp, zwnbsp : prohibit line breaks before and after
    ["sp"]  = "space",                                       -- space : enable indirect line breaks
    ["zwj"] = "zero width joiner",                           -- prohibit line breaks within joiner sequences

    -- break opportunities

    ["b2"] = "break opportunity before and after",           -- em dash : provide a line break opportunity before and after the character
    ["ba"] = "break after",                                  -- spaces, hyphens : generally provide a line break opportunity after the character
    ["bb"] = "break before",                                 -- punctuation used in dictionaries : generally provide a line break opportunity before the character
    ["hy"] = "hyphen",                                       -- hyphen-minus : provide a line break opportunity after the character, except in numeric context
    ["cb"] = "contingent break opportunity",                 -- inline objects : provide a line break opportunity contingent on additional information

    -- characters prohibiting certain breaks

    ["cl"] = "close punctuation",                            -- “}”, “❳”, “⟫” etc. : prohibit line breaks before
    ["cp"] = "close parenthesis",                            -- “)”, “]” : prohibit line breaks before
    ["ex"] = "exclamation/interrogation",                    -- “!”, “?”, etc. : prohibit line breaks before
    ["in"] = "inseparable",                                  -- leaders : allow only indirect line breaks between pairs
    ["ns"] = "nonstarter",                                   -- “‼”, “‽”, “⁇”, “⁉”, etc. : allow only indirect line breaks before
    ["op"] = "open punctuation",                             -- “(“, “[“, “{“, etc. : prohibit line breaks after
    ["qu"] = "quotation",                                    -- quotation marks : act like they are both opening and closing

    -- numeric context

    ["is"] = "infix numeric separator",                      -- . , : prevent breaks after any and before numeric
    ["nu"] = "numeric",                                      -- digits : form numeric expressions for line breaking purposes
    ["po"] = "postfix numeric",                              -- %, ¢ : do not break following a numeric expression
    ["pr"] = "prefix numeric",                               -- $, £, ¥, etc. : do not break in front of a numeric expression
    ["sy"] = "symbols allowing break after",                 -- / : prevent a break before, and allow a break after

    -- other characters

    ["ai"] = "ambiguous (alphabetic or ideographic)",        -- characters with ambiguous east asian width : act like al when the resolved eaw is n; otherwise, act as id
    ["ak"] = "aksara",                                       -- Consonants
    ["al"] = "alphabetic",                                   -- alphabets and regular symbols : are alphabetic characters or symbols that are used with alphabetic characters
    ["ap"] = "aksara pre-base",                              -- pre-base repha (was misspelled as "pre-pase")
    ["as"] = "aksara start",                                 -- independent vowels (was misspelled as "ksara start")
    ["cj"] = "conditional japanese starter",                 -- small kana : treat as ns or id for strict or normal breaking.
    ["eb"] = "emoji base",                                   -- all emoji allowing modifiers, do not break from following emoji modifier
    ["em"] = "emoji modifier",                               -- skin tone modifiers, do not break from preceding emoji base
    ["h2"] = "hangul lv syllable",                           -- hangul : form korean syllable blocks
    ["h3"] = "hangul lvt syllable",                          -- hangul : form korean syllable blocks
    ["hl"] = "hebrew letter",                                -- hebrew : do not break around a following hyphen; otherwise act as alphabetic
    ["id"] = "ideographic",                                  -- ideographs : break before or after, except in some numeric context
    ["jl"] = "hangul l jamo",                                -- conjoining jamo : form korean syllable blocks
    ["jt"] = "hangul t jamo",                                -- conjoining jamo : form korean syllable blocks
    ["jv"] = "hangul v jamo",                                -- conjoining jamo : form korean syllable blocks
    ["ri"] = "regional indicator",                           -- regional indicator symbol letter a .. z : keep together, break before and after from others
    ["sa"] = "complex context dependent (south east asian)", -- south east asian: thai, lao, khmer : provide a line break opportunity contingent on additional, language-specific context analysis
    ["vf"] = "virama final",                                 -- Viramas for final consonants
    ["vi"] = "virama",                                       -- Conjoining viramas
    ["xx"] = "unknown",                                      -- most unassigned, private-use : have as yet unknown line breaking behavior or unassigned code positions

}
975
976-- east asian width:
977--
978-- N A H W F Na
979
-- Verbose names for the (lowercase) bidirectional class tags.

characters.bidi = allocate {
    l   = "Left-to-Right",
    lre = "Left-to-Right Embedding",
    lro = "Left-to-Right Override",
    r   = "Right-to-Left",
    al  = "Right-to-Left Arabic",
    rle = "Right-to-Left Embedding",
    rlo = "Right-to-Left Override",
    pdf = "Pop Directional Format",
    en  = "European Number",
    es  = "European Number Separator",
    et  = "European Number Terminator",
    an  = "Arabic Number",
    cs  = "Common Number Separator",
    nsm = "Non-Spacing Mark",
    bn  = "Boundary Neutral",
    b   = "Paragraph Separator",
    s   = "Segment Separator",
    ws  = "Whitespace",
    on  = "Other Neutrals",
}
1001
1002-- At this point we assume that the big data table is loaded. From this table we
1003-- derive a few more.
1004
-- Two way mapping between combining accents and their spacing counterparts. The
-- table is only built when it is not already present (it gets registered for
-- storage below). Every pair is symmetric: combining -> spacing and spacing ->
-- combining.

if not characters.fallbacks then

    characters.fallbacks = allocate {
        [0x0308] = 0x00A8, [0x00A8] = 0x0308, -- dieresiscmb      dieresis
        [0x0304] = 0x00AF, [0x00AF] = 0x0304, -- macroncmb        macron
        [0x0301] = 0x00B4, [0x00B4] = 0x0301, -- acutecomb        acute
        [0x0327] = 0x00B8, [0x00B8] = 0x0327, -- cedillacmb       cedilla
        [0x0302] = 0x02C6, [0x02C6] = 0x0302, -- circumflexcmb    circumflex
        [0x030C] = 0x02C7, [0x02C7] = 0x030C, -- caroncmb         caron
        [0x0306] = 0x02D8, [0x02D8] = 0x0306, -- brevecmb         breve
        [0x0307] = 0x02D9, [0x02D9] = 0x0307, -- dotaccentcmb     dotaccent
        [0x030A] = 0x02DA, [0x02DA] = 0x030A, -- ringcmb          ring
        [0x0328] = 0x02DB, [0x02DB] = 0x0328, -- ogonekcmb        ogonek
        [0x0303] = 0x02DC, [0x02DC] = 0x0303, -- tildecomb        tilde
        [0x030B] = 0x02DD, [0x02DD] = 0x030B, -- hungarumlautcmb  hungarumlaut
        [0x0305] = 0x203E, [0x203E] = 0x0305, -- overlinecmb      overline
        -- the reverse entry used to point to 0x0333 (which belongs to the 0x2017
        -- pair that is listed as 'not done' below) instead of back to 0x0300
        [0x0300] = 0x0060, [0x0060] = 0x0300, -- gravecomb        grave
    }

    -- not done (would mess up mapping):
    --
    -- 0X0301/0X0384 0X0314/0X1FFE 0X0313/0X1FBD 0X0313/0X1FBF 0X0342/0X1FC0
    -- 0X3099/0X309B 0X309A/0X309C 0X0333/0X2017 0X0345/0X037A

end

if storage then -- in case we extend
    storage.register("characters/fallbacks", characters.fallbacks, "characters.fallbacks") -- accents and such
end
1034
-- Three lazy property tables keyed by unicode slot. On first access the property
-- is taken from the character data and remembered; when there is no entry or no
-- such property we remember (and return) false.

characters.directions  = { }
characters.mirrors     = { }
characters.textclasses = { }

do

    local function lazyproperty(field)
        return function(t,k)
            local d = data[k]
            local v = d and d[field]
            if not v then
                v = false -- for directions maybe 'l'
            end
            t[k] = v
            return v
        end
    end

    setmetatableindex(characters.directions,  lazyproperty("direction"))
    setmetatableindex(characters.mirrors,     lazyproperty("mirror"))
    setmetatableindex(characters.textclasses, lazyproperty("textclass"))

end
1079
1080-- Next comes a whole series of helper methods. These are (will be) part of the
1081-- official API.
1082
1083-- we could make them virtual: characters.contextnames[n]
1084
-- Simple property accessors: each returns the requested string field of slot 'n'
-- or an empty string when the slot or the field is missing.

function characters.contextname(n)
    local d = data[n]
    return d and d.contextname or ""
end

function characters.adobename(n)
    local d = data[n]
    return d and d.adobename or ""
end

function characters.description(n)
    local d = data[n]
    return d and d.description or ""
end
1088-------- characters.category   (n) return data[n] and data[n].category    or "" end
1089
-- Returns the category tag of slot 'n', or its verbose name (from categorytags)
-- when 'verbose' is set. An empty string is returned when there is no category.
-- Like the other accessors we don't assume that there is an entry for 'n'.

function characters.category(n,verbose)
    local d = data[n]
    local c = d and d.category
    if not c then
        return ""
    elseif verbose then
        return categorytags[c]
    else
        return c
    end
end
1100
1101-- -- some day we will make a table .. not that many calls to utfchar
1102--
1103-- local utfchar = utf.char
1104-- local utfbyte = utf.byte
1105-- local utfbytes = { }
1106-- local utfchars = { }
1107--
1108-- table.setmetatableindex(utfbytes,function(t,k) local v = utfchar(k) t[k] = v return v end)
1109-- table.setmetatableindex(utfchars,function(t,k) local v = utfbyte(k) t[k] = v return v end)
1110
-- Converts a code point or a list of code points into an utf string.

local function toutfstring(s)
    if type(s) ~= "table" then
        return utfchar(s)
    end
    return utfchar(unpack(s)) -- concat { utfchar( unpack(s) ) }
end

utf.tostring = toutfstring -- in lmtx just utf.tostring = utfchar
1120
local categories = allocate()  characters.categories = categories -- lazy table

-- A miss is resolved from the character data and remembered; when there is no
-- category the key itself is stored and returned.

setmetatableindex(categories, function(t,u)
    if not u then
        return
    end
    local d = data[u]
    local c = d and d.category or u
    t[u] = c
    return c
end)
1124
1125-- todo: overloads (these register directly in the tables as number and string) e.g. for greek
1126-- todo: for string do a numeric lookup in the table itself
1127
local lccodes = allocate()  characters.lccodes = lccodes -- lazy table
local uccodes = allocate()  characters.uccodes = uccodes -- lazy table
local shcodes = allocate()  characters.shcodes = shcodes -- lazy table
local fscodes = allocate()  characters.fscodes = fscodes -- lazy table

do

    -- The four tables resolve the same way: take the given field from the
    -- character data, else fall back on the code point of a string key, else on
    -- the key itself. The result is remembered.

    local function coderesolver(field)
        return function(t,u)
            if not u then
                return
            end
            local d = data[u]
            local c = d and d[field] or (type(u) == "string" and utfbyte(u)) or u
            t[u] = c
            return c
        end
    end

    setmetatableindex(lccodes, coderesolver("lccode"))
    setmetatableindex(uccodes, coderesolver("uccode"))
    setmetatableindex(shcodes, coderesolver("shcode"))
    setmetatableindex(fscodes, coderesolver("fscode"))

end
1137
local lcchars = allocate()  characters.lcchars = lcchars -- lazy table
local ucchars = allocate()  characters.ucchars = ucchars -- lazy table
local shchars = allocate()  characters.shchars = shchars -- lazy table
local fschars = allocate()  characters.fschars = fschars -- lazy table

do

    -- String variants of the code tables: the field from the character data is
    -- turned into an utf string, else a numeric key is converted itself, else the
    -- key is returned as-is. The result is remembered.

    local function charresolver(field)
        return function(t,u)
            if not u then
                return
            end
            local d    = data[u]
            local code = d and d[field]
            local str  = code and toutfstring(code) or (type(u) == "number" and utfchar(u)) or u
            t[u] = str
            return str
        end
    end

    setmetatableindex(lcchars, charresolver("lccode"))
    setmetatableindex(ucchars, charresolver("uccode"))
    setmetatableindex(shchars, charresolver("shcode"))
    setmetatableindex(fschars, charresolver("fscode"))

end
1147
local decomposed = allocate()  characters.decomposed = decomposed   -- lazy table
local specials   = allocate()  characters.specials   = specials     -- lazy table

do

    -- Both tables hand out the field as stored in the character data, or false
    -- when there is none; either way the answer is remembered.

    local function tableorfalse(field)
        return function(t,u)
            if not u then
                return
            end
            local d = data[u]
            local s = d and d[field] or false -- decomposed could fall back to specials
            t[u] = s
            return s
        end
    end

    setmetatableindex(decomposed, tableorfalse("decomposed"))
    setmetatableindex(specials,   tableorfalse("specials"))

end
1168
local specialchars = allocate()  characters.specialchars = specialchars -- lazy table
local descriptions = allocate()  characters.descriptions = descriptions -- lazy table
local synonyms     = allocate()  characters.synonyms     = synonyms     -- lazy table

-- For a character with specials we return the letters among its components
-- (entry 1 of the specials is the kind and is skipped) as one utf string. Without
-- specials we return the character itself as utf string. In both cases the result
-- is remembered under the key that was asked for; in the past a numeric key
-- without specials was only stored under its utf string, so such a lookup was
-- resolved again on every access.

setmetatableindex(specialchars, function(t,u)
    if u then
        local c = data[u]
        local s = c and c.specials
        if s then
            local tt  = { }
            local ttn = 0
            for i=2,#s do
                local si = s[i]
                if is_letter[data[si].category] then
                    ttn = ttn + 1
                    tt[ttn] = utfchar(si)
                end
            end
            local str = concat(tt)
            t[u] = str
            return str
        else
            local str = u
            if type(u) == "number" then
                str = utfchar(u)
            end
            t[u]   = str -- the key we were asked for
            t[str] = str -- as before: the utf string maps onto itself
            return str
        end
    end
end)
1200
-- Reverse lookup: lowercased description without spaces -> unicode slot. The
-- table is filled from the whole character data on a miss (0.05 - 0.10 sec). When
-- the key is still unknown after that, it is stored as its own value while the
-- current call returns nil.

setmetatableindex(descriptions, function(t,k)
    for unicode, entry in next, data do
        local description = entry.description
        if description then
            if find(description," ",1,true) then
                description = gsub(description," ","")
            end
            t[lower(description)] = unicode
        end
    end
    local found = rawget(t,k)
    if found then
        return found
    end
    t[k] = k
    return found
end)
1219
-- Reverse lookup: synonym without spaces -> unicode slot. The table is filled
-- from the whole character data on a miss. Just like 'descriptions' an unknown
-- key is stored as its own value while the current call returns nil. (This used
-- to index the table with the nil lookup result instead of the key, which raised
-- an error on every unknown synonym.)

setmetatableindex(synonyms, function(t,k)
    local function register(name,unicode)
        if type(name) == "string" then
            if find(name," ",1,true) then
                name = gsub(name," ","")
            end
         -- name = lower(name) -- is already lowercase
            t[name] = unicode
        end
    end
    for u, c in next, data do
        local s = c.synonyms
        if type(s) == "table" then
            -- NOTE(review): presumably the data can carry a list of synonyms, in
            -- which case each one is registered; verify against char-def
            for i=1,#s do
                register(s[i],u)
            end
        elseif s then
            register(s,u)
        end
    end
    local s = rawget(t,k)
    if not s and k ~= nil then
        t[k] = k
    end
    return s
end)
1237
-- Resolves what was asked into a unicode slot: something numeric is returned as
-- number, a string is looked up as description (as given, then without spaces).
-- Anything else gives nothing.

function characters.unicodechar(asked)
    local n = tonumber(asked)
    if n then
        return n
    end
    if type(asked) ~= "string" then
        return
    end
    local found = descriptions[asked]
    if not found then
        local stripped = gsub(asked," ","")
        found = descriptions[stripped]
    end
    return found
end
1246
1247-- function characters.lower(str)
1248--     local new, n = { }, 0
1249--     for u in utfvalues(str) do
1250--         n = n + 1
1251--         new[n] = lcchars[u]
1252--     end
1253--     return concat(new)
1254-- end
1255--
1256-- function characters.upper(str)
1257--     local new, n = { }, 0
1258--     for u in utfvalues(str) do
1259--         n = n + 1
1260--         new[n] = ucchars[u]
1261--     end
1262--     return concat(new)
1263-- end
1264--
1265-- function characters.shaped(str)
1266--     local new, n = { }, 0
1267--     for u in utfvalues(str) do
1268--         n = n + 1
1269--         new[n] = shchars[u]
1270--     end
1271--     return concat(new)
1272-- end
1273
1274----- tolower = Cs((utf8byte/lcchars)^0)
1275----- toupper = Cs((utf8byte/ucchars)^0)
1276----- toshape = Cs((utf8byte/shchars)^0)
1277
-- Whole string converters: each utf character is replaced by its entry in the
-- lazy lcchars, ucchars or shchars table.

local tolower = Cs((utf8character/lcchars)^0) -- no need to check spacing
local toupper = Cs((utf8character/ucchars)^0) -- no need to check spacing
local toshape = Cs((utf8character/shchars)^0) -- no need to check spacing

-- published under these names; the hash-based utf8lower/utf8upper/utf8shape
-- patterns defined later in this file are registered under different names

lpegpatterns.tolower = tolower -- old ones ... will be overloaded
lpegpatterns.toupper = toupper -- old ones ... will be overloaded
lpegpatterns.toshape = toshape -- old ones ... will be overloaded
1285
1286-- function characters.lower (str) return lpegmatch(tolower,str) end
1287-- function characters.upper (str) return lpegmatch(toupper,str) end
1288-- function characters.shaped(str) return lpegmatch(toshape,str) end
1289
1290--     local superscripts = allocate()   characters.superscripts = superscripts
1291--     local subscripts   = allocate()   characters.subscripts   = subscripts
1292
1293--     if storage then
1294--         storage.register("characters/superscripts", superscripts, "characters.superscripts")
1295--         storage.register("characters/subscripts",   subscripts,   "characters.subscripts")
1296--     end
1297
1298-- end
1299
-- Split tables: for characters whose specials list more than one component we
-- keep the components (all entries after the kind) under 'compat' or 'char',
-- depending on the kind. The tables are only built when not yet present and are
-- registered for storage afterwards.

if not characters.splits then

    local char   = allocate()
    local compat = allocate()
    local splits = { char = char, compat = compat }

    characters.splits = splits

    -- [0x013F] = { 0x004C, 0x00B7 }
    -- [0x0140] = { 0x006C, 0x00B7 }

    for unicode, entry in next, characters.data do
        local specials = entry.specials
        if specials and #specials > 2 then
            local kind   = specials[1]
            local target = false
            if kind == "compat" then
                target = compat
            elseif kind == "char" or kind == "with" then -- 'with' was added later
                target = char
            end
            if target then
                target[unicode] = { unpack(specials,2) }
            end
        end
    end

    if storage then
        storage.register("characters/splits", splits, "characters.splits")
    end

end
1332
-- Case and shape hashes keyed by utf character: lhash maps onto lowercase, uhash
-- onto uppercase and shash onto the shape. A mapping is a single code point or a
-- pair of code points; longer lists are skipped. An entry with a lccode only
-- ends up in lhash, its uccode is then not looked at.

if not characters.lhash then

    local lhash = allocate()   characters.lhash = lhash -- nil if no conversion
    local uhash = allocate()   characters.uhash = uhash -- nil if no conversion
    local shash = allocate()   characters.shash = shash -- nil if no conversion

    -- gives the utf string for a number or a pair, nothing otherwise

    local function converted(code)
        if type(code) == "number" then
            return utfchar(code)
        elseif #code == 2 then
            return utfchar(code[1]) .. utfchar(code[2])
        end
    end

    for unicode, entry in next, characters.data do
        local lc = entry.lccode
        if lc then
            -- we have an uppercase
            local str = converted(lc)
            if str then
                lhash[utfchar(unicode)] = str
            end
        else
            local uc = entry.uccode
            if uc then
                -- we have a lowercase
                local str = converted(uc)
                if str then
                    uhash[utfchar(unicode)] = str
                end
            end
        end
        local sh = entry.shcode
        if sh then
            local str = converted(sh)
            if str then
                shash[utfchar(unicode)] = str
            end
        end
    end

    if storage then
        storage.register("characters/lhash", lhash, "characters.lhash")
        storage.register("characters/uhash", uhash, "characters.uhash")
        storage.register("characters/shash", shash, "characters.shash")
    end

end
1384
-- We fetch the hashes from the characters namespace (they are either built above
-- or were already present) and pass them to 'mark'.

local lhash = characters.lhash mark(lhash)
local uhash = characters.uhash mark(uhash)
local shash = characters.shash mark(shash)

-- one character: match any key of the hash and replace it by its value

local utf8lowercharacter = utfchartabletopattern(lhash) / lhash
local utf8uppercharacter = utfchartabletopattern(uhash) / uhash
local utf8shapecharacter = utfchartabletopattern(shash) / shash

-- whole string: convert what is in the hash and pass any other character

local utf8lower = Cs((utf8lowercharacter + utf8character)^0)
local utf8upper = Cs((utf8uppercharacter + utf8character)^0)
local utf8shape = Cs((utf8shapecharacter + utf8character)^0)

lpegpatterns.utf8lowercharacter = utf8lowercharacter -- one character
lpegpatterns.utf8uppercharacter = utf8uppercharacter -- one character
lpegpatterns.utf8shapecharacter = utf8shapecharacter -- one character

lpegpatterns.utf8lower = utf8lower -- string
lpegpatterns.utf8upper = utf8upper -- string
lpegpatterns.utf8shape = utf8shape -- string
1404
-- String converters based on the hash patterns. A missing string (or a failed
-- match) gives an empty string.

function characters.lower(str)
    if not str then
        return ""
    end
    return lpegmatch(utf8lower,str) or ""
end

function characters.upper(str)
    if not str then
        return ""
    end
    return lpegmatch(utf8upper,str) or ""
end

function characters.shaped(str)
    if not str then
        return ""
    end
    return lpegmatch(utf8shape,str) or ""
end

lpeg.setutfcasers(characters.lower,characters.upper)
1410
1411-- local str = [[
1412--     ÀÁÂÃÄÅàáâãäå àáâãäåàáâãäå ÀÁÂÃÄÅÀÁÂÃÄÅ AAAAAAaaaaaa
1413--     ÆÇæç         æçæç         ÆÇÆÇ         AECaec
1414--     ÈÉÊËèéêë     èéêëèéêë     ÈÉÊËÈÉÊË     EEEEeeee
1415--     ÌÍÎÏÞìíîïþ   ìíîïþìíîïþ   ÌÍÎÏÞÌÍÎÏÞ   IIIIÞiiiiþ
1416--     Ðð           ðð           ÐÐ           Ðð
1417--     Ññ           ññ           ÑÑ           Nn
1418--     ÒÓÔÕÖòóôõö   òóôõöòóôõö   ÒÓÔÕÖÒÓÔÕÖ   OOOOOooooo
1419--     Øø           øø           ØØ           Oo
1420--     ÙÚÛÜùúûü     ùúûüùúûü     ÙÚÛÜÙÚÛÜ     UUUUuuuu
1421--     Ýýÿ          ýýÿ          ÝÝŸ          Yyy
1422--     ß            ß            SS           ss
1423--     Ţţ           ţţ           ŢŢ           Tt
1424-- ]]
1425--
1426-- local lower  = characters.lower   print(lower(str))
1427-- local upper  = characters.upper   print(upper(str))
1428-- local shaped = characters.shaped  print(shaped(str))
1429--
1430-- local c, n = os.clock(), 10000
1431-- for i=1,n do lower(str) upper(str) shaped(str) end -- 2.08 => 0.77
1432-- print(os.clock()-c,n*#str*3)
1433
1434-- maybe: (twice as fast when much ascii)
1435--
1436-- local tolower  = lpeg.patterns.tolower
1437-- local lower    = string.lower
1438--
1439-- local allascii = R("\000\127")^1 * P(-1)
1440--
1441-- function characters.checkedlower(str)
1442--     return lpegmatch(allascii,str) and lower(str) or lpegmatch(tolower,str) or str
1443-- end
1444
-- Returns the letters of 'str' only. With 'spacing' set, a run of spacing
-- characters between letters is kept as one single space; leading and trailing
-- spacing is dropped. Anything that is not a letter or spacing is removed and
-- does not count as a separator.
--
-- The pending space flag used to be reset only when a space was actually
-- inserted, and a space was only inserted after at least two letters. So, " abc"
-- became "ab c" and "a b" became "ab". Now the flag is always reset at the next
-- letter and a space is inserted as soon as there is one letter before it.

function characters.lettered(str,spacing)
    local new, n = { }, 0
    if spacing then
        local pending = false -- spacing seen since the last letter
        for u in utfvalues(str) do
            local c = data[u].category
            if is_letter[c] then
                if pending then
                    if n > 0 then
                        n = n + 1
                        new[n] = " "
                    end
                    pending = false
                end
                n = n + 1
                new[n] = utfchar(u)
            elseif is_spacing[c] then
                pending = true
            end
        end
    else
        for u in utfvalues(str) do
            if is_letter[data[u].category] then
                n = n + 1
                new[n] = utfchar(u)
            end
        end
    end
    return concat(new)
end
1473
1474-- Requesting lower and uppercase codes:
1475
-- obsolete: plain lookup of n in the 'uccodes' table

function characters.uccode(n)
    return uccodes[n]
end
-- obsolete: plain lookup of n in the 'lccodes' table

function characters.lccode(n)
    return lccodes[n]
end
1478
-- Look up n in 'shcodes' and return two values:
--
--   entry is a table : its first and its last element
--   entry is set     : the entry itself and nil
--   no entry         : n itself and nil

function characters.shape(n)
    local code = shcodes[n]
    if type(code) == "table" then
        return code[1], code[#code]
    end
    return code or n, nil
end
1489
1490-- -- some day we might go this route, but it does not really save that much
1491-- -- so not now (we can generate a lot using mtx-unicode that operates on the
1492-- -- database)
1493--
1494-- -- category cjkwd direction linebreak
1495--
1496-- -- adobename comment contextcommand contextname description fallback lccode
1497-- -- mathclass mathfiller mathname mathspec mathstretch mathsymbol mirror
1498-- -- range shcode specials uccode uccodes unicodeslot
1499--
1500-- local data = {
1501--     ['one']={
1502--         common = {
1503--             category="cc",
1504--             direction="bn",
1505--             linebreak="cm",
1506--         },
1507--         vector = {
1508--             [0x0000] = {
1509--                 description="NULL",
1510--                 group='one',
1511--                 unicodeslot=0x0000,
1512--             },
1513--             {
1514--                 description="START OF HEADING",
1515--                 group='one',
1516--                 unicodeslot=0x0001,
1517--             },
1518--         }
1519--     }
1520-- }
1521--
1522-- local chardata, groupdata = { }, { }
1523--
1524-- for group, gdata in next, data do
1525--     local common, vector = { __index = gdata.common }, gdata.vector
1526--     for character, cdata in next, vector do
1527--         chardata[character] = cdata
1528--         setmetatable(cdata,common)
1529--     end
1530--     groupdata[group] = gdata
1531-- end
1532
1533-- characters.data, characters.groups = chardata, groupdata
1534
1535--  [0xF0000]={
1536--   category="co",
1537--   cjkwd="a",
1538--   description="<Plane 0x000F Private Use, First>",
1539--   direction="l",
1540--   unicodeslot=0xF0000,
1541--  },
1542--  [0xFFFFD]={
1543--   category="co",
1544--   cjkwd="a",
1545--   description="<Plane 0x000F Private Use, Last>",
1546--   direction="l",
1547--   unicodeslot=0xFFFFD,
1548--  },
1549--  [0x100000]={
1550--   category="co",
1551--   cjkwd="a",
1552--   description="<Plane 0x0010 Private Use, First>",
1553--   direction="l",
1554--   unicodeslot=0x100000,
1555--  },
1556--  [0x10FFFD]={
1557--   category="co",
1558--   cjkwd="a",
1559--   description="<Plane 0x0010 Private Use, Last>",
1560--   direction="l",
1561--   unicodeslot=0x10FFFD,
1562--  },
1563
if not characters.superscripts then

    -- Derive three lookup tables from the 'specials' field of the character
    -- data, keyed by the character that carries the special:
    --
    --   superscripts / subscripts : the single value following the tag
    --   fractions                 : a table with all values following the tag
    --
    -- This only runs when characters.superscripts is not set yet.

    local superscripts = allocate()
    characters.superscripts = superscripts

    local subscripts = allocate()
    characters.subscripts = subscripts

    local fractions = allocate()
    characters.fractions = fractions

    -- entries that don't have the expected number of values are only reported
    -- when tracing is enabled

    local function skipped(what,unicode,entry)
        if trace_defining then
            report_defining("ignoring %s %a, char %c, description %a",what,ustring(unicode),unicode,entry.description)
        end
    end

    -- skipping U+02120 (service mark) U+02122 (trademark)

    for unicode, entry in next, data do
        local specials = entry.specials
        local kind     = specials and specials[1]
        if kind == "super" then
            if #specials == 2 then
                superscripts[unicode] = specials[2]
            else
                skipped("superscript",unicode,entry)
            end
        elseif kind == "sub" then
            if #specials == 2 then
                subscripts[unicode] = specials[2]
            else
                skipped("subscript",unicode,entry)
            end
        elseif kind == "fraction" then
            if #specials > 1 then
                -- drop the tag, keep the rest
                fractions[unicode] = { unpack(specials,2) }
            else
                skipped("fraction",unicode,entry)
            end
        end
    end

 -- print(table.serialize(superscripts, "superscripts", { hexify = true }))
 -- print(table.serialize(subscripts,   "subscripts",   { hexify = true }))
 -- print(table.serialize(fractions,    "fractions",    { hexify = true }))

    -- hand the tables to the storage mechanism when there is one

    if storage then
        storage.register("characters/superscripts", superscripts, "characters.superscripts")
        storage.register("characters/subscripts",   subscripts,   "characters.subscripts")
        storage.register("characters/fractions",    fractions,    "characters.fractions")
    end

end
1609
-- Debugging aid: split the given utf string with 'utotable' and report every
-- element of the result together with its position.

function characters.showstring(str)
    local chars = utotable(str)
    local count = #chars
    for index=1,count do
        report_defining("split % 3i : %C",index,chars[index])
    end
end
1616
1617do
1618
1619    -- There is no need to preload this table.
1620
1621    local any       = P(1)
1622    local special   = S([['".,:;-+()]])
1623                    + P('“') + P('”')
1624    local apostrofe = P("’") + P("'")
1625
1626    local pattern = Cs ( (
1627        (P("medium light") / "medium-light" + P("medium dark")  / "medium-dark") * P(" skin tone")
1628        + (apostrofe * P("s"))/""
1629        + special/""
1630        + any
1631    )^1)
1632
1633    local function load()
1634        local name = resolvers.findfile("char-emj.lua")
1635        local data = name and name ~= "" and dofile(name) or { }
1636        local hash = { }
1637        for d, c in next, data do
1638            local k = lpegmatch(pattern,d) or d
1639            local u = { }
1640            for i=1,#c do
1641                u[i] = utfchar(c[i])
1642            end
1643            u = concat(u)
1644            hash[k] = u
1645        end
1646        return data, hash
1647    end
1648
1649    local data, hash = nil, nil
1650
1651    function characters.emojized(name)
1652        local t = lpegmatch(pattern,name)
1653        if t then
1654            return t
1655        else
1656            return { name }
1657        end
1658    end
1659
1660    local start     = P(" ")
1661    local finish    = P(-1) + P(" ")
1662    local skintone  = P("medium ")^0 * (P("light ") + P("dark "))^0 * P("skin tone")
1663    local gender    = P("woman") + P("man")
1664    local expanded  = (
1665                            P("m-l-")/"medium-light"
1666                          + P("m-d-")/"medium-dark"
1667                          + P("l-")  /"light"
1668                          + P("m-")  /"medium"
1669                          + P("d-")  /"dark"
1670                      )
1671                    * (P("s-t")/" skin tone")
1672    local compacted = (
1673                        (P("medium-")/"m-" * (P("light")/"l" + P("dark")/"d"))
1674                      + (P("medium")/"m"   +  P("light")/"l" + P("dark")/"d")
1675                      )
1676                    * (P(" skin tone")/"-s-t")
1677
1678    local pattern_0 = Cs((expanded + any)^1)
1679    local pattern_1 = Cs(((start * skintone + skintone * finish)/"" + any)^1)
1680    local pattern_2 = Cs(((start * gender   + gender   * finish)/"" + any)^1)
1681    local pattern_4 = Cs((compacted + any)^1)
1682
1683 -- print(lpegmatch(pattern_0,"kiss woman l-s-t man d-s-t"))
1684 -- print(lpegmatch(pattern_0,"something m-l-s-t"))
1685 -- print(lpegmatch(pattern_0,"something m-s-t"))
1686 -- print(lpegmatch(pattern_4,"something medium-light skin tone"))
1687 -- print(lpegmatch(pattern_4,"something medium skin tone"))
1688
1689    local skin =
1690        P("light skin tone")        / utfchar(0x1F3FB)
1691      + P("medium-light skin tone") / utfchar(0x1F3FC)
1692      + P("medium skin tone")       / utfchar(0x1F3FD)
1693      + P("medium-dark skin tone")  / utfchar(0x1F3FE)
1694      + P("dark skin tone")         / utfchar(0x1F3FF)
1695
1696    local parent =
1697        P("man")   / utfchar(0x1F468)
1698      + P("woman") / utfchar(0x1F469)
1699
1700    local child =
1701        P("baby")  / utfchar(0x1F476)
1702      + P("boy")   / utfchar(0x1F466)
1703      + P("girl")  / utfchar(0x1F467)
1704
1705    local zwj   = utfchar(0x200D)
1706    local heart = utfchar(0x2764) .. utfchar(0xFE0F) .. zwj
1707    local kiss  = utfchar(0x2764) .. utfchar(0xFE0F) .. utfchar(0x200D) .. utfchar(0x1F48B) .. zwj
1708
1709    ----- member = parent + child
1710
1711    local space = P(" ")
1712    local final = P(-1)
1713
1714    local p_done   = (space^1/zwj) + P(-1)
1715    local p_rest   = space/"" * (skin * p_done) + p_done
1716    local p_parent = parent * p_rest
1717    local p_child  = child  * p_rest
1718
1719    local p_family = Cs ( (P("family")            * space^1)/"" * p_parent^-2 * p_child^-2 )
1720    local p_couple = Cs ( (P("couple with heart") * space^1)/"" * p_parent * Cc(heart) * p_parent )
1721    local p_kiss   = Cs ( (P("kiss")              * space^1)/"" * p_parent * Cc(kiss)  * p_parent )
1722
1723    local p_special = p_family + p_couple + p_kiss
1724
1725 -- print(lpeg.match(p_special,"family man woman girl"))
1726 -- print(lpeg.match(p_special,"family man dark skin tone woman girl girl"))
1727
1728 -- local p_special = P { "all",
1729 --     all    = Cs (V("family") + V("couple") + V("kiss")),
1730 --     family = C("family")            * space^1 * V("parent")^-2 * V("child")^-2,
1731 --     couple = P("couple with heart") * space^1 * V("parent") * Cc(heart) * V("parent"),
1732 --     kiss   = P("kiss")              * space^1 * V("parent") * Cc(kiss) * V("parent"),
1733 --     parent = parent * V("rest"),
1734 --     child  = child  * V("rest"),
1735 --     rest   = (space * skin)^0/"" * ((space^1/zwj) + P(-1)),
1736 -- }
1737
1738    local emoji      = { }
1739    characters.emoji = emoji
1740
1741local cache = setmetatable({ }, { __mode = "k" } )
1742
1743    function emoji.resolve(name)
1744        if not hash then
1745            data, hash = load()
1746        end
1747        local h = hash[name]
1748        if h then
1749            return h
1750        end
1751        local h = cache[name]
1752        if h then
1753            return h
1754        elseif h == false then
1755            return
1756        end
1757        -- expand shortcuts
1758        local name = lpegmatch(pattern_0,name) or name
1759        -- expand some 25K variants
1760        local h = lpegmatch(p_special,name)
1761        if h then
1762            cache[name] = h
1763            return h
1764        end
1765        -- simplify
1766        local s = lpegmatch(pattern_1,name)
1767        local h = hash[s]
1768        if h then
1769            cache[name] = h
1770            return h
1771        end
1772        -- simplify
1773        local s = lpegmatch(pattern_2,name)
1774        local h = hash[s]
1775        if h then
1776            cache[name] = h
1777            return h
1778        end
1779        cache[name] = false
1780    end
1781
1782    function emoji.known()
1783        if not hash then
1784            data, hash = load()
1785        end
1786        return hash, data
1787    end
1788
1789    function emoji.compact(name)
1790        return lpegmatch(pattern_4,name) or name
1791    end
1792
1793end
1794
1795-- code moved to char-tex.lua
1796
1797return characters
1798
Source Browser ?