char-tex.lmt /size: 31 Kb    last modification: 2023-12-21 09:44
1if not modules then modules = { } end modules ['char-tex'] = {
2    version   = 1.001,
3    comment   = "companion to char-ini.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9local lpeg = lpeg
10local tonumber, next, type = tonumber, next, type
11local format, find, gmatch, match, gsub = string.format, string.find, string.gmatch, string.match, string.gsub
12local utfchar, utfbyte = utf.char, utf.byte
13local concat, tohash = table.concat, table.tohash
14local P, C, R, S, V, Cs, Cc = lpeg.P, lpeg.C, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs, lpeg.Cc
15
16local lpegpatterns          = lpeg.patterns
17local lpegmatch             = lpeg.match
18local utfchartabletopattern = lpeg.utfchartabletopattern
19
20local allocate              = utilities.storage.allocate
21local mark                  = utilities.storage.mark
22
23local context               = context
24local commands              = commands
25
26if not characters then require("char-ini") require("char-utf")  end
27
28local characters            = characters
29local texcharacters         = { }
30characters.tex              = texcharacters
31local utffilters            = characters.filters.utf
32
33local allocate              = utilities.storage.allocate or function() return { } end
34local mark                  = utilities.storage.mark     or allocate
35
36local is_character          = characters.is_character
37local is_letter             = characters.is_letter
38local is_command            = characters.is_command
39local is_spacing            = characters.is_spacing
40local is_mark               = characters.is_mark
41
42local data                  = characters.data  if not data then return end
43local blocks                = characters.blocks
44
-- Tracing flag for the report_defining calls further down; the tracker callback
-- has to assign the local itself (it used to set an unrelated global, so tracing
-- could never be switched on).
local trace_defining        = false  trackers.register("characters.defining", function(v) trace_defining = v end)
46
47local report_defining       = logs.reporter("characters")
48
49-- In order to deal with 8-bit output, we need to find a way to go from UTF to
50-- 8-bit. This is handled in the 32 bit engine itself. This leaves us problems with
51-- characters that are specific to TeX, like curly braces and dollars. We can remap
52-- some chars that tex input files are sensitive for to a private area (while
53-- writing to a utility file) and revert then to their original slot when we read in
54-- such a file. Instead of reverting, we can (when we resolve characters to glyphs)
55-- map them to their right glyph there. For this purpose we can use the private
56-- planes 0x0F0000 and 0x100000.
57
58local low     = allocate()
59local high    = allocate()
60local escapes = allocate()
61local special = "~#$%^&_{}\\|" -- "~#$%{}\\|"
62
63local private = {
64    low     = low,
65    high    = high,
66    escapes = escapes,
67}
68
69utffilters.private = private
70
-- Fill the three private tables for each character in 'special': the escaped
-- form, the private-plane replacement (0x0F0000 + code) and the way back.
for chr in gmatch(special,".") do
    local code = chr
    if type(code) == "number" then
        chr = utfchar(code)
    else
        code = utfbyte(chr)
    end
    if code < 256 then
        local shifted = utfchar(0x0F0000 + code)
        escapes[chr] = "\\" .. chr
        low[chr]     = shifted
        -- nasty, but we need this as in replacements (also in lpeg) % is interpreted
        high[shifted] = chr == "%" and "%%" or chr
    end
end
87
-- Both replacers are frozen (built once from the tables above), only for basic tex.
local tohigh = lpeg.replacer(low)
local tolow  = lpeg.replacer(high)

lpegpatterns.utftohigh, lpegpatterns.utftolow = tohigh, tolow

-- Runs the replacer built from 'low' over str.
utffilters.harden = function(str)
    return lpegmatch(tohigh,str)
end

-- Runs the replacer built from 'high' over str.
utffilters.soften = function(str)
    return lpegmatch(tolow,str)
end
101
102private.escape  = utf.remapper(escapes) -- maybe: ,"dynamic"
103private.replace = utf.remapper(low)     -- maybe: ,"dynamic"
104private.revert  = utf.remapper(high)    -- maybe: ,"dynamic"
105
106local accentmapping = allocate {
107    ['"'] = { [""] = "¨",
108        A = "Ä", a = "ä",
109        E = "Ë", e = "ë",
110        I = "Ï", i = "ï", ["ı"] = "ï", ["\\i"] = "ï",
111        O = "Ö", o = "ö",
112        U = "Ü", u = "ü",
113        Y = "Ÿ", y = "ÿ",
114    },
115    ["'"] = { [""] = "´",
116        A = "Á", a = "á",
117        C = "Ć", c = "ć",
118        E = "É", e = "é",
119        I = "Í", i = "í", ["ı"] = "í", ["\\i"] = "í",
120        L = "Ĺ", l = "ĺ",
121        N = "Ń", n = "ń",
122        O = "Ó", o = "ó",
123        R = "Ŕ", r = "ŕ",
124        S = "Ś", s = "ś",
125        U = "Ú", u = "ú",
126        Y = "Ý", y = "ý",
127        Z = "Ź", z = "ź",
128    },
129    ["."] = { [""] = "˙",
130        C = "Ċ", c = "ċ",
131        E = "Ė", e = "ė",
132        G = "Ġ", g = "ġ",
133        I = "İ", i = "i", ["ı"] = "i", ["\\i"] = "i",
134        Z = "Ż", z = "ż",
135    },
136    ["="] = { [""] = "¯",
137        A = "Ā", a = "ā",
138        E = "Ē", e = "ē",
139        I = "Ī", i = "ī", ["ı"] = "ī", ["\\i"] = "ī",
140        O = "Ō", o = "ō",
141        U = "Ū", u = "ū",
142    },
143    ["H"] = { [""] = "˝",
144        O = "Ő", o = "ő",
145        U = "Ű", u = "ű",
146    },
147    ["^"] = { [""] = "ˆ",
148        A = "Â", a = "â",
149        C = "Ĉ", c = "ĉ",
150        E = "Ê", e = "ê",
151        G = "Ĝ", g = "ĝ",
152        H = "Ĥ", h = "ĥ",
153        I = "Î", i = "î", ["ı"] = "î", ["\\i"] = "î",
154        J = "Ĵ", j = "ĵ",
155        O = "Ô", o = "ô",
156        S = "Ŝ", s = "ŝ",
157        U = "Û", u = "û",
158        W = "Ŵ", w = "ŵ",
159        Y = "Ŷ", y = "ŷ",
160    },
161    ["`"] = { [""] = "`",
162        A = "À", a = "à",
163        E = "È", e = "è",
164        I = "Ì", i = "ì", ["ı"] = "ì", ["\\i"] = "ì",
165        O = "Ò", o = "ò",
166        U = "Ù", u = "ù",
167        Y = "", y = "",
168    },
169    ["c"] = { [""] = "¸",
170        C = "Ç", c = "ç",
171        K = "Ķ", k = "ķ",
172        L = "Ļ", l = "ļ",
173        N = "Ņ", n = "ņ",
174        R = "Ŗ", r = "ŗ",
175        S = "Ş", s = "ş",
176        T = "Ţ", t = "ţ",
177    },
178    ["k"] = { [""] = "˛",
179        A = "Ą", a = "ą",
180        E = "Ę", e = "ę",
181        I = "Į", i = "į",
182        U = "Ų", u = "ų",
183    },
184    ["r"] = { [""] = "˚",
185        A = "Å", a = "å",
186        U = "Ů", u = "ů",
187    },
188    ["u"] = { [""] = "˘",
189        A = "Ă", a = "ă",
190        E = "Ĕ", e = "ĕ",
191        G = "Ğ", g = "ğ",
192        I = "Ĭ", i = "ĭ", ["ı"] = "ĭ", ["\\i"] = "ĭ",
193        O = "Ŏ", o = "ŏ",
194        U = "Ŭ", u = "ŭ",
195        },
196    ["v"] = { [""] = "ˇ",
197        C = "Č", c = "č",
198        D = "Ď", d = "ď",
199        E = "Ě", e = "ě",
200        L = "Ľ", l = "ľ",
201        N = "Ň", n = "ň",
202        R = "Ř", r = "ř",
203        S = "Š", s = "š",
204        T = "Ť", t = "ť",
205        Z = "Ž", z = "ž",
206        },
207    ["~"] = { [""] = "˜",
208        A = "Ã", a = "ã",
209        I = "Ĩ", i = "ĩ", ["ı"] = "ĩ", ["\\i"] = "ĩ",
210        N = "Ñ", n = "ñ",
211        O = "Õ", o = "õ",
212        U = "Ũ", u = "ũ",
213    },
214}
215
216texcharacters.accentmapping = accentmapping
217
218local accent_map = allocate { -- incomplete
219   ['~'] = "̃" , --  ̃ Ẽ
220   ['"'] = "̈" , --  ̈ Ë
221   ["`"] = "̀" , --  ̀ È
222   ["'"] = "́" , --  ́ É
223   ["^"] = "̂" , --  ̂ Ê
224    --  ̄ Ē
225    --  ̆ Ĕ
226    --  ̇ Ė
227    --  ̉ Ẻ
228    --  ̌ Ě
229    --  ̏ Ȅ
230    --  ̑ Ȇ
231    --  ̣ Ẹ
232    --  ̧ Ȩ
233    --  ̨ Ę
234    --  ̭ Ḙ
235    --  ̰ Ḛ
236}
237
238-- local accents = concat(table.keys(accentmapping)) -- was _map
239
-- Returns the accentmapping entry for accent a applied to c when there is one,
-- otherwise the tex input form: \a{c} when braced is set, "\a c" when not.
local function remap_accent(a,c,braced)
    local group    = accentmapping[a]
    local composed = group and group[c]
    if composed then
        return composed
    end
    -- the accent_map variant (return c .. accent_map[a]) is not used here
    local command = "\\" .. a
    if braced then -- or #c > 0
        return command .. "{" .. c .. "}"
    end
    return command .. " " .. c
end
258
-- Maps tex command names (without the backslash) to their utf replacement.
local commandmapping = allocate {
    ["aa"] = "å", ["AA"] = "Å", -- was an empty string, which made \AA disappear
    ["ae"] = "æ", ["AE"] = "Æ",
    ["cc"] = "ç", ["CC"] = "Ç",
    ["i"]  = "ı", ["j"]  = "ȷ",
    ["ij"] = "ij", ["IJ"] = "IJ",
    ["l"]  = "ł", ["L"]  = "Ł",
    ["o"]  = "ø", ["O"]  = "Ø",
    ["oe"] = "œ", ["OE"] = "Œ",
    ["sz"] = "ß", ["SZ"] = "SZ", ["ss"] = "ß", ["SS"] = "ß", -- uppercase: ẞ
}
270
271texcharacters.commandmapping = commandmapping
272
273-- local ligaturemapping = allocate {
274--     ["''"]  = "”",
275--     ["``"]  = "“",
276--     ["--"]  = "–",
277--     ["---"] = "—",
278-- }
279
280-- Older accent handling code can be found in char-def.lua but in the meantime
281-- we moved on. First the one with commands:
282
283local untex, pattern
284
-- Builds (once) and returns the pattern that replaces backslashed accent and
-- command input by the mapped utf strings. For each accent/base pair four input
-- forms are registered; accents that are letters need a space before the base
-- in the unbraced forms.
local function toutfpattern()
    if untex then
        return untex
    end
    local hash = { }
    for accent, group in next, accentmapping do
        local isletter = (accent >= "a" and accent <= "z") or (accent >= "A" and accent <= "Z")
        local plain    = "\\" .. accent .. (isletter and " " or "")
        local braced   = "\\" .. accent .. "{"
        for base, result in next, group do
            hash[       plain  .. base        ] = result
            hash["{" .. plain  .. base .. "}" ] = result
            hash[       braced .. base .. "}" ] = result
            hash["{" .. braced .. base .. "}}"] = result
        end
    end
    for command, result in next, commandmapping do
        local cs = "\\" .. command
        hash[       cs .. " " ] = result
        hash["{" .. cs .. "}" ] = result
        hash["{" .. cs .. " }"] = result
    end
 -- for k, v in next, ligaturemapping do
 --     hash[k] = v
 -- end
    untex = utfchartabletopattern(hash) / hash
    return untex
end
313
-- Creates the substitution pattern on first use and remembers it in 'pattern'.
local function prepare()
    local converter = Cs((toutfpattern() + P(1))^0)
    pattern = converter
    return converter
end

-- Converts backslashed tex input in str; strings without a backslash (the
-- empty string included) are returned as they are. The strip argument is
-- not used.
local function textoutf(str,strip)
    if str == "" or not find(str,"\\",1,true) then
        return str
    end
    return lpegmatch(pattern or prepare(),str)
end
329
330texcharacters.toutfpattern = toutfpattern
331texcharacters.toutf        = textoutf
332
333-- print(texcharacters.toutf([[\~{Z}]],true))
334-- print(texcharacters.toutf([[\'\i]],true))
335-- print(texcharacters.toutf([[\'{\i}]],true))
336-- print(texcharacters.toutf([[\"{e}]],true))
337-- print(texcharacters.toutf([[\" {e}]],true))
338-- print(texcharacters.toutf([[{\"{e}}]],true))
339-- print(texcharacters.toutf([[{\" {e}}]],true))
340-- print(texcharacters.toutf([[{\l}]],true))
341-- print(texcharacters.toutf([[{\l }]],true))
342-- print(texcharacters.toutf([[\v{r}]],true))
343-- print(texcharacters.toutf([[fo{\"o}{\ss}ar]],true))
344-- print(texcharacters.toutf([[H{\'a}n Th\^e\llap{\raise 0.5ex\hbox{\'{\relax}}} Th{\'a}nh]],true))
345
346-- Next the ones without backslash
347
348local untex, pattern
349
-- Same idea as the backslashed variant, but the keys are the bare accent .. base
-- combinations and the bare command names.
local function toutfpattern()
    if untex then
        return untex
    end
    local hash = { }
    for accent, group in next, accentmapping do
        for base, result in next, group do
            hash[accent .. base] = result
        end
    end
    for command, result in next, commandmapping do
        hash[command] = result
    end
 -- for k, v in next, ligaturemapping do
 --     hash[k] = v
 -- end
    untex = utfchartabletopattern(hash) / hash
    return untex
end

-- Creates the substitution pattern on first use and remembers it in 'pattern'.
local function prepare()
    local converter = Cs((toutfpattern() + P(1))^0)
    pattern = converter
    return converter
end

-- Applies the (lazily built) pattern to str.
local function textoutf(str)
    local converter = pattern or prepare()
    return lpegmatch(converter,str)
end
377
378texcharacters.strtoutfpattern = toutfpattern
379texcharacters.strtextoutf     = textoutf
380
381local collapse = utffilters.collapse
382local combine  = utffilters.combine
383
384if not interfaces then return end
385
386local implement = interfaces.implement
387
388local pattern1, pattern2
389
390local verbosemarks = characters.verbosemarks
391
-- The verbose mark names map onto the utf string of the given slot. When the
-- table is already present in 'characters' it is only marked, otherwise it is
-- created here and (when 'storage' exists) registered.

if verbosemarks then

    mark(verbosemarks)

else

    verbosemarks = allocate {
        ["stroke"]               = utfchar(0x02F), ["slash"]        = utfchar(0x02F),
        ["middle dot"]           = utfchar(0x0B7),

        ["grave"]                = utfchar(0x300),
        ["acute"]                = utfchar(0x301),
        ["circumflex"]           = utfchar(0x302), -- was listed twice
        ["tilde"]                = utfchar(0x303),
        ["macron"]               = utfchar(0x304), ["line"]         = utfchar(0x304),
        ["overline"]             = utfchar(0x305),
        ["breve"]                = utfchar(0x306),
        ["dot"]                  = utfchar(0x307),
        ["dieresis"]             = utfchar(0x308), ["diaeresis"]    = utfchar(0x308),
        ["hook"]                 = utfchar(0x309),
        ["ring"]                 = utfchar(0x30A),
        ["double acute"]         = utfchar(0x30B), ["hungarumlaut"] = utfchar(0x30B), -- tex speak
        ["caron"]                = utfchar(0x30C),
        ["vertical line"]        = utfchar(0x30D),
        ["double vertical line"] = utfchar(0x30E),
        ["double grave"]         = utfchar(0x30F),
        ["inverted breve"]       = utfchar(0x311),
        ["dot below"]            = utfchar(0x323),
        ["ring below"]           = utfchar(0x325),
        -- NOTE(review): unicode has a separate combining comma below at 0x326;
        -- confirm that sharing the cedilla slot is intended
        ["cedilla"]              = utfchar(0x327), ["comma below"]  = utfchar(0x327),
        ["ogonek"]               = utfchar(0x328),
        ["caron below"]          = utfchar(0x32C),
        ["circumflex below"]     = utfchar(0x32D),
        ["tilde below"]          = utfchar(0x330),
        ["macron below"]         = utfchar(0x331), ["line below"]   = utfchar(0x331),

        -- NOTE(review): 0x1FA9D does not look like a combining mark slot; verify
        ["hook below"]           = utfchar(0x1FA9D),

    }

    characters.verbosemarks = verbosemarks

    if storage then
        storage.register("characters/verbosemarks", verbosemarks, "characters.verbosemarks")
    end

end
440
-- First pass over a \chr argument: a backslashed command (plus one trailing
-- space) becomes its commandmapping entry, verbose mark names become their
-- mark, other spaces are dropped and any other utf character is kept.
local function prepare1()
    local command = P("\\")/"" * (utfchartabletopattern(commandmapping) / commandmapping) * (P(" ")/"")
    local verbose = utfchartabletopattern(verbosemarks) / verbosemarks
    local nospace = lpegpatterns.space/""
    local any     = lpegpatterns.utf8character
    pattern1 = Cs((command + verbose + nospace + any)^0)
    return pattern1
end
452
-- Second pass: the dotless i and j are turned back into their plain forms,
-- any other utf character is kept.
local function prepare2()
    local dotted = { ["ı"] = "i", ["ȷ"] = "j" }
    local single = utfchartabletopattern(dotted) / dotted + lpegpatterns.utf8character
    pattern2 = Cs(single^0)
    return pattern2
end
466
-- Cache used by \chr: the index function normalizes the requested key with
-- pattern1 and pattern2 and then tries, in this order, collapse, combine,
-- commandmapping and textoutf; the first one that changes the string wins. The
-- result is reported and stored.
--
-- The result is stored under the key as requested as well as under the
-- normalized key. Storing only the normalized key (as before) meant that the
-- same input triggered the whole conversion, and the report, on every use.
local hash = table.setmetatableindex(function(t,k)
    local asked = k -- the key that was actually looked up
    local f     = k -- the input form shown in the report
    k = lpegmatch(pattern1 or prepare1(),k) or k
    k = lpegmatch(pattern2 or prepare2(),k) or k
    local v = collapse(k) or k -- char specials
    if k ~= v then
        goto DONE
    end
    v = combine(k) or k -- with specials
    if k ~= v then
        goto DONE
    end
    v = commandmapping[k] or k
    if k ~= v then
        f = "\\" .. f
        goto DONE
    end
    v = textoutf(k) or k
    if k ~= v then
        f = "\\" .. f
        goto DONE
    end
  ::DONE::
    report_defining("instead of old school '%s' you can input the utf sequence %s",f,v)
    t[k]     = v
    t[asked] = v
    return v
end)
494
-- \chr: looks the string argument up in the hash above and pipes the result
-- back to tex.
implement {
    name      = "chr",
    public    = true,
 -- arguments = "argument", -- not here
    arguments = "string",
    actions   = function(str)
        context(hash[str]) -- expandable
    end
}
505
-- Returns the backslashed context name of slot n when the character data has
-- one, else the utf character itself.
function texcharacters.safechar(n) -- was characters.safechar
    local entry = data[n]
    local name  = entry and entry.contextname
    if name then
        return "\\" .. name
    end
    return utfchar(n)
end
514
515if not context or not commands then
516    -- used in e.g. mtx-bibtex
517    return
518end
519
520-- all kind of initializations
521
522local tex           = tex
523local texsetlccode  = tex.setlccode
524local texsetsfcode  = tex.setsfcode
525local texsetcatcode = tex.setcatcode
526
527local contextsprint = context.sprint
528local ctxcatcodes   = catcodes.numbers.ctxcatcodes
529
530local texsetmacro   = tokens.setters.macro
531local texsetchar    = tokens.setters.char
532
533-- function texcharacters.defineaccents()
534--     local ctx_dodefineaccentcommand  = context.dodefineaccent
535--     local ctx_dodefineaccent         = context.dodefineaccent
536--     local ctx_dodefinecommand        = context.dodefinecommand
537--     for accent, group in next, accentmapping do
538--         ctx_dodefineaccentcommand(accent)
539--         for character, mapping in next, group do
540--             ctx_dodefineaccent(accent,character,mapping)
541--         end
542--     end
543--     for command, mapping in next, commandmapping do
544--         ctx_dodefinecommand(command,mapping)
545--     end
546--     os.exit()
547-- end
548
-- Passes every verbose mark (its name with the spaces removed) to
-- context.dodefinecombine and every commandmapping entry to
-- context.dodefinecommand.
function texcharacters.defineaccents()
    local definecombine = context.dodefinecombine
    local definecommand = context.dodefinecommand
    for name, combining in next, verbosemarks do
        local csname = gsub(name," ","")
        definecombine(csname,combining)
    end
    for command, mapping in next, commandmapping do
        definecommand(command,mapping)
    end
end
559
560implement { -- a waste of scanner but consistent
561    name    = "defineaccents",
562    actions = texcharacters.defineaccents
563}
564
565-- Instead of using a TeX file to define the named glyphs, we use the table. After
566-- all, we have this information available anyway.
567
-- Converts s to a number: anything tonumber accepts is returned as such, a
-- string that starts with a double quote is read as hexadecimal (tex style),
-- and everything else gives 0.
--
-- The hexadecimal part is only converted when the quote was really there:
-- tonumber(nil,16) raises an error, so the 0 fallback was never reached for
-- input that is neither a number nor quoted.
local function to_number(s)
    local n = tonumber(s)
    if n then
        return n
    end
    local hex = type(s) == "string" and match(s,'^"(.*)$')
    return hex and tonumber(hex,16) or 0
end
575
-- The string argument goes through to_number first; the result is converted
-- and printed with context.sprint.
implement {
    name      = "utfchar",
    arguments = "string",
    actions   = { to_number, utfchar, contextsprint },
}

implement {
    name      = "safechar",
    arguments = "string",
    actions   = { to_number, texcharacters.safechar, contextsprint },
}

-- Two integers, combined as 256 * first + second.
implement {
    name      = "uchar",
    arguments = "2 integers",
    actions   = function(hi,lo)
        context(utfchar(256*hi + lo))
    end,
}
595
596tex.uprint = commands.utfchar
597
-- in context we don't use lc and uc codes (in fact in luatex we should have a hf code)
599-- so at some point we might drop this
600
601-- The following get set at the TeX end:
602
603local forbidden = tohash {
604    0x000A0, -- zs nobreakspace            <self>
605    0x000AD, -- cf softhyphen              <self>
606 -- 0x00600, -- cf arabicnumber            <self>
607 -- 0x00601, -- cf arabicsanah             <self>
608 -- 0x00602, -- cf arabicfootnotemarker    <self>
609 -- 0x00603, -- cf arabicsafha             <self>
610 -- 0x00604, -- cf arabicsamvat            <self>
611 -- 0x00605, -- cf arabicnumberabove       <self>
612 -- 0x0061C, -- cf arabiclettermark        <self>
613 -- 0x006DD, -- cf arabicendofayah         <self>
614 -- 0x008E2, -- cf arabicdisputedendofayah <self>
615    0x02000, -- zs enquad                  <self>
616    0x02001, -- zs emquad                  <self>
617    0x02002, -- zs enspace                 \kern .5\emwidth
618    0x02003, -- zs emspace                 \hskip \emwidth
619    0x02004, -- zs threeperemspace         <self>
620    0x02005, -- zs fourperemspace          <self>
621    0x02006, -- zs sixperemspace           <self>
622    0x02007, -- zs figurespace             <self>
623    0x02008, -- zs punctuationspace        <self>
624    0x02009, -- zs breakablethinspace      <self>
625    0x0200A, -- zs hairspace               <self>
626    0x0200B, -- cf zerowidthspace          <self>
627    0x0200C, -- cf zwnj                    <self>
628    0x0200D, -- cf zwj                     <self>
629    0x0202F, -- zs narrownobreakspace      <self>
630    0x0205F, -- zs medspace                \textormathspace +\medmuskip 2
631 -- 0x03000, -- zs ideographicspace        <self>
632 -- 0x0FEFF, -- cf zerowidthnobreakspace   \penalty \plustenthousand \kern \zeropoint
633}
634
local csletters  = characters.csletters -- also a signal that we have initialized
local activated  = { }
-- These two used to be declared as sfmode and block_too while the code below
-- reads and writes sfstate and blocks_too, so the state lived in globals and the
-- "characters.blockstoo" directive had no effect at all.
local sfstate    = "unset" -- unset, traditional, normal
local blocks_too = false

directives.register("characters.blockstoo",function(v) blocks_too = v end)

-- If this is something that is not documentwide and used a lot, then we
-- need a more clever approach (trivial but not now).

-- Remembers the requested mode v in sfstate. When a mode is already in effect
-- (sfstate is no longer "unset") the space factor n is assigned to every
-- character in data that has category "lu". Before that point nothing is set
-- here: the initialization below picks up the recorded mode.
local function setuppersfcodes(v,n)
    if sfstate ~= "unset" then
        report_defining("setting uppercase sf codes to %a",n)
        for u, chr in next, data do
            if chr.category == "lu" then
                texsetsfcode(u,n)
            end
        end
    end
    sfstate = v
end

directives.register("characters.spaceafteruppercase",function(v)
    if v == "traditional" then
        setuppersfcodes(v,999)
    elseif v == "normal" then
        setuppersfcodes(v,1000)
    end
end)

if not csletters then

    csletters            = allocate()
    characters.csletters = csletters

    report_defining("setting up character related codes and commands")

    if sfstate == "unset" then
        sfstate = "traditional"
    end

    local traditional = sfstate == "traditional"

    -- Sets the lc/uc codes of letter u. Missing codes default to the slot itself
    -- (and are stored in chr), table valued codes are replaced by the slot for
    -- this call only. In traditional mode characters of category "lu" also get
    -- space factor 999; this passes u, where the code used to pass an undefined
    -- variable.
    local function setlettercodes(u,chr,category)
        local lc = chr.lccode
        local uc = chr.uccode
        if not lc then
            chr.lccode = u
            lc = u
        elseif type(lc) == "table" then
            lc = u
        end
        if not uc then
            chr.uccode = u
            uc = u
        elseif type(uc) == "table" then
            uc = u
        end
        texsetlccode(u,lc,uc)
        if traditional and category == "lu" then
            texsetsfcode(u,999)
        end
    end

    for u, chr in next, data do -- will move up
        local contextname = chr.contextname
        local category    = chr.category
        local isletter    = is_letter[category]
        if contextname then
            if is_character[category] then
                if chr.unicodeslot < 128 then
                    if isletter then
                        csletters[utfchar(u)] = u
                    end
                elseif isletter and u >= 32 and u <= 65536 then
                    csletters[utfchar(u)] = u
                end
                if isletter then
                    setlettercodes(u,chr,category)
                end
            elseif is_command[category] and not forbidden[u] then
                -- skip
            elseif is_mark[category] then
                texsetlccode(u,u,u) -- for hyphenation
            end
        elseif isletter then
            csletters[utfchar(u)] = u
            setlettercodes(u,chr,category)
        elseif is_mark[category] then
            texsetlccode(u,u,u) -- for hyphenation
        end
    end

    if blocks_too then
        -- this slows down format generation by over 10 percent
        for k, v in next, blocks do
            if v.catcode == "letter" then
                local first = v.first
                local last  = v.last
                local gaps  = v.gaps
                if first and last then
                    for u=first,last do
                        csletters[utfchar(u)] = u
                        --
                     -- texsetlccode(u,u,u) -- self self
                        --
                    end
                end
                if gaps then
                    for i=1,#gaps do
                        local u = gaps[i]
                        csletters[utfchar(u)] = u
                        --
                     -- texsetlccode(u,u,u) -- self self
                        --
                    end
                end
            end
        end
    end

    if storage then
        storage.register("characters/csletters", csletters, "characters.csletters")
    end

 -- These can go now:
 --
 -- .acute .apostrophe .bar .breve .breveacute .brevedotbelow .brevegrave .brevehook
 -- .brevetilde .caron .cedilla .circumflex .circumflexacute .circumflexdotbelow
 -- .circumflexgrave .circumflexhook .circumflextilde .commaaccent .curl .diaeresis
 -- .diaeresisacute .diaeresiscaron .diaeresisgrave .diaeresismacron .dotaccent
 -- .dotaccentmacron .dotbelow .dotmiddle .doublegrave .grave .horn .hornacute
 -- .horndotbelow .horngrave .hornhook .horntilde .hungarumlaut .invertedbreve
 -- .macron .ogonek .ogonekmacron .ring .ringacute .sharp .stroke .strokeacute .tail
 -- .tilde .tildemacron

    local pattern = P("greek") + P("arabic") + P("hebrew") + P("cyrillic") + P("roman")

 -- Only keep these:
 --
 -- copyright dotlessI dotlessi dotlessJ dotlessj emdash endash exclamdown figuredash
 -- guilsingleleft guilsingleright periodcentered perthousand questiondown quotedbl
 -- quotedblbase quotedblleft quotedblright quoteleft quoteright quotesingle
 -- quotesinglebase registered rightguillemot sectionmark textacute textampersand
 -- textAngstrom textasciicircum textasciitilde textat textbackslash textbar
 -- textbottomcomma textbottomdot textbraceleft textbraceright textbreve
 -- textbrokenbar textbullet textcaron textcedilla textcelsius textcent textcircledP
 -- textcircumflex textcomma textcontrolspace textcurrency textdag textddag
 -- textdegree textdiaeresis textdiv textdollar textdong textdotaccent textellipsis
 -- texteuro textfraction textgrave texthash texthorizontalbar texthungarumlaut
 -- texthyphen textkelvin textlognot textmacron textmho textminus textmp textmu
 -- textmultiply textnumero textogonek textohm textounce textpercent textperiod
 -- textpm textring textslash textsterling texttilde textunderscore textyen

    -- Defines contextname as an immutable macro (or character for non letters
    -- below slot 128) unless the name starts with one of the script prefixes
    -- matched by 'pattern'.
    local function setname(category,chr,u,contextname)
        if lpegmatch(pattern,contextname) then
         -- print("ignored",contextname)
        elseif is_character[category] then
            if chr.unicodeslot < 128 then
                if is_letter[category] then
                    texsetmacro(contextname,utfchar(u),"immutable")
                else
                    texsetchar(contextname,u,"immutable")
                end
            else
                texsetmacro(contextname,utfchar(u),"immutable")
            end
        elseif is_command[category] and not forbidden[u] then
            texsetmacro(contextname,utfchar(u),"immutable")
        end
    end

    -- Defines the context name and the extra names from contextspec of every
    -- character in data; the ctt argument is not used.
    function characters.setcharacternames(ctt)
        for u, chr in next, data do -- will move up
            local contextname = chr.contextname
            local contextspec = chr.contextspec
            local category    = chr.category
            if contextname then
                setname(category,chr,u,contextname)
            end
            if contextspec then
                for i=1,#contextspec do
                    local extraname = contextspec[i]
                    if extraname ~= contextname then
                        setname(category,chr,u,extraname)
                    end
                end
            end
        end
    end

else
    mark(csletters)
end
845
846lpegpatterns.csletter = utfchartabletopattern(csletters)
847
848-- The engine presets the letters to 11 (always).
849
-- Gives the two joiners and every slot in csletters catcode 11 in catcode
-- table cct; the current catcode table is restored afterwards.
function characters.setlettercatcodes(cct)
    if trace_defining then
        report_defining("assigning letter catcodes to catcode table %a",cct)
    end
    local letter   = 11
    local previous = tex.catcodetable
    tex.catcodetable = cct
    texsetcatcode(0x200C,letter) -- non-joiner
    texsetcatcode(0x200D,letter) -- joiner
    for _, slot in next, csletters do
        texsetcatcode(slot,letter)
    end
    tex.catcodetable = previous
end
863
-- Gives the slots 65-90 and 97-122 catcode 12 in catcode table cct; the
-- current catcode table is restored afterwards.
function characters.setothercatcodes(cct)
    if trace_defining then
        report_defining("assigning other catcodes to catcode table %a",cct)
    end
    local other    = 12
    local ranges   = { { 65, 90 }, { 97, 122 } }
    local previous = tex.catcodetable
    tex.catcodetable = cct
    for i=1,#ranges do
        local range = ranges[i]
        for slot=range[1],range[2] do
            texsetcatcode(slot,other)
        end
    end
    tex.catcodetable = previous
end
878
-- Gives every slot listed in 'activated' catcode 13 in catcode table cct and
-- reports each one when tracing is on; the current catcode table is restored
-- afterwards.
function characters.setactivecatcodes(cct)
    local active   = 13
    local tracing  = trace_defining
    local previous = tex.catcodetable
    tex.catcodetable = cct
    for i=1,#activated do
        local slot = activated[i]
        texsetcatcode(slot,active)
        if tracing then
            report_defining("character %U (%s) is active in set %a",slot,data[slot].description,cct)
        end
    end
    tex.catcodetable = previous
end
891
-- Pipes the description of the given slot to tex; unknown slots give nothing.
implement {
    name      = "chardescription",
    arguments = "integer",
    actions   = function(slot)
        local entry = data[slot]
        if not entry then
            return
        end
        context(entry.description)
    end,
}
902
if characters.setcharacternames then -- only in ini mode

    -- all four are private and take the catcode table number as integer
    local names = { "setlettercatcodes", "setothercatcodes", "setactivecatcodes", "setcharacternames" }

    for i=1,#names do
        local name = names[i]
        implement {
            name      = name,
            scope     = "private",
            actions   = characters[name],
            arguments = "integer",
        }
    end

end
911
912-- experiment (some can move to char-ini.lua)
913
-- Overloads a case code of character c. The string u is split with
-- settings_to_array; a single number is stored as such, several numbers as a
-- list (entries that are not numbers are dropped). The value goes into
-- data[c][code] and the matching entry in characters[codes] is reset.
--
-- Nothing happens when c is not a number, when data has no entry for c (this
-- used to raise an error on indexing nil) or when u has no usable number (a
-- list of only invalid entries used to store an empty table).
local function overload(c,u,code,codes)
    local slot = tonumber(c)
    if not slot then
        return
    end
    local chr = data[slot]
    if not chr then
        return
    end
    local list = utilities.parsers.settings_to_array(u)
    local n    = #list
    if n == 0 then
        return
    end
    local t = nil
    if n == 1 then
        t = tonumber(list[1])
    else
        t = { }
        for i=1,n do
            local v = tonumber(list[i])
            if v then
                t[#t+1] = v
            end
        end
        if #t == 0 then
            t = nil
        end
    end
    if t then
        chr[code] = t
        characters[codes][slot] = nil
    end
end
938
939implement {
940    name      = "overloaduppercase",
941    arguments = "2 strings",
942    actions   = function(c,u)
943        overload(c,u,"uccode","uccodes")
944    end
945}
946
947implement {
948    name      = "overloadlowercase",
949    arguments = "2 strings",
950    actions   = function(c,u)
951        overload(c,u,"lccode","lccodes")
952    end
953}
954
955-- Just for fun we support keywords:
956--
957-- \startTEXpage[offset=10pt]
958--     abg"
959--     \sl \showboxes
960--     \accent               `" h%
961--     \accent               `" x%
962--     \accent yoffset  .2ex `" x
963--     \accent yoffset 1.1ex `x x%
964-- \stopTEXpage
965--
966-- We could do this:
967--
968-- \startTEXpage[offset=10pt]
969--     abg"
970--     \sl \showboxes
971--     \withaccent               `" h%
972--     \withaccent               `" x%
973--     \withaccent yoffset  .2ex `" x
974--     \withaccent yoffset 1.1ex accent `x base `x%
975-- \stopTEXpage
976--
977-- But only when users demand it:
978--
979-- do
980--
981--     local new_glyph = nodes.pool.glyph
982--
983--     local scankeyword   = tokens.scanners.keyword
984--     local scaninteger   = tokens.scanners.integer
985--     local scandimension = tokens.scanners.dimension
986--     local scantoken     = tokens.scanners.token
987--
988--     implement {
989--         name      = "withaccent",
990--         public    = true,
991--         protected = true,
992--         actions   = function()
993--             local xoffset = 0
994--             local yoffset = 0
995--             local accent  = false
996--             local base    = false
997--             local zwj     = 0x200D
998--             while true do
999--                 if scankeyword("xoffset") then
1000--                     xoffset = scandimension()
1001--                 elseif scankeyword("yoffset") then
1002--                     yoffset = scandimension()
1003--                 elseif scankeyword("accent") then
1004--                     accent = scaninteger()
1005--                 elseif scankeyword("base") then
1006--                     base = scaninteger()
1007--                 else
1008--                     break
1009--                 end
1010--             end
1011--             if not accent then
1012--                 accent = scaninteger()
1013--             end
1014--             if not base then
1015--                 local nxttok = scantoken()
1016--                 base = nxttok.cmdname == "char_number" and scaninteger() or nxttok.index
1017--             end
1018--             if base and accent and base > 0 and accent > 0 then
1019--                 base   = new_glyph(true,base)
1020--                 zwj    = new_glyph(true,zwj)
1021--                 accent = new_glyph(true,accent)
1022--                 local slant   = fonts.hashes.parameters[true].slant / 65536 -- a la tex
1023--                 local xheight = fonts.hashes.parameters[true].xheight -- hm, compensated for glyphscale?
1024--                 accent.xoffset = xoffset - .5*(base.width -accent.width) + .5*(base.height-accent.height) * slant
1025--                 accent.yoffset = yoffset - (xheight - accent.height)
1026--                 accent.left    = accent.width
1027--                 accent.options = accent.options | 0x40 | 0x80
1028--                 context.dontleavehmode()
1029--                 context(base)
1030--                 context(zwj)
1031--                 context(accent)
1032--             end
1033--         end,
1034--     }
1035--
1036-- end
1037