if not modules then modules = { } end modules ['char-tex'] = {
    version   = 1.001,
    comment   = "companion to char-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}

local lpeg = lpeg
local tonumber, next, type = tonumber, next, type
local format, find, gmatch, match, gsub = string.format, string.find, string.gmatch, string.match, string.gsub
local utfchar, utfbyte = utf.char, utf.byte
local concat, tohash = table.concat, table.tohash
local P, C, R, S, V, Cs, Cc = lpeg.P, lpeg.C, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs, lpeg.Cc

local lpegpatterns          = lpeg.patterns
local lpegmatch             = lpeg.match
local utfchartabletopattern = lpeg.utfchartabletopattern

local allocate              = utilities.storage.allocate
local mark                  = utilities.storage.mark

local context               = context
local commands              = commands

local characters            = characters
local texcharacters         = { }
characters.tex              = texcharacters
local utffilters            = characters.filters.utf

local allocate              = utilities.storage.allocate or function() return { } end
local mark                  = utilities.storage.mark     or allocate

local is_character          = characters.is_character
local is_letter             = characters.is_letter
local is_command            = characters.is_command
local is_spacing            = characters.is_spacing
local is_mark               = characters.is_mark
local is_punctuation        = characters.is_punctuation

local data                  = characters.data  if not data then return end
local blocks                = characters.blocks

local trace_defining        = false  trackers.register("characters.defining", function(v) trace_defining = v end)

local report_defining       = logs.reporter("characters")

--[[ldx--
<p>In order to deal with 8-bit output, we need to find a way to go from <l n='utf'/> to
8-bit. This is handled in the <l n='luatex'/> engine itself.</p>

<p>That leaves us with the characters that are specific to <l n='tex'/>, like
<type>{}</type>, <type>$</type> and the like. We can remap the characters that tex input
files are sensitive to into a private area (while writing to a utility file) and revert
them to their original slot when we read such a file back in. Instead of reverting, we
can (when we resolve characters to glyphs) map them onto the right glyph there. For this
purpose we can use the private planes 0x0F0000 and 0x100000.</p>
--ldx]]--

local low     = allocate()
local high    = allocate()
local escapes = allocate()
local special = "~#$%^&_{}\\|" -- "~#$%{}\\|"

local private = {
    low     = low,
    high    = high,
    escapes = escapes,
}

utffilters.private = private

for ch in gmatch(special,".") do
    local cb
    if type(ch) == "number" then
        cb, ch = ch, utfchar(ch)
    else
        cb = utfbyte(ch)
    end
    if cb < 256 then
        escapes[ch] = "\\" .. ch
        low[ch] = utfchar(0x0F0000 + cb)
        if ch == "%" then
            ch = "%%" -- nasty, but we need this as in replacements (also in lpeg) % is interpreted
        end
        high[utfchar(0x0F0000 + cb)] = ch
    end
end

local tohigh = lpeg.replacer(low)   -- frozen, only for basic tex
local tolow  = lpeg.replacer(high)  -- frozen, only for basic tex

lpegpatterns.utftohigh = tohigh
lpegpatterns.utftolow  = tolow

function utffilters.harden(str)
    return lpegmatch(tohigh,str)
end

function utffilters.soften(str)
    return lpegmatch(tolow,str)
end

private.escape  = utf.remapper(escapes) -- maybe: ,"dynamic"
private.replace = utf.remapper(low)     -- maybe: ,"dynamic"
private.revert  = utf.remapper(high)    -- maybe: ,"dynamic"
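
-- A usage sketch (kept commented, like the examples elsewhere in this file):
-- hardening moves the tex sensitive characters into the 0x0F0000 private plane
-- and softening maps them back, so a roundtrip is expected to be the identity.
--
-- local hard = utffilters.harden("10% of $x$") -- "%" and "$" end up in the private plane
-- local soft = utffilters.soften(hard)         -- back to "10% of $x$"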

--[[ldx--
<p>We get a more efficient variant of this when we integrate replacements in the
collapser. This more or less renders the previous private code redundant. The
following lines are equivalent, but the first snippet uses the relocated
dollars.</p>

<typing>
[󰀤x󰀤] [$x$]
</typing>
--ldx]]--

-- Using the tree-lpeg-mapper would be nice but we also need to deal with end-of-string
-- cases like "\"\i", and we don't want "\relax" to be seen as "\r" followed by "elax"
-- (for which we would need to mess with spaces).

local accentmapping = allocate {
    ['"'] = { [""] = "¨",
        A = "Ä", a = "ä",
        E = "Ë", e = "ë",
        I = "Ï", i = "ï", ["ı"] = "ï", ["\\i"] = "ï",
        O = "Ö", o = "ö",
        U = "Ü", u = "ü",
        Y = "Ÿ", y = "ÿ",
    },
    ["'"] = { [""] = "´",
        A = "Á", a = "á",
        C = "Ć", c = "ć",
        E = "É", e = "é",
        I = "Í", i = "í", ["ı"] = "í", ["\\i"] = "í",
        L = "Ĺ", l = "ĺ",
        N = "Ń", n = "ń",
        O = "Ó", o = "ó",
        R = "Ŕ", r = "ŕ",
        S = "Ś", s = "ś",
        U = "Ú", u = "ú",
        Y = "Ý", y = "ý",
        Z = "Ź", z = "ź",
    },
    ["."] = { [""] = "˙",
        C = "Ċ", c = "ċ",
        E = "Ė", e = "ė",
        G = "Ġ", g = "ġ",
        I = "İ", i = "i", ["ı"] = "i", ["\\i"] = "i",
        Z = "Ż", z = "ż",
    },
    ["="] = { [""] = "¯",
        A = "Ā", a = "ā",
        E = "Ē", e = "ē",
        I = "Ī", i = "ī", ["ı"] = "ī", ["\\i"] = "ī",
        O = "Ō", o = "ō",
        U = "Ū", u = "ū",
    },
    ["H"] = { [""] = "˝",
        O = "Ő", o = "ő",
        U = "Ű", u = "ű",
    },
    ["^"] = { [""] = "ˆ",
        A = "Â", a = "â",
        C = "Ĉ", c = "ĉ",
        E = "Ê", e = "ê",
        G = "Ĝ", g = "ĝ",
        H = "Ĥ", h = "ĥ",
        I = "Î", i = "î", ["ı"] = "î", ["\\i"] = "î",
        J = "Ĵ", j = "ĵ",
        O = "Ô", o = "ô",
        S = "Ŝ", s = "ŝ",
        U = "Û", u = "û",
        W = "Ŵ", w = "ŵ",
        Y = "Ŷ", y = "ŷ",
    },
    ["`"] = { [""] = "`",
        A = "À", a = "à",
        E = "È", e = "è",
        I = "Ì", i = "ì", ["ı"] = "ì", ["\\i"] = "ì",
        O = "Ò", o = "ò",
        U = "Ù", u = "ù",
        Y = "Ỳ", y = "ỳ",
    },
    ["c"] = { [""] = "¸",
        C = "Ç", c = "ç",
        K = "Ķ", k = "ķ",
        L = "Ļ", l = "ļ",
        N = "Ņ", n = "ņ",
        R = "Ŗ", r = "ŗ",
        S = "Ş", s = "ş",
        T = "Ţ", t = "ţ",
    },
    ["k"] = { [""] = "˛",
        A = "Ą", a = "ą",
        E = "Ę", e = "ę",
        I = "Į", i = "į",
        U = "Ų", u = "ų",
    },
    ["r"] = { [""] = "˚",
        A = "Å", a = "å",
        U = "Ů", u = "ů",
    },
    ["u"] = { [""] = "˘",
        A = "Ă", a = "ă",
        E = "Ĕ", e = "ĕ",
        G = "Ğ", g = "ğ",
        I = "Ĭ", i = "ĭ", ["ı"] = "ĭ", ["\\i"] = "ĭ",
        O = "Ŏ", o = "ŏ",
        U = "Ŭ", u = "ŭ",
    },
    ["v"] = { [""] = "ˇ",
        C = "Č", c = "č",
        D = "Ď", d = "ď",
        E = "Ě", e = "ě",
        L = "Ľ", l = "ľ",
        N = "Ň", n = "ň",
        R = "Ř", r = "ř",
        S = "Š", s = "š",
        T = "Ť", t = "ť",
        Z = "Ž", z = "ž",
    },
    ["~"] = { [""] = "˜",
        A = "Ã", a = "ã",
        I = "Ĩ", i = "ĩ", ["ı"] = "ĩ", ["\\i"] = "ĩ",
        N = "Ñ", n = "ñ",
        O = "Õ", o = "õ",
        U = "Ũ", u = "ũ",
    },
}

texcharacters.accentmapping = accentmapping

local accent_map = allocate { -- incomplete
   ['~'] = "̃" , --  ̃ Ẽ
   ['"'] = "̈" , --  ̈ Ë
   ["`"] = "̀" , --  ̀ È
   ["'"] = "́" , --  ́ É
   ["^"] = "̂" , --  ̂ Ê
    --  ̄ Ē
    --  ̆ Ĕ
    --  ̇ Ė
    --  ̉ Ẻ
    --  ̌ Ě
    --  ̏ Ȅ
    --  ̑ Ȇ
    --  ̣ Ẹ
    --  ̧ Ȩ
    --  ̨ Ę
    --  ̭ Ḙ
    --  ̰ Ḛ
}

-- local accents = concat(table.keys(accentmapping)) -- was _map

local function remap_accent(a,c,braced)
    local m = accentmapping[a]
    if m then
        local n = m[c]
        if n then
            return n
        end
    end
--     local m = accent_map[a]
--     if m then
--         return c .. m
--     elseif braced then -- or #c > 0
    if braced then -- or #c > 0
        return "\\" .. a .. "{" .. c .. "}"
    else
        return "\\" .. a .. " " .. c
    end
end
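
-- A small commented illustration of the fallback behaviour: known combinations
-- resolve to the precomposed character, unknown ones return the original accent
-- command (braced or spaced).
--
-- print(remap_accent('"',"e",true))  -- ë
-- print(remap_accent("v","q",true))  -- \v{q} (no precomposed character known)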

local commandmapping = allocate {
    ["aa"] = "å", ["AA"] = "Å",
    ["ae"] = "æ", ["AE"] = "Æ",
    ["cc"] = "ç", ["CC"] = "Ç",
    ["i"]  = "ı", ["j"]  = "ȷ",
    ["ij"] = "ij", ["IJ"] = "IJ",
    ["l"]  = "ł", ["L"]  = "Ł",
    ["o"]  = "ø", ["O"]  = "Ø",
    ["oe"] = "œ", ["OE"] = "Œ",
    ["sz"] = "ß", ["SZ"] = "SZ", ["ss"] = "ß", ["SS"] = "ß", -- uppercase: ẞ
}

texcharacters.commandmapping = commandmapping

local ligaturemapping = allocate {
    ["''"]  = "”",
    ["``"]  = "“",
    ["--"]  = "–",
    ["---"] = "—",
}

-- Older accent handling code can be found in char-def.lua but in the meantime
-- we moved on. First the one with commands:

local untex, pattern

local function toutfpattern()
    if not untex then
        local hash = { }
        for k, v in next, accentmapping do
            for kk, vv in next, v do
                if (k >= "a" and k <= "z") or (k >= "A" and k <= "Z") then
                    hash[ "\\"..k.." "..kk     ] = vv
                    hash["{\\"..k.." "..kk.."}"] = vv
                else
                    hash["\\" ..k     ..kk     ] = vv
                    hash["{\\"..k     ..kk.."}"] = vv
                end
                hash["\\" ..k.."{"..kk.."}" ] = vv
                hash["{\\"..k.."{"..kk.."}}"] = vv
            end
        end
        for k, v in next, commandmapping do
            hash["\\"..k.." "] = v
            hash["{\\"..k.."}"] = v
            hash["{\\"..k.." }"] = v
        end
        for k, v in next, ligaturemapping do
            hash[k] = v
        end
        untex = utfchartabletopattern(hash) / hash
    end
    return untex
end

local function prepare()
    pattern = Cs((toutfpattern() + P(1))^0)
    return pattern
end

local function textoutf(str,strip)
    if str == "" then
        return str
    elseif not find(str,"\\",1,true) then
        return str
 -- elseif strip then
    else
        return lpegmatch(pattern or prepare(),str)
    end
end

texcharacters.toutfpattern = toutfpattern
texcharacters.toutf        = textoutf

-- print(texcharacters.toutf([[\~{Z}]],true))
-- print(texcharacters.toutf([[\'\i]],true))
-- print(texcharacters.toutf([[\'{\i}]],true))
-- print(texcharacters.toutf([[\"{e}]],true))
-- print(texcharacters.toutf([[\" {e}]],true))
-- print(texcharacters.toutf([[{\"{e}}]],true))
-- print(texcharacters.toutf([[{\" {e}}]],true))
-- print(texcharacters.toutf([[{\l}]],true))
-- print(texcharacters.toutf([[{\l }]],true))
-- print(texcharacters.toutf([[\v{r}]],true))
-- print(texcharacters.toutf([[fo{\"o}{\ss}ar]],true))
-- print(texcharacters.toutf([[H{\'a}n Th\^e\llap{\raise 0.5ex\hbox{\'{\relax}}} Th{\'a}nh]],true))

-- Next the ones without backslash

local untex, pattern

local function toutfpattern()
    if not untex then
        local hash = { }
        for k, v in next, accentmapping do
            for kk, vv in next, v do
                hash[k..kk] = vv
            end
        end
        for k, v in next, commandmapping do
            hash[k] = v
        end
        for k, v in next, ligaturemapping do
            hash[k] = v
        end
        untex = utfchartabletopattern(hash) / hash
    end
    return untex
end

local function prepare()
    pattern = Cs((toutfpattern() + P(1))^0)
    return pattern
end

local function textoutf(str)
    return lpegmatch(pattern or prepare(),str)
end

texcharacters.strtoutfpattern = toutfpattern
texcharacters.strtextoutf     = textoutf
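
-- Commented example; this assumes that utfchartabletopattern prefers the longest
-- match, so the accent plus letter combination wins over the bare accent:
--
-- print(texcharacters.strtextoutf([["e]])) -- ë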

local collapse = utffilters.collapse
local combine  = utffilters.combine

if not interfaces then return end

local implement = interfaces.implement

local pattern

local verbosemarks = characters.verbosemarks

if verbosemarks then

    mark(verbosemarks)

else

    verbosemarks = allocate {
        ["stroke"]               = utfchar(0x02F), ["slash"]        = utfchar(0x02F),
        ["middle dot"]           = utfchar(0x0B7),

        ["grave"]                = utfchar(0x300),
        ["acute"]                = utfchar(0x301),
        ["circumflex"]           = utfchar(0x302),
        ["tilde"]                = utfchar(0x303),
        ["macron"]               = utfchar(0x304), ["line"]         = utfchar(0x304),
        ["overline"]             = utfchar(0x305),
        ["breve"]                = utfchar(0x306),
        ["dot"]                  = utfchar(0x307),
        ["dieresis"]             = utfchar(0x308), ["diaeresis"]    = utfchar(0x308),
        ["hook"]                 = utfchar(0x309),
        ["ring"]                 = utfchar(0x30A),
        ["double acute"]         = utfchar(0x30B), ["hungarumlaut"] = utfchar(0x30B), -- tex speak
        ["caron"]                = utfchar(0x30C),
        ["vertical line"]        = utfchar(0x30D),
        ["double vertical line"] = utfchar(0x30E),
        ["double grave"]         = utfchar(0x30F),
        ["inverted breve"]       = utfchar(0x311),
        ["dot below"]            = utfchar(0x323),
        ["ring below"]           = utfchar(0x325),
        ["cedilla"]              = utfchar(0x327), ["comma below"]  = utfchar(0x327),
        ["ogonek"]               = utfchar(0x328),
        ["caron below"]          = utfchar(0x32C),
        ["circumflex below"]     = utfchar(0x32D),
        ["tilde below"]          = utfchar(0x330),
        ["macron below"]         = utfchar(0x331), ["line below"]   = utfchar(0x331),

        ["hook below"]           = utfchar(0x1FA9D),
    }

    characters.verbosemarks = verbosemarks

    if storage then
        storage.register("characters/verbosemarks", verbosemarks, "characters.verbosemarks")
    end

end

local function prepare()
    pattern = Cs((utfchartabletopattern(verbosemarks) / verbosemarks + lpegpatterns.space/"" + lpegpatterns.utf8character)^0)
    return pattern
end
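
-- What the pattern does (a commented sketch): verbose mark names become combining
-- characters and the separating spaces are stripped, so "e acute" turns into the
-- sequence e plus U+0301, which the collapse below then combines into é.
--
-- print(lpegmatch(prepare(),"e acute")) -- "e" .. utfchar(0x301)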

local hash = table.setmetatableindex(function(t,k)
    local f = ""
    k = lpegmatch(pattern or prepare(),k) or k
    local v = collapse(k) or k -- char specials
-- print("collapse",k,v)
    if k ~= v then
        goto DONE
    end
    v = combine(k) or k -- with specials
-- print("combine",k,v)
    if k ~= v then
        goto DONE
    end
    v = commandmapping[k] or k
-- print("command",k,v)
    if k ~= v then
        f = "\\"
        goto DONE
    end
    v = textoutf(k) or k
-- print("utf",k,v)
    if k ~= v then
        f = "\\"
        goto DONE
    end
  ::DONE::
    report_defining("instead of old school '%s%s' you can input the utf sequence %s",f,k,v)
    t[k] = v
    return v
end)

implement {
    name      = "chr",
    arguments = "argument",
    public    = true,
    actions   = function(str)
        context(hash[str]) -- expandable
    end
}
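
-- At the tex end this provides a public \chr command; a hedged example of its
-- intended use (the names come from the verbose marks and command mapping above):
--
--   \chr {e acute}  % é
--   \chr {ae}       % æ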

function texcharacters.safechar(n) -- was characters.safechar
    local c = data[n]
    if c and c.contextname then
        return "\\" .. c.contextname
    else
        return utfchar(n)
    end
end
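
-- Characters with a contextname come back as a command, others as themselves; a
-- commented example (the name "ampersand" is assumed to be set in char-def.lua):
--
-- print(texcharacters.safechar(0x26))   -- \ampersand
-- print(texcharacters.safechar(0x4E00)) -- 一 (no contextname, so the character itself)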

if not context or not commands then
    -- used in e.g. mtx-bibtex
    return
end

-- all kinds of initializations

local tex           = tex
local texsetlccode  = tex.setlccode
local texsetsfcode  = tex.setsfcode
local texsetcatcode = tex.setcatcode

local contextsprint = context.sprint
local ctxcatcodes   = catcodes.numbers.ctxcatcodes

local texsetmacro   = tokens.setters.macro
local texsetchar    = tokens.setters.char

-- function texcharacters.defineaccents()
--     local ctx_dodefineaccentcommand  = context.dodefineaccent
--     local ctx_dodefineaccent         = context.dodefineaccent
--     local ctx_dodefinecommand        = context.dodefinecommand
--     for accent, group in next, accentmapping do
--         ctx_dodefineaccentcommand(accent)
--         for character, mapping in next, group do
--             ctx_dodefineaccent(accent,character,mapping)
--         end
--     end
--     for command, mapping in next, commandmapping do
--         ctx_dodefinecommand(command,mapping)
--     end
--     os.exit()
-- end

function texcharacters.defineaccents()
    local ctx_dodefinecombine = context.dodefinecombine
    local ctx_dodefinecommand = context.dodefinecommand
    for verbose, mark in next, verbosemarks do
        ctx_dodefinecombine((gsub(verbose," ","")),mark)
    end
    for command, mapping in next, commandmapping do
        ctx_dodefinecommand(command,mapping)
    end
end

implement { -- a waste of scanner but consistent
    name    = "defineaccents",
    actions = texcharacters.defineaccents
}

--[[ldx--
<p>Instead of using a <l n='tex'/> file to define the named glyphs, we
use the table. After all, we have this information available anyway.</p>
--ldx]]--

local function to_number(s)
    local n = tonumber(s)
    if n then
        return n
    end
    return tonumber(match(s,'^"(.*)$'),16) or 0
end
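
-- Both plain decimal and tex style hexadecimal (a leading ") input are accepted:
--
-- to_number("214") -- 214
-- to_number('"D6') -- 214 (0xD6)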

implement {
    name      = "utfchar",
    actions   = { to_number, utfchar, contextsprint },
    arguments = "string"
}

implement {
    name      = "safechar",
    actions   = { to_number, texcharacters.safechar, contextsprint },
    arguments = "string"
}

implement {
    name      = "uchar",
    arguments = { "integer", "integer" },
    actions   = function(h,l)
        context(utfchar(h*256+l))
    end
}
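
-- The two integers are the high and low byte, so at the tex end (a commented
-- sketch) \uchar {1}{226} pipes utfchar(1*256+226) = utfchar(0x1E2) = Ǣ back in.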

tex.uprint = commands.utfchar

-- in context we don't use lc and uc codes (in fact in luatex we should have a hf code)
-- so at some point we might drop this

-- The following get set at the TeX end:

local forbidden = tohash {
    0x000A0, -- zs nobreakspace            <self>
    0x000AD, -- cf softhyphen              <self>
 -- 0x00600, -- cf arabicnumber            <self>
 -- 0x00601, -- cf arabicsanah             <self>
 -- 0x00602, -- cf arabicfootnotemarker    <self>
 -- 0x00603, -- cf arabicsafha             <self>
 -- 0x00604, -- cf arabicsamvat            <self>
 -- 0x00605, -- cf arabicnumberabove       <self>
 -- 0x0061C, -- cf arabiclettermark        <self>
 -- 0x006DD, -- cf arabicendofayah         <self>
 -- 0x008E2, -- cf arabicdisputedendofayah <self>
    0x02000, -- zs enquad                  <self>
    0x02001, -- zs emquad                  <self>
    0x02002, -- zs enspace                 \kern .5\emwidth
    0x02003, -- zs emspace                 \hskip \emwidth
    0x02004, -- zs threeperemspace         <self>
    0x02005, -- zs fourperemspace          <self>
    0x02006, -- zs sixperemspace           <self>
    0x02007, -- zs figurespace             <self>
    0x02008, -- zs punctuationspace        <self>
    0x02009, -- zs breakablethinspace      <self>
    0x0200A, -- zs hairspace               <self>
    0x0200B, -- cf zerowidthspace          <self>
    0x0200C, -- cf zwnj                    <self>
    0x0200D, -- cf zwj                     <self>
    0x0202F, -- zs narrownobreakspace      <self>
    0x0205F, -- zs medspace                \textormathspace +\medmuskip 2
 -- 0x03000, -- zs ideographicspace        <self>
 -- 0x0FEFF, -- cf zerowidthnobreakspace   \penalty \plustenthousand \kern \zeropoint
}

local csletters  = characters.csletters -- also a signal that we have initialized
local activated  = { }
local sfstate    = "unset" -- unset, traditional, normal
local blocks_too = false

directives.register("characters.blockstoo",function(v) blocks_too = v end)

-- If this is something that is not documentwide and used a lot, then we
-- need a more clever approach (trivial but not now).

local function setuppersfcodes(v,n)
    if sfstate ~= "unset" then
        report_defining("setting uppercase sf codes to %a",n)
        for u, chr in next, data do
            if chr.category == "lu" then
                texsetsfcode(u,n)
            end
        end
    end
    sfstate = v
end

directives.register("characters.spaceafteruppercase",function(v)
    if v == "traditional" then
        setuppersfcodes(v,999)
    elseif v == "normal" then
        setuppersfcodes(v,1000)
    end
end)
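
-- A hedged usage example from the tex end:
--
--   \enabledirectives[characters.spaceafteruppercase=traditional]
--
-- sets the space factor code of uppercase letters to 999, the traditional plain
-- tex way of not treating a period after an uppercase letter as end of sentence.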

if not csletters then

    csletters            = allocate()
    characters.csletters = csletters

    report_defining("setting up character related codes and commands")

    if sfstate == "unset" then
        sfstate = "traditional"
    end

    local traditional = sfstate == "traditional"

    for u, chr in next, data do -- will move up
        local contextname = chr.contextname
        local category    = chr.category
        local isletter    = is_letter[category]
        if contextname then
            if is_character[category] then
                if chr.unicodeslot < 128 then
                    if isletter then
                        local c = utfchar(u)
                        csletters[c] = u
                    end
                else
                    local c = utfchar(u)
                    if isletter and u >= 32 and u <= 65536 then
                        csletters[c] = u
                    end
                end
                if isletter then
                    local lc = chr.lccode
                    local uc = chr.uccode
                    if not lc then
                        chr.lccode = u
                        lc = u
                    elseif type(lc) == "table" then
                        lc = u
                    end
                    if not uc then
                        chr.uccode = u
                        uc = u
                    elseif type(uc) == "table" then
                        uc = u
                    end
                    texsetlccode(u,lc,uc)
                    if traditional and category == "lu" then
                        texsetsfcode(u,999)
                    end
                end
            elseif is_command[category] and not forbidden[u] then
                -- skip
            elseif is_mark[category] then
                texsetlccode(u,u,u) -- for hyphenation
            end
        elseif isletter then
            csletters[utfchar(u)] = u
            local lc, uc = chr.lccode, chr.uccode
            if not lc then
                chr.lccode = u
                lc = u
            elseif type(lc) == "table" then
                lc = u
            end
            if not uc then
                chr.uccode = u
                uc = u
            elseif type(uc) == "table" then
                uc = u
            end
            texsetlccode(u,lc,uc)
            if traditional and category == "lu" then
                texsetsfcode(u,999)
            end
        elseif is_mark[category] then
            texsetlccode(u,u,u) -- for hyphenation
        end
    end

    if blocks_too then
        -- this slows down format generation by over 10 percent
        for k, v in next, blocks do
            if v.catcode == "letter" then
                local first = v.first
                local last  = v.last
                local gaps  = v.gaps
                if first and last then
                    for u=first,last do
                        csletters[utfchar(u)] = u
                        --
                     -- texsetlccode(u,u,u) -- self self
                        --
                    end
                end
                if gaps then
                    for i=1,#gaps do
                        local u = gaps[i]
                        csletters[utfchar(u)] = u
                        --
                     -- texsetlccode(u,u,u) -- self self
                        --
                    end
                end
            end
        end
    end

    if storage then
        storage.register("characters/csletters", csletters, "characters.csletters")
    end

    function characters.setcharacternames(ctt)
        for u, chr in next, data do -- will move up
            local contextname = chr.contextname
            local category    = chr.category
            local isletter    = is_letter[category]
            if contextname then
                if is_character[category] then
                    if chr.unicodeslot < 128 then
                        if isletter then
                            texsetmacro(contextname,utfchar(u),"immutable")
                        else
                            texsetchar(contextname,u,"immutable")
                        end
                    else
                        texsetmacro(contextname,utfchar(u),"immutable")
                    end
                elseif is_command[category] and not forbidden[u] then
                    texsetmacro(contextname,utfchar(u),"immutable")
                end
            end
        end
    end

else
    mark(csletters)
end

lpegpatterns.csletter = utfchartabletopattern(csletters)
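
-- The resulting pattern matches any single utf character that we know as a
-- letter; a commented sketch (assuming a regular format generation):
--
-- print(lpegmatch(lpegpatterns.csletter,"ø")) -- a position (match)
-- print(lpegmatch(lpegpatterns.csletter,"1")) -- nil (no match)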

-- The engine presets the letters to 11 (always).

function characters.setlettercatcodes(cct)
    if trace_defining then
        report_defining("assigning letter catcodes to catcode table %a",cct)
    end
    local saved = tex.catcodetable
    tex.catcodetable = cct
    texsetcatcode(0x200C,11) -- non-joiner
    texsetcatcode(0x200D,11) -- joiner
    for c, u in next, csletters do
        texsetcatcode(u,11)
    end
    tex.catcodetable = saved
end

function characters.setothercatcodes(cct)
    if trace_defining then
        report_defining("assigning other catcodes to catcode table %a",cct)
    end
    local saved = tex.catcodetable
    tex.catcodetable = cct
    for u=65,90 do
        texsetcatcode(u,12)
    end
    for u=97,122 do
        texsetcatcode(u,12)
    end
    tex.catcodetable = saved
end

function characters.setactivecatcodes(cct)
    local saved = tex.catcodetable
    tex.catcodetable = cct
    for i=1,#activated do
        local u = activated[i]
        texsetcatcode(u,13)
        if trace_defining then
            report_defining("character %U (%s) is active in set %a",u,data[u].description,cct)
        end
    end
    tex.catcodetable = saved
end

--[[ldx--
<p>Setting the lccodes is also done in a loop over the data table.</p>
--ldx]]--

implement {
    name      = "chardescription",
    arguments = "integer",
    actions   = function(slot)
        local d = data[slot]
        if d then
            context(d.description)
        end
    end,
}

if characters.setcharacternames then -- only in ini mode

    implement { name = "setlettercatcodes", scope = "private", actions = characters.setlettercatcodes, arguments = "integer" }
    implement { name = "setothercatcodes",  scope = "private", actions = characters.setothercatcodes,  arguments = "integer" }
    implement { name = "setactivecatcodes", scope = "private", actions = characters.setactivecatcodes, arguments = "integer" }
    implement { name = "setcharacternames", scope = "private", actions = characters.setcharacternames, arguments = "integer" }

end

-- experiment (some can move to char-ini.lua)

local function overload(c,u,code,codes)
    local c = tonumber(c)
    if not c then
        return
    end
    local u = utilities.parsers.settings_to_array(u)
    local n = #u
    if n == 0 then
        return
    end
    local t = nil
    if n == 1 then
        t = tonumber(u[1])
    else
        t = { }
        for i=1,n do
            t[#t+1] = tonumber(u[i])
        end
    end
    if t then
        data[c][code] = t
        characters[codes][c] = nil
    end
end

implement {
    name      = "overloaduppercase",
    arguments = "2 strings",
    actions   = function(c,u)
        overload(c,u,"uccode","uccodes")
    end
}

implement {
    name      = "overloadlowercase",
    arguments = "2 strings",
    actions   = function(c,u)
        overload(c,u,"lccode","lccodes")
    end
}
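
-- These are meant to be called from the tex end; a hedged sketch of such a call,
-- assuming the usual \clf_ prefix for implemented functions:
--
--   \clf_overloaduppercase{0xDF}{0x1E9E}
--
-- which makes ß uppercase to ẞ (one slot) instead of the multi character default
-- and flushes the cached uccodes entry for that character.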

-- Just for fun we support keywords:
--
-- \startTEXpage[offset=10pt]
--     abg"
--     \sl \showboxes
--     \accent               `" h%
--     \accent               `" x%
--     \accent yoffset  .2ex `" x
--     \accent yoffset 1.1ex `x x%
-- \stopTEXpage
--
-- We could do this:
--
-- \startTEXpage[offset=10pt]
--     abg"
--     \sl \showboxes
--     \withaccent               `" h%
--     \withaccent               `" x%
--     \withaccent yoffset  .2ex `" x
--     \withaccent yoffset 1.1ex accent `x base `x%
-- \stopTEXpage
--
-- But only when users demand it:
--
-- do
--
--     local new_glyph = nodes.pool.glyph
--
--     local scankeyword   = tokens.scanners.keyword
--     local scaninteger   = tokens.scanners.integer
--     local scandimension = tokens.scanners.dimension
--     local scantoken     = tokens.scanners.token
--
--     implement {
--         name      = "withaccent",
--         public    = true,
--         protected = true,
--         actions   = function()
--             local xoffset = 0
--             local yoffset = 0
--             local accent  = false
--             local base    = false
--             local zwj     = 0x200D
--             while true do
--                 if scankeyword("xoffset") then
--                     xoffset = scandimension()
--                 elseif scankeyword("yoffset") then
--                     yoffset = scandimension()
--                 elseif scankeyword("accent") then
--                     accent = scaninteger()
--                 elseif scankeyword("base") then
--                     base = scaninteger()
--                 else
--                     break
--                 end
--             end
--             if not accent then
--                 accent = scaninteger()
--             end
--             if not base then
--                 local nxttok = scantoken()
--                 base = nxttok.cmdname == "char_number" and scaninteger() or nxttok.index
--             end
--             if base and accent and base > 0 and accent > 0 then
--                 base   = new_glyph(true,base)
--                 zwj    = new_glyph(true,zwj)
--                 accent = new_glyph(true,accent)
--                 local slant   = fonts.hashes.parameters[true].slant / 65536 -- a la tex
--                 local xheight = fonts.hashes.parameters[true].xheight -- hm, compensated for glyphscale?
--                 accent.xoffset = xoffset - .5*(base.width -accent.width) + .5*(base.height-accent.height) * slant
--                 accent.yoffset = yoffset - (xheight - accent.height)
--                 accent.left    = accent.width
--                 accent.options = accent.options | 0x40 | 0x80
--                 context.dontleavehmode()
--                 context(base)
--                 context(zwj)
--                 context(accent)
--             end
--         end,
--     }
--
-- end