char-tex.lua /size: 25 Kb    last modification: 2025-02-21 11:03
1if not modules then modules = { } end modules ['char-tex'] = {
2    version   = 1.001,
3    comment   = "companion to char-ini.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9local lpeg = lpeg
10local tonumber, next, type = tonumber, next, type
11local format, find, gmatch, match = string.format, string.find, string.gmatch, string.match
12local utfchar, utfbyte = utf.char, utf.byte
13local concat, tohash = table.concat, table.tohash
14local P, C, R, S, V, Cs, Cc = lpeg.P, lpeg.C, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs, lpeg.Cc
15
16local lpegpatterns          = lpeg.patterns
17local lpegmatch             = lpeg.match
18local utfchartabletopattern = lpeg.utfchartabletopattern
19
20local allocate              = utilities.storage.allocate
21local mark                  = utilities.storage.mark
22
23local context               = context
24local commands              = commands
25
26local characters            = characters
27local texcharacters         = { }
28characters.tex              = texcharacters
29local utffilters            = characters.filters.utf
30
31local is_character          = characters.is_character
32local is_letter             = characters.is_letter
33local is_command            = characters.is_command
34local is_spacing            = characters.is_spacing
35local is_mark               = characters.is_mark
36local is_punctuation        = characters.is_punctuation
37
38local data                  = characters.data  if not data then return end
39local blocks                = characters.blocks
40
-- Tracing switch of this module. The tracker callback has to update the local
-- flag, because that is the variable tested by the functions further down.
local trace_defining        = false  trackers.register("characters.defining", function(v) trace_defining = v end)
42
43local report_defining       = logs.reporter("characters")
44
45-- In order to deal with 8-bit output, we need to find a way to go from UTF to
46-- 8-bit. This is handled in the 32 bit engine itself. This leaves us problems with
47-- characters that are specific to TeX, like curly braces and dollars. We can remap
48-- some chars that tex input files are sensitive for to a private area (while
49-- writing to a utility file) and revert then to their original slot when we read in
50-- such a file. Instead of reverting, we can (when we resolve characters to glyphs)
51-- map them to their right glyph there. For this purpose we can use the private
52-- planes 0x0F0000 and 0x100000.
53
-- Lookup tables used for moving TeX-sensitive characters in and out of the
-- private area. They start out empty and are filled by the loop over 'special'
-- that follows.
local low     = allocate() -- character -> utf string of its private slot
local high    = allocate() -- utf string of the private slot -> character
local escapes = allocate() -- character -> backslash followed by the character
local special = "~#$%^&_{}\\|" -- "~#$%{}\\|"

-- The three tables are bundled so that they can be reached from elsewhere via
-- the utf filters namespace.
local private = {
    low     = low,
    high    = high,
    escapes = escapes,
}

utffilters.private = private
66
-- Register each TeX-sensitive character: its escaped form, its slot in the
-- private area (0x0F0000 + byte value) and the reverse mapping. The iterator
-- returned by gmatch only yields strings, so there is no numeric case to
-- handle here.
for ch in gmatch(special,".") do
    local cb = utfbyte(ch)
    if cb < 256 then
        local slot = utfchar(0x0F0000 + cb)
        escapes[ch] = "\\" .. ch
        low[ch]     = slot
        -- nasty, but we need this as in replacements (also in lpeg) % is interpreted
        high[slot]  = ch == "%" and "%%" or ch
    end
end
85
-- Two replacers: one built from 'low' (character -> private slot) and one
-- built from 'high' (private slot -> character).
local tohigh = lpeg.replacer(low)   -- frozen, only for basic tex
local tolow  = lpeg.replacer(high)  -- frozen, only for basic tex

-- Both patterns are also stored in the shared lpeg patterns table.
lpegpatterns.utftohigh = tohigh
lpegpatterns.utftolow  = tolow
91
-- Runs the 'tohigh' replacer (built from the 'low' table) over str and
-- returns whatever the match yields.
function utffilters.harden(str)
    return lpegmatch(tohigh,str)
end
95
-- Counterpart of harden: runs the 'tolow' replacer (built from the 'high'
-- table) over str and returns whatever the match yields.
function utffilters.soften(str)
    return lpegmatch(tolow,str)
end
99
-- Remappers created from the same three tables; they are stored in 'private'
-- next to the tables themselves.
private.escape  = utf.remapper(escapes) -- maybe: ,"dynamic"
private.replace = utf.remapper(low)     -- maybe: ,"dynamic"
private.revert  = utf.remapper(high)    -- maybe: ,"dynamic"
103
-- Mapping from TeX accent commands to precomposed characters. The outer key is
-- the accent command name (without backslash), the inner key is the accented
-- character and the value is the composed character. The empty inner key holds
-- the accent as a standalone character. Both the dotless "ı" and the command
-- form "\i" are accepted for the letter i.
local accentmapping = allocate {
    ['"'] = { [""] = "¨",
        A = "Ä", a = "ä",
        E = "Ë", e = "ë",
        I = "Ï", i = "ï", ["ı"] = "ï", ["\\i"] = "ï",
        O = "Ö", o = "ö",
        U = "Ü", u = "ü",
        Y = "Ÿ", y = "ÿ",
    },
    ["'"] = { [""] = "´",
        A = "Á", a = "á",
        C = "Ć", c = "ć",
        E = "É", e = "é",
        I = "Í", i = "í", ["ı"] = "í", ["\\i"] = "í",
        L = "Ĺ", l = "ĺ",
        N = "Ń", n = "ń",
        O = "Ó", o = "ó",
        R = "Ŕ", r = "ŕ",
        S = "Ś", s = "ś",
        U = "Ú", u = "ú",
        Y = "Ý", y = "ý",
        Z = "Ź", z = "ź",
    },
    ["."] = { [""] = "˙",
        C = "Ċ", c = "ċ",
        E = "Ė", e = "ė",
        G = "Ġ", g = "ġ",
        I = "İ", i = "i", ["ı"] = "i", ["\\i"] = "i",
        Z = "Ż", z = "ż",
    },
    ["="] = { [""] = "¯",
        A = "Ā", a = "ā",
        E = "Ē", e = "ē",
        I = "Ī", i = "ī", ["ı"] = "ī", ["\\i"] = "ī",
        O = "Ō", o = "ō",
        U = "Ū", u = "ū",
    },
    ["H"] = { [""] = "˝",
        O = "Ő", o = "ő",
        U = "Ű", u = "ű",
    },
    ["^"] = { [""] = "ˆ",
        A = "Â", a = "â",
        C = "Ĉ", c = "ĉ",
        E = "Ê", e = "ê",
        G = "Ĝ", g = "ĝ",
        H = "Ĥ", h = "ĥ",
        I = "Î", i = "î", ["ı"] = "î", ["\\i"] = "î",
        J = "Ĵ", j = "ĵ",
        O = "Ô", o = "ô",
        S = "Ŝ", s = "ŝ",
        U = "Û", u = "û",
        W = "Ŵ", w = "ŵ",
        Y = "Ŷ", y = "ŷ",
    },
    ["`"] = { [""] = "`",
        A = "À", a = "à",
        E = "È", e = "è",
        I = "Ì", i = "ì", ["ı"] = "ì", ["\\i"] = "ì",
        O = "Ò", o = "ò",
        U = "Ù", u = "ù",
        -- an empty value here would make the conversion delete the sequence
        Y = "Ỳ", y = "ỳ",
    },
    ["c"] = { [""] = "¸",
        C = "Ç", c = "ç",
        K = "Ķ", k = "ķ",
        L = "Ļ", l = "ļ",
        N = "Ņ", n = "ņ",
        R = "Ŗ", r = "ŗ",
        S = "Ş", s = "ş",
        T = "Ţ", t = "ţ",
    },
    ["k"] = { [""] = "˛",
        A = "Ą", a = "ą",
        E = "Ę", e = "ę",
        I = "Į", i = "į",
        U = "Ų", u = "ų",
    },
    ["r"] = { [""] = "˚",
        A = "Å", a = "å",
        U = "Ů", u = "ů",
    },
    ["u"] = { [""] = "˘",
        A = "Ă", a = "ă",
        E = "Ĕ", e = "ĕ",
        G = "Ğ", g = "ğ",
        I = "Ĭ", i = "ĭ", ["ı"] = "ĭ", ["\\i"] = "ĭ",
        O = "Ŏ", o = "ŏ",
        U = "Ŭ", u = "ŭ",
    },
    ["v"] = { [""] = "ˇ",
        C = "Č", c = "č",
        D = "Ď", d = "ď",
        E = "Ě", e = "ě",
        L = "Ľ", l = "ľ",
        N = "Ň", n = "ň",
        R = "Ř", r = "ř",
        S = "Š", s = "š",
        T = "Ť", t = "ť",
        Z = "Ž", z = "ž",
    },
    ["~"] = { [""] = "˜",
        A = "Ã", a = "ã",
        I = "Ĩ", i = "ĩ", ["ı"] = "ĩ", ["\\i"] = "ĩ",
        N = "Ñ", n = "ñ",
        O = "Õ", o = "õ",
        U = "Ũ", u = "ũ",
    },
}

texcharacters.accentmapping = accentmapping
215
-- Accent command -> combining accent character. In the code visible here this
-- table is only referenced from commented-out code (see remap_accent). The
-- comment-only lines list combining accents that have no entry yet, each
-- followed by an example of a composed character.
local accent_map = allocate { -- incomplete
   ['~'] = "̃" , --  ̃ Ẽ
   ['"'] = "̈" , --  ̈ Ë
   ["`"] = "̀" , --  ̀ È
   ["'"] = "́" , --  ́ É
   ["^"] = "̂" , --  ̂ Ê
    --  ̄ Ē
    --  ̆ Ĕ
    --  ̇ Ė
    --  ̉ Ẻ
    --  ̌ Ě
    --  ̏ Ȅ
    --  ̑ Ȇ
    --  ̣ Ẹ
    --  ̧ Ȩ
    --  ̨ Ę
    --  ̭ Ḙ
    --  ̰ Ḛ
}
235
236-- local accents = concat(table.keys(accentmapping)) -- was _map
237
-- Resolves accent command 'a' applied to character 'c'. When accentmapping has
-- a composed character for the pair, that character is returned. Otherwise the
-- TeX input is rebuilt: "\a{c}" when 'braced' is true, "\a c" when not.
local function remap_accent(a,c,braced)
    local group    = accentmapping[a]
    local composed = group and group[c]
    if composed then
        return composed
    end
    -- nothing precomposed is known (a lookup of a combining accent in
    -- accent_map is currently disabled), so hand back the input form
    local prefix = "\\" .. a
    if braced then -- or #c > 0
        return prefix .. "{" .. c .. "}"
    end
    return prefix .. " " .. c
end
256
-- Mapping from TeX character commands (name without backslash) to the
-- character they stand for. No value may be empty: an empty value would make
-- the conversion to utf delete the command.
local commandmapping = allocate {
    ["aa"] = "å", ["AA"] = "Å",
    ["ae"] = "æ", ["AE"] = "Æ",
    ["cc"] = "ç", ["CC"] = "Ç",
    ["i"]  = "ı", ["j"]  = "ȷ",
    ["ij"] = "ij", ["IJ"] = "IJ",
    ["l"]  = "ł", ["L"]  = "Ł",
    ["o"]  = "ø", ["O"]  = "Ø",
    ["oe"] = "œ", ["OE"] = "Œ",
    ["sz"] = "ß", ["SZ"] = "SZ", ["ss"] = "ß", ["SS"] = "ß", -- uppercase: ẞ
}

texcharacters.commandmapping = commandmapping
270
-- TeX input ligatures and the characters they represent: double quotes and
-- the en and em dash. These entries are added as-is to the hash that is used
-- for the conversion to utf.
local ligaturemapping = allocate {
    ["''"]  = "”",
    ["``"]  = "“",
    ["--"]  = "–",
    ["---"] = "—",
}
277
278-- local achar    = R("az","AZ") + P("ı") + P("\\i")
279--
280-- local spaces   = P(" ")^0
281-- local no_l     = P("{") / ""
282-- local no_r     = P("}") / ""
283-- local no_b     = P('\\') / ""
284--
285-- local lUr      = P("{") * C(achar) * P("}")
286--
287-- local accents_1 = [["'.=^`~]]
288-- local accents_2 = [[Hckruv]]
289--
290-- local accent   = P('\\') * (
291--     C(S(accents_1)) * (lUr * Cc(true) + C(achar) * Cc(false)) + -- we need achar for ı etc, could be sped up
292--     C(S(accents_2)) *  lUr * Cc(true)
293-- ) / remap_accent
294--
295-- local csname  = P('\\') * C(R("az","AZ")^1)
296--
297-- local command  = (
298--     csname +
299--     P("{") * csname * spaces * P("}")
300-- ) / commandmapping -- remap_commands
301--
302-- local both_1 = Cs { "run",
303--     accent  = accent,
304--     command = command,
305--     run     = (V("accent") + no_l * V("accent") * no_r + V("command") + P(1))^0,
306-- }
307--
308-- local both_2 = Cs { "run",
309--     accent  = accent,
310--     command = command,
311--     run     = (V("accent") + V("command") + no_l * ( V("accent") + V("command") ) * no_r + P(1))^0,
312-- }
313--
314-- function texcharacters.toutf(str,strip)
315--     if not find(str,"\\",1,true) then
316--         return str
317--     elseif strip then
318--         return lpegmatch(both_1,str)
319--     else
320--         return lpegmatch(both_2,str)
321--     end
322-- end
323
local untex -- the pattern, once it has been built

-- Returns the pattern that replaces TeX accent commands, character commands
-- and input ligatures by their utf counterparts. The pattern is built on the
-- first call and reused afterwards.
local function toutfpattern()
    if untex then
        return untex
    end
    local hash = { }
    for accent, group in next, accentmapping do
        -- an accent command that is a letter is separated from an unbraced
        -- argument by a space, other accents are followed by it directly
        local isletter  = (accent >= "a" and accent <= "z") or (accent >= "A" and accent <= "Z")
        local separator = isletter and " " or ""
        local command   = "\\" .. accent
        for character, result in next, group do
            local plain  = command .. separator .. character
            local braced = command .. "{" .. character .. "}"
            hash[plain]                = result
            hash["{" .. plain .. "}"]  = result
            hash[braced]               = result
            hash["{" .. braced .. "}"] = result
        end
    end
    for name, result in next, commandmapping do
        local command = "\\" .. name
        hash[command .. " "]         = result
        hash["{" .. command .. "}"]  = result
        hash["{" .. command .. " }"] = result
    end
    for ligature, result in next, ligaturemapping do
        hash[ligature] = result
    end
    untex = utfchartabletopattern(hash) / hash
    return untex
end

texcharacters.toutfpattern = toutfpattern
356
local pattern = nil -- substitution pattern, created when first needed

-- Wraps the replacement pattern so that it is applied over a whole string:
-- anything that is not matched is copied as-is.
local function prepare()
    pattern = Cs((toutfpattern() + P(1))^0)
    return pattern
end

-- Converts TeX input sequences in str into utf. A string that is empty or has
-- no backslash in it is returned untouched. The 'strip' argument is accepted
-- but not used.
function texcharacters.toutf(str,strip)
    if str == "" or not find(str,"\\",1,true) then
        return str
    end
    if not pattern then
        prepare()
    end
    return lpegmatch(pattern,str)
end
374
375-- print(texcharacters.toutf([[\~{Z}]],true))
376-- print(texcharacters.toutf([[\'\i]],true))
377-- print(texcharacters.toutf([[\'{\i}]],true))
378-- print(texcharacters.toutf([[\"{e}]],true))
379-- print(texcharacters.toutf([[\" {e}]],true))
380-- print(texcharacters.toutf([[{\"{e}}]],true))
381-- print(texcharacters.toutf([[{\" {e}}]],true))
382-- print(texcharacters.toutf([[{\l}]],true))
383-- print(texcharacters.toutf([[{\l }]],true))
384-- print(texcharacters.toutf([[\v{r}]],true))
385-- print(texcharacters.toutf([[fo{\"o}{\ss}ar]],true))
386-- print(texcharacters.toutf([[H{\'a}n Th\^e\llap{\raise 0.5ex\hbox{\'{\relax}}} Th{\'a}nh]],true))
387
-- Returns a backslash followed by the context name of character n when the
-- character data has one, and the utf character itself otherwise.
function texcharacters.safechar(n) -- was characters.safechar
    local entry = data[n]
    local name  = entry and entry.contextname
    if name then
        return "\\" .. name
    end
    return utfchar(n)
end
396
-- The remainder of this file uses 'context', 'commands' and 'interfaces', so
-- loading stops here when any of them is missing.
if not context or not commands then
    -- used in e.g. mtx-bibtex
    return
end

-- all kind of initializations

if not interfaces then return end
405
406local implement     = interfaces.implement
407
408local tex           = tex
409local texsetlccode  = tex.setlccode
410local texsetsfcode  = tex.setsfcode
411local texsetcatcode = tex.setcatcode
412
413local contextsprint = context.sprint
414local ctxcatcodes   = catcodes.numbers.ctxcatcodes
415
416local texsetmacro   = tokens.setters.macro
417local texsetchar    = tokens.setters.char
418
-- Passes the accent and command tables on to the TeX end: one call per accent
-- command, one call per accent/character pair and one call per character
-- command.
function texcharacters.defineaccents()
    local defineaccentcommand = context.dodefineaccentcommand
    local defineaccent        = context.dodefineaccent
    local definecommand       = context.dodefinecommand
    for accent, group in next, accentmapping do
        defineaccentcommand(accent)
        for character, composed in next, group do
            defineaccent(accent,character,composed)
        end
    end
    for name, replacement in next, commandmapping do
        definecommand(name,replacement)
    end
end
433
-- Registers texcharacters.defineaccents under the name "defineaccents"; no
-- arguments are declared.
implement { -- a waste of scanner but consistent
    name    = "defineaccents",
    actions = texcharacters.defineaccents
}
438
439-- Instead of using a TeX file to define the named glyphs, we use the table. After
440-- all, we have this information available anyway.
441
-- Prints TeX code (under the ctx catcodes) that gives character n catcode 13
-- and defines it to expand to \name.
function commands.makeactive(n,name) -- not used
    local character  = utfchar(n)
    local definition = format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,character,name)
    contextsprint(ctxcatcodes,definition)
 -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)
end
446
-- Converts a string to a number. Anything that tonumber accepts is returned
-- as such; otherwise a string that starts with a double quote is read as a
-- hexadecimal number (the TeX notation). Everything else results in 0.
local function to_number(s)
    local n = tonumber(s)
    if n then
        return n
    end
    -- tonumber with an explicit base raises an error when it gets nil, so the
    -- hexadecimal part has to be checked before it is converted
    local hex = match(s,'^"(.*)$')
    return hex and tonumber(hex,16) or 0
end
454
-- "utfchar" and "safechar" take one string argument. Their action lists name
-- to_number, a conversion function and contextsprint (presumably applied in
-- that order, each one getting the result of the previous one - confirm
-- against interfaces.implement).
implement {
    name      = "utfchar",
    actions   = { to_number, utfchar, contextsprint },
    arguments = "string"
}

implement {
    name      = "safechar",
    actions   = { to_number, texcharacters.safechar, contextsprint },
    arguments = "string"
}

-- "uchar" takes two integers and prints the character with code h*256+l.
implement {
    name      = "uchar",
    arguments = { "integer", "integer" },
    actions   = function(h,l)
        context(utfchar(h*256+l))
    end
}

-- NOTE(review): this relies on commands.utfchar existing at this point
-- (presumably created by the implement call above) - confirm.
tex.uprint = commands.utfchar
476
-- in context we don't use lc and uc codes (in fact in luatex we should have a hf code)
-- so at some point we might drop this
479
480-- The following get set at the TeX end:
481
-- Set of code points that must not get a macro definition from the character
-- data: the code below skips a character of a command category when its slot
-- is in this set. Per entry the comment gives the category, the context name
-- and what is used instead. Commented entries are not in the set.
local forbidden = tohash {
    0x000A0, -- zs nobreakspace            <self>
    0x000AD, -- cf softhyphen              <self>
 -- 0x00600, -- cf arabicnumber            <self>
 -- 0x00601, -- cf arabicsanah             <self>
 -- 0x00602, -- cf arabicfootnotemarker    <self>
 -- 0x00603, -- cf arabicsafha             <self>
 -- 0x00604, -- cf arabicsamvat            <self>
 -- 0x00605, -- cf arabicnumberabove       <self>
 -- 0x0061C, -- cf arabiclettermark        <self>
 -- 0x006DD, -- cf arabicendofayah         <self>
 -- 0x008E2, -- cf arabicdisputedendofayah <self>
    0x02000, -- zs enquad                  <self>
    0x02001, -- zs emquad                  <self>
    0x02002, -- zs enspace                 \kern .5\emwidth
    0x02003, -- zs emspace                 \hskip \emwidth
    0x02004, -- zs threeperemspace         <self>
    0x02005, -- zs fourperemspace          <self>
    0x02006, -- zs sixperemspace           <self>
    0x02007, -- zs figurespace             <self>
    0x02008, -- zs punctuationspace        <self>
    0x02009, -- zs breakablethinspace      <self>
    0x0200A, -- zs hairspace               <self>
    0x0200B, -- cf zerowidthspace          <self>
    0x0200C, -- cf zwnj                    <self>
    0x0200D, -- cf zwj                     <self>
    0x0202F, -- zs narrownobreakspace      <self>
    0x0205F, -- zs medspace                \textormathspace +\medmuskip 2
 -- 0x03000, -- zs ideographicspace        <self>
 -- 0x0FEFF, -- cf zerowidthnobreakspace   \penalty \plustenthousand \kern \zeropoint
}
513
-- State shared by the initialization below. These have to be locals with
-- exactly the names that are used further down, otherwise the directives and
-- the initialization end up talking to different (global) variables.
local csletters  = characters.csletters -- also a signal that we have initialized
local activated  = { }
local sfstate    = "unset" -- unset, traditional, normal
local blocks_too = false

directives.register("characters.blockstoo",function(v) blocks_too = v end)

-- If this is something that is not documentwide and used a lot, then we
-- need a more clever approach (trivial but not now).

-- Remembers the requested mode v. When a mode had already been chosen, the
-- sf code of every uppercase letter (category "lu") is set to n as well; in
-- the unset state that is left to the initialization loop below.
local function setuppersfcodes(v,n)
    if sfstate ~= "unset" then
        report_defining("setting uppercase sf codes to %a",n)
        for u, chr in next, data do
            if chr.category == "lu" then
                texsetsfcode(u,n)
            end
        end
    end
    sfstate = v
end

directives.register("characters.spaceafteruppercase",function(v)
    if v == "traditional" then
        setuppersfcodes(v,999)
    elseif v == "normal" then
        setuppersfcodes(v,1000)
    end
end)

if not csletters then

    -- No stored letter table is present, so the letters, their lc/uc codes and
    -- (in traditional mode) the sf codes are set up here.

    csletters            = allocate()
    characters.csletters = csletters

    report_defining("setting up character related codes and commands")

    if sfstate == "unset" then
        sfstate = "traditional"
    end

    local traditional = sfstate == "traditional"

    -- Sets the lc and uc code of letter u. A missing code is replaced by the
    -- slot itself (and stored in the character data), a code that is a table
    -- is replaced by the slot for this call only. In traditional mode an
    -- uppercase letter also gets sf code 999.
    local function setcasecodes(u,chr,category)
        local lc = chr.lccode
        local uc = chr.uccode
        if not lc then
            chr.lccode = u
            lc = u
        elseif type(lc) == "table" then
            lc = u
        end
        if not uc then
            chr.uccode = u
            uc = u
        elseif type(uc) == "table" then
            uc = u
        end
        texsetlccode(u,lc,uc)
        if traditional and category == "lu" then
            texsetsfcode(u,999)
        end
    end

    for u, chr in next, data do -- will move up
        local contextname = chr.contextname
        local category    = chr.category
        local isletter    = is_letter[category]
        if contextname then
            if is_character[category] then
                if chr.unicodeslot < 128 then
                    if isletter then
                        csletters[utfchar(u)] = u
                    end
                elseif isletter and u >= 32 and u <= 65536 then
                    csletters[utfchar(u)] = u
                end
                if isletter then
                    setcasecodes(u,chr,category)
                end
            elseif is_command[category] and not forbidden[u] then
                -- skip
            elseif is_mark[category] then
                texsetlccode(u,u,u) -- for hyphenation
            end
        elseif isletter then
            csletters[utfchar(u)] = u
            setcasecodes(u,chr,category)
        elseif is_mark[category] then
            texsetlccode(u,u,u) -- for hyphenation
        end
    end

    if blocks_too then
        -- this slows down format generation by over 10 percent
        for k, v in next, blocks do
            if v.catcode == "letter" then
                local first = v.first
                local last  = v.last
                local gaps  = v.gaps
                if first and last then
                    for u=first,last do
                        csletters[utfchar(u)] = u
                        --
                     -- texsetlccode(u,u,u) -- self self
                        --
                    end
                end
                if gaps then
                    for i=1,#gaps do
                        local u = gaps[i]
                        csletters[utfchar(u)] = u
                        --
                     -- texsetlccode(u,u,u) -- self self
                        --
                    end
                end
            end
        end
    end

    if storage then
        storage.register("characters/csletters", csletters, "characters.csletters")
    end

    -- Defines the named characters at the TeX end. A character below slot 128
    -- that is not a letter is set as a char, all others are set as a macro
    -- holding the utf character. A character of a command category is only
    -- defined when its slot is not in the forbidden set. The ctt argument is
    -- not used.
    function characters.setcharacternames(ctt)
        for u, chr in next, data do -- will move up
            local contextname = chr.contextname
            local category    = chr.category
            local isletter    = is_letter[category]
            if contextname then
                if is_character[category] then
                    if chr.unicodeslot < 128 then
                        if isletter then
                            texsetmacro(contextname,utfchar(u),"immutable")
                        else
                            texsetchar(contextname,u,"immutable")
                        end
                    else
                        texsetmacro(contextname,utfchar(u),"immutable")
                    end
                elseif is_command[category] and not forbidden[u] then
                    texsetmacro(contextname,utfchar(u),"immutable")
                end
            end
        end
    end

else
    mark(csletters)
end

lpegpatterns.csletter = utfchartabletopattern(csletters)
683
684-- todo: get rid of activated
685-- todo: move first loop out ,merge with above
686
-- Gives all known letters (plus the non-joiner and the joiner) catcode 11 in
-- catcode table cct. The catcode table that was active on entry is restored
-- afterwards.
function characters.setlettercatcodes(cct)
    if trace_defining then
        report_defining("assigning letter catcodes to catcode table %a",cct)
    end
    local lettercatcode = 11
    local previous      = tex.catcodetable
    tex.catcodetable    = cct
    texsetcatcode(0x200C,lettercatcode) -- non-joiner
    texsetcatcode(0x200D,lettercatcode) -- joiner
    for _, slot in next, csletters do
        texsetcatcode(slot,lettercatcode)
    end
 -- for u, chr in next, data do
 --     if not chr.fallback and is_letter[chr.category] and u >= 32 and u <= 65536 then
 --         texsetcatcode(u,11)
 --     end
 --     local range = chr.range
 --     if range then
 --         for i=1,range.first,range.last do -- tricky as not all are letters
 --             texsetcatcode(i,11)
 --         end
 --     end
 -- end
 -- for k, v in next, blocks do
 --     if v.catcode == "letter" then
 --         for u=v.first,v.last do
 --             texsetcatcode(u,11)
 --         end
 --     end
 -- end
    tex.catcodetable = previous
end
718
-- Gives every slot in the 'activated' list catcode 13 in catcode table cct
-- and reports each one when tracing is on. The catcode table that was active
-- on entry is restored afterwards.
function characters.setactivecatcodes(cct)
    local previous   = tex.catcodetable
    tex.catcodetable = cct
    for index=1,#activated do
        local slot = activated[index]
        texsetcatcode(slot,13)
        if trace_defining then
            local description = data[slot].description
            report_defining("character %U (%s) is active in set %a",slot,description,cct)
        end
    end
    tex.catcodetable = previous
end
731
732-- -- Setting the lccodes is also done in a loop over the data table.
733
734-- function characters.setcodes() -- we could loop over csletters
735--     if trace_defining then
736--         report_defining("defining lc and uc codes")
737--     end
738--     local traditional = sfstate == "traditional" or sfstate == "unset"
739--     for code, chr in next, data do
740--         local cc = chr.category
741--         if is_letter[cc] then
742--             local range = chr.range
743--             if range then
744--                 for i=range.first,range.last do
745--                     texsetlccode(i,i,i) -- self self
746--                 end
747--             else
748--                 local lc, uc = chr.lccode, chr.uccode
749--                 if not lc then
750--                     chr.lccode, lc = code, code
751--                 elseif type(lc) == "table" then
752--                     lc = code
753--                 end
754--                 if not uc then
755--                     chr.uccode, uc = code, code
756--                 elseif type(uc) == "table" then
757--                     uc = code
758--                 end
759--                 texsetlccode(code,lc,uc)
760--                 if traditional and cc == "lu" then
761--                     texsetsfcode(code,999)
762--                 end
763--             end
764--         elseif is_mark[cc] then
765--             texsetlccode(code,code,code) -- for hyphenation
766--         end
767--     end
768--     if traditional then
769--         sfstate = "traditional"
770--     end
771-- end
772
773-- tex
774
-- "chardescription" takes an integer slot and prints the description of that
-- character; nothing is printed for a slot without character data.
implement {
    name      = "chardescription",
    arguments = "integer",
    actions   = function(slot)
        local entry = data[slot]
        if not entry then
            return
        end
        context(entry.description)
    end,
}
785
786-- xml
787
characters.activeoffset = 0x10000 -- remapping will take place into the range that starts here

-- Prints a group in which the character at 'slot' gets catcode 13 and is
-- globally defined to expand to the string form of chr.
function commands.remapentity(chr,slot) -- not used
    local character  = utfchar(slot)
    local definition = format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,character,chr)
    contextsprint(definition)
end
793
794-- xml.entities = xml.entities or { }
795--
796-- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml
797--
798-- function characters.setmkiventities()
799--     local entities = xml.entities
800--     entities.lt  = "<"
801--     entities.amp = "&"
802--     entities.gt  = ">"
803-- end
804--
805-- function characters.setmkiientities()
806--     local entities = xml.entities
807--     entities.lt  = utfchar(characters.activeoffset + utfbyte("<"))
808--     entities.amp = utfchar(characters.activeoffset + utfbyte("&"))
809--     entities.gt  = utfchar(characters.activeoffset + utfbyte(">"))
810-- end
811
-- characters.setcharacternames is only defined when the letter table had to be
-- built in this run, so its presence is used as the condition for registering
-- these three private commands. Each one takes a single integer argument.
if characters.setcharacternames then -- only in ini mode

    implement { name = "setlettercatcodes", scope = "private", actions = characters.setlettercatcodes, arguments = "integer" }
    implement { name = "setactivecatcodes", scope = "private", actions = characters.setactivecatcodes, arguments = "integer" }
    implement { name = "setcharacternames", scope = "private", actions = characters.setcharacternames, arguments = "integer" }

end
819
820-- experiment (some can move to char-ini.lua)
821
-- Overloads a case code of a character.
--
--   c     : the character slot, as number or as string holding a number
--   u     : a list of replacement slots, parsed by settings_to_array
--   code  : the field in the character data that is set ("uccode", "lccode")
--   codes : the name of the table in 'characters' whose entry for the slot is
--           cleared ("uccodes", "lccodes")
--
-- One replacement is stored as a number, more than one as a table of numbers.
-- Nothing happens when c is not a number, when the list is empty, when a
-- single replacement is not a number or when there is no data for the slot.
local function overload(c,u,code,codes)
    local slot = tonumber(c)
    if not slot then
        return
    end
    local list = utilities.parsers.settings_to_array(u)
    local n    = #list
    if n == 0 then
        return
    end
    local t = nil
    if n == 1 then
        t = tonumber(list[1])
    else
        t = { }
        for i=1,n do
            -- entries that are not a number are skipped
            t[#t+1] = tonumber(list[i])
        end
    end
    if t then
        local entry = data[slot]
        if entry then -- a slot without data can't be overloaded
            entry[code] = t
            characters[codes][slot] = nil
        end
    end
end
846
-- Two commands that take two string arguments each: the character slot and
-- the list of replacement slots. Both call overload, one for the uppercase
-- and one for the lowercase code.
interfaces.implement {
    name      = "overloaduppercase",
    arguments = "2 strings",
    actions   = function(c,u)
        overload(c,u,"uccode","uccodes")
    end
}

interfaces.implement {
    name      = "overloadlowercase",
    arguments = "2 strings",
    actions   = function(c,u)
        overload(c,u,"lccode","lccodes")
    end
}
862