char-tex.lua /size: 25 Kb    last modification: 2023-12-21 09:44
1if not modules then modules = { } end modules ['char-tex'] = {
2    version   = 1.001,
3    comment   = "companion to char-ini.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9local lpeg = lpeg
10local tonumber, next, type = tonumber, next, type
11local format, find, gmatch, match = string.format, string.find, string.gmatch, string.match
12local utfchar, utfbyte = utf.char, utf.byte
13local concat, tohash = table.concat, table.tohash
14local P, C, R, S, V, Cs, Cc = lpeg.P, lpeg.C, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs, lpeg.Cc
15
16local lpegpatterns          = lpeg.patterns
17local lpegmatch             = lpeg.match
18local utfchartabletopattern = lpeg.utfchartabletopattern
19
20local allocate              = utilities.storage.allocate
21local mark                  = utilities.storage.mark
22
23local context               = context
24local commands              = commands
25
26local characters            = characters
27local texcharacters         = { }
28characters.tex              = texcharacters
29local utffilters            = characters.filters.utf
30
31local is_character          = characters.is_character
32local is_letter             = characters.is_letter
33local is_command            = characters.is_command
34local is_spacing            = characters.is_spacing
35local is_mark               = characters.is_mark
36local is_punctuation        = characters.is_punctuation
37
38local data                  = characters.data  if not data then return end
39local blocks                = characters.blocks
40
-- The tracker has to toggle the local flag, as that is what the reporting code
-- in this file tests.
local trace_defining        = false  trackers.register("characters.defining", function(v) trace_defining = v end)
42
43local report_defining       = logs.reporter("characters")
44
45-- In order to deal with 8-bit output, we need to find a way to go from UTF to
46-- 8-bit. This is handled in the 32 bit engine itself. This leaves us problems with
47-- characters that are specific to TeX, like curly braces and dollars. We can remap
48-- some chars that tex input files are sensitive for to a private area (while
-- writing to a utility file) and revert them to their original slot when we read in
50-- such a file. Instead of reverting, we can (when we resolve characters to glyphs)
51-- map them to their right glyph there. For this purpose we can use the private
52-- planes 0x0F0000 and 0x100000.
53
-- Three lookup tables, all keyed per special character:
--
--   escapes : character         -> backslash followed by the character
--   low     : character         -> its stand-in at 0x0F0000 plus the byte value
--   high    : private stand-in  -> character (the way back)

local low     = allocate()
local high    = allocate()
local escapes = allocate()
local special = "~#$%^&_{}\\|" -- "~#$%{}\\|"

local private = {
    low     = low,
    high    = high,
    escapes = escapes,
}

utffilters.private = private

for ch in gmatch(special,".") do
    local cb
    if type(ch) ~= "number" then
        cb = utfbyte(ch)
    else
        cb, ch = ch, utfchar(ch)
    end
    if cb < 256 then
        local hidden = utfchar(0x0F0000 + cb)
        escapes[ch] = "\\" .. ch
        low[ch]     = hidden
        -- nasty, but we need this as in replacements (also in lpeg) % is interpreted
        high[hidden] = (ch == "%") and "%%" or ch
    end
end
83
-- Both replacers are built once from the tables as they are at this point
-- (frozen, only for basic tex).

local tohigh = lpeg.replacer(low)
local tolow  = lpeg.replacer(high)

lpegpatterns.utftohigh, lpegpatterns.utftolow = tohigh, tolow

-- Runs the low -> high replacer over the given string.

utffilters.harden = function(str)
    return lpegmatch(tohigh,str)
end

-- Runs the high -> low replacer over the given string.

utffilters.soften = function(str)
    return lpegmatch(tolow,str)
end

-- The same three tables, this time wrapped by utf.remapper.

private.escape  = utf.remapper(escapes) -- maybe: ,"dynamic"
private.replace = utf.remapper(low)     -- maybe: ,"dynamic"
private.revert  = utf.remapper(high)    -- maybe: ,"dynamic"
101
-- Per accent command (the character that follows the backslash) a table that
-- maps a base character onto the composed character. The empty key holds the
-- accent on its own. The dotless i is accepted both as character and as \i.
--
-- Every value has to be a real character: an empty string would make the
-- accented letter vanish in the conversions and definitions below.

local accentmapping = allocate {
    ['"'] = { [""] = "¨",
        A = "Ä", a = "ä",
        E = "Ë", e = "ë",
        I = "Ï", i = "ï", ["ı"] = "ï", ["\\i"] = "ï",
        O = "Ö", o = "ö",
        U = "Ü", u = "ü",
        Y = "Ÿ", y = "ÿ",
    },
    ["'"] = { [""] = "´",
        A = "Á", a = "á",
        C = "Ć", c = "ć",
        E = "É", e = "é",
        I = "Í", i = "í", ["ı"] = "í", ["\\i"] = "í",
        L = "Ĺ", l = "ĺ",
        N = "Ń", n = "ń",
        O = "Ó", o = "ó",
        R = "Ŕ", r = "ŕ",
        S = "Ś", s = "ś",
        U = "Ú", u = "ú",
        Y = "Ý", y = "ý",
        Z = "Ź", z = "ź",
    },
    ["."] = { [""] = "˙",
        C = "Ċ", c = "ċ",
        E = "Ė", e = "ė",
        G = "Ġ", g = "ġ",
        I = "İ", i = "i", ["ı"] = "i", ["\\i"] = "i",
        Z = "Ż", z = "ż",
    },
    ["="] = { [""] = "¯",
        A = "Ā", a = "ā",
        E = "Ē", e = "ē",
        I = "Ī", i = "ī", ["ı"] = "ī", ["\\i"] = "ī",
        O = "Ō", o = "ō",
        U = "Ū", u = "ū",
    },
    ["H"] = { [""] = "˝",
        O = "Ő", o = "ő",
        U = "Ű", u = "ű",
    },
    ["^"] = { [""] = "ˆ",
        A = "Â", a = "â",
        C = "Ĉ", c = "ĉ",
        E = "Ê", e = "ê",
        G = "Ĝ", g = "ĝ",
        H = "Ĥ", h = "ĥ",
        I = "Î", i = "î", ["ı"] = "î", ["\\i"] = "î",
        J = "Ĵ", j = "ĵ",
        O = "Ô", o = "ô",
        S = "Ŝ", s = "ŝ",
        U = "Û", u = "û",
        W = "Ŵ", w = "ŵ",
        Y = "Ŷ", y = "ŷ",
    },
    ["`"] = { [""] = "`",
        A = "À", a = "à",
        E = "È", e = "è",
        I = "Ì", i = "ì", ["ı"] = "ì", ["\\i"] = "ì",
        O = "Ò", o = "ò",
        U = "Ù", u = "ù",
        Y = "Ỳ", y = "ỳ",
    },
    ["c"] = { [""] = "¸",
        C = "Ç", c = "ç",
        K = "Ķ", k = "ķ",
        L = "Ļ", l = "ļ",
        N = "Ņ", n = "ņ",
        R = "Ŗ", r = "ŗ",
        S = "Ş", s = "ş",
        T = "Ţ", t = "ţ",
    },
    ["k"] = { [""] = "˛",
        A = "Ą", a = "ą",
        E = "Ę", e = "ę",
        I = "Į", i = "į",
        U = "Ų", u = "ų",
    },
    ["r"] = { [""] = "˚",
        A = "Å", a = "å",
        U = "Ů", u = "ů",
    },
    ["u"] = { [""] = "˘",
        A = "Ă", a = "ă",
        E = "Ĕ", e = "ĕ",
        G = "Ğ", g = "ğ",
        I = "Ĭ", i = "ĭ", ["ı"] = "ĭ", ["\\i"] = "ĭ",
        O = "Ŏ", o = "ŏ",
        U = "Ŭ", u = "ŭ",
    },
    ["v"] = { [""] = "ˇ",
        C = "Č", c = "č",
        D = "Ď", d = "ď",
        E = "Ě", e = "ě",
        L = "Ľ", l = "ľ",
        N = "Ň", n = "ň",
        R = "Ř", r = "ř",
        S = "Š", s = "š",
        T = "Ť", t = "ť",
        Z = "Ž", z = "ž",
    },
    ["~"] = { [""] = "˜",
        A = "Ã", a = "ã",
        I = "Ĩ", i = "ĩ", ["ı"] = "ĩ", ["\\i"] = "ĩ",
        N = "Ñ", n = "ñ",
        O = "Õ", o = "õ",
        U = "Ũ", u = "ũ",
    },
}

texcharacters.accentmapping = accentmapping
213
-- Accent command character mapped onto the matching combining mark. The active
-- code in this file does not consult this table; only the disabled fallback in
-- remap_accent refers to it. The trailing comments show each mark alone and on
-- an E; the marks that have no entry yet are listed for reference.
local accent_map = allocate { -- incomplete
   ['~'] = "̃" , --  ̃ Ẽ
   ['"'] = "̈" , --  ̈ Ë
   ["`"] = "̀" , --  ̀ È
   ["'"] = "́" , --  ́ É
   ["^"] = "̂" , --  ̂ Ê
    --  ̄ Ē
    --  ̆ Ĕ
    --  ̇ Ė
    --  ̉ Ẻ
    --  ̌ Ě
    --  ̏ Ȅ
    --  ̑ Ȇ
    --  ̣ Ẹ
    --  ̧ Ȩ
    --  ̨ Ę
    --  ̭ Ḙ
    --  ̰ Ḛ
}
233
234-- local accents = concat(table.keys(accentmapping)) -- was _map
235
-- Resolves accent command 'a' applied to 'c'. A known combination gives the
-- composed character from accentmapping; anything else is handed back as TeX
-- input again, with braces when 'braced' is set and with a space otherwise.
--
-- A fallback that appends the combining mark from accent_map is disabled:
--
--     local m = accent_map[a]
--     if m then
--         return c .. m
--     elseif braced then -- or #c > 0

local function remap_accent(a,c,braced)
    local group    = accentmapping[a]
    local composed = group and group[c]
    if composed then
        return composed
    end
    if braced then -- or #c > 0
        return "\\" .. a .. "{" .. c .. "}"
    end
    return "\\" .. a .. " " .. c
end
254
-- Control sequence name (without backslash) mapped onto the character it
-- stands for. Every value has to be non-empty, otherwise the command makes its
-- character disappear in the conversions and definitions below.

local commandmapping = allocate {
    ["aa"] = "å", ["AA"] = "Å",
    ["ae"] = "æ", ["AE"] = "Æ",
    ["cc"] = "ç", ["CC"] = "Ç",
    ["i"]  = "ı", ["j"]  = "ȷ",
    ["ij"] = "ij", ["IJ"] = "IJ",
    ["l"]  = "ł", ["L"]  = "Ł",
    ["o"]  = "ø", ["O"]  = "Ø",
    ["oe"] = "œ", ["OE"] = "Œ",
    ["sz"] = "ß", ["SZ"] = "SZ", ["ss"] = "ß", ["SS"] = "ß", -- uppercase: ẞ
}

texcharacters.commandmapping = commandmapping
268
-- The input ligatures of TeX mapped onto the characters they produce: double
-- quotes and the en and em dash. Empty values would make the converter delete
-- these sequences from the text.

local ligaturemapping = allocate {
    ["''"]  = "”",
    ["``"]  = "“",
    ["--"]  = "–",
    ["---"] = "—",
}
275
276-- local achar    = R("az","AZ") + P("ı") + P("\\i")
277--
278-- local spaces   = P(" ")^0
279-- local no_l     = P("{") / ""
280-- local no_r     = P("}") / ""
281-- local no_b     = P('\\') / ""
282--
283-- local lUr      = P("{") * C(achar) * P("}")
284--
285-- local accents_1 = [["'.=^`~]]
286-- local accents_2 = [[Hckruv]]
287--
288-- local accent   = P('\\') * (
289--     C(S(accents_1)) * (lUr * Cc(true) + C(achar) * Cc(false)) + -- we need achar for ı etc, could be sped up
290--     C(S(accents_2)) *  lUr * Cc(true)
291-- ) / remap_accent
292--
293-- local csname  = P('\\') * C(R("az","AZ")^1)
294--
295-- local command  = (
296--     csname +
297--     P("{") * csname * spaces * P("}")
298-- ) / commandmapping -- remap_commands
299--
300-- local both_1 = Cs { "run",
301--     accent  = accent,
302--     command = command,
303--     run     = (V("accent") + no_l * V("accent") * no_r + V("command") + P(1))^0,
304-- }
305--
306-- local both_2 = Cs { "run",
307--     accent  = accent,
308--     command = command,
309--     run     = (V("accent") + V("command") + no_l * ( V("accent") + V("command") ) * no_r + P(1))^0,
310-- }
311--
312-- function texcharacters.toutf(str,strip)
313--     if not find(str,"\\",1,true) then
314--         return str
315--     elseif strip then
316--         return lpegmatch(both_1,str)
317--     else
318--         return lpegmatch(both_2,str)
319--     end
320-- end
321
local untex

-- Builds (once) and returns the pattern that replaces TeX input sequences by
-- characters. Per accent and base character four spellings are registered:
--
--     \"e   {\"e}   \"{e}   {\"{e}}
--
-- where an accent that is a letter gets a space between command and character
-- in the first two. Commands are registered as "\name ", "{\name}" and
-- "{\name }" and ligatures as they are.

local function toutfpattern()
    if untex then
        return untex
    end
    local hash = { }
    for accent, group in next, accentmapping do
        local isletter = (accent >= "a" and accent <= "z") or (accent >= "A" and accent <= "Z")
        local plain    = isletter and ("\\" .. accent .. " ") or ("\\" .. accent)
        local braced   = "\\" .. accent .. "{"
        for character, result in next, group do
            hash[       plain  .. character        ] = result
            hash["{" .. plain  .. character ..  "}"] = result
            hash[       braced .. character ..  "}"] = result
            hash["{" .. braced .. character .. "}}"] = result
        end
    end
    for name, result in next, commandmapping do
        local command = "\\" .. name
        hash[       command .. " " ] = result
        hash["{" .. command .. "}" ] = result
        hash["{" .. command .. " }"] = result
    end
    for sequence, result in next, ligaturemapping do
        hash[sequence] = result
    end
    untex = utfchartabletopattern(hash) / hash
    return untex
end

texcharacters.toutfpattern = toutfpattern
354
local pattern = nil

-- Wraps the replacement pattern so that it runs over a whole string; the
-- result is kept for the next call.

local function prepare()
    pattern = Cs((toutfpattern() + P(1))^0)
    return pattern
end

-- Converts TeX accent and character commands in 'str' into characters. A
-- string that is empty or has no backslash is returned untouched. The 'strip'
-- argument is accepted but not used.

function texcharacters.toutf(str,strip)
    if str == "" or not find(str,"\\",1,true) then
        return str
    end
 -- if strip then
    return lpegmatch(pattern or prepare(),str)
end
372
373-- print(texcharacters.toutf([[\~{Z}]],true))
374-- print(texcharacters.toutf([[\'\i]],true))
375-- print(texcharacters.toutf([[\'{\i}]],true))
376-- print(texcharacters.toutf([[\"{e}]],true))
377-- print(texcharacters.toutf([[\" {e}]],true))
378-- print(texcharacters.toutf([[{\"{e}}]],true))
379-- print(texcharacters.toutf([[{\" {e}}]],true))
380-- print(texcharacters.toutf([[{\l}]],true))
381-- print(texcharacters.toutf([[{\l }]],true))
382-- print(texcharacters.toutf([[\v{r}]],true))
383-- print(texcharacters.toutf([[fo{\"o}{\ss}ar]],true))
384-- print(texcharacters.toutf([[H{\'a}n Th\^e\llap{\raise 0.5ex\hbox{\'{\relax}}} Th{\'a}nh]],true))
385
-- Gives the command (backslash plus context name) for slot 'n' when the
-- character data has a name for it, and the character itself otherwise.

function texcharacters.safechar(n) -- was characters.safechar
    local c    = data[n]
    local name = c and c.contextname
    if name then
        return "\\" .. name
    end
    return utfchar(n)
end
394
395if not context or not commands then
396    -- used in e.g. mtx-bibtex
397    return
398end
399
400-- all kind of initializations
401
402if not interfaces then return end
403
404local implement     = interfaces.implement
405
406local tex           = tex
407local texsetlccode  = tex.setlccode
408local texsetsfcode  = tex.setsfcode
409local texsetcatcode = tex.setcatcode
410
411local contextsprint = context.sprint
412local ctxcatcodes   = catcodes.numbers.ctxcatcodes
413
414local texsetmacro   = tokens.setters.macro
415local texsetchar    = tokens.setters.char
416
-- Hands the accent and command tables to the TeX end: per accent first the
-- accent command itself and then each of its combinations, followed by all
-- the character commands.

function texcharacters.defineaccents()
    local defineaccentcommand = context.dodefineaccentcommand
    local defineaccent        = context.dodefineaccent
    local definecommand       = context.dodefinecommand
    for accent, group in next, accentmapping do
        defineaccentcommand(accent)
        for character, mapping in next, group do
            defineaccent(accent,character,mapping)
        end
    end
    for command, mapping in next, commandmapping do
        definecommand(command,mapping)
    end
end

implement { -- a waste of scanner but consistent
    actions = texcharacters.defineaccents,
    name    = "defineaccents",
}
436
437-- Instead of using a TeX file to define the named glyphs, we use the table. After
438-- all, we have this information available anyway.
439
-- Prints, under the ctx catcodes, TeX code that gives slot 'n' catcode 13 and
-- defines that character as \name.

function commands.makeactive(n,name) -- not used
    local definition = format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)
    contextsprint(ctxcatcodes,definition)
 -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)
end
444
-- Converts a string into a number. Anything that tonumber accepts is taken as
-- it is; otherwise a leading double quote marks a hexadecimal number in TeX
-- notation. Everything else, including a quote that is followed by invalid
-- digits, gives 0.

local function to_number(s)
    local n = tonumber(s)
    if n then
        return n
    end
    -- without a leading quote there is no capture, and tonumber raises an
    -- error on nil when a base is given, so we test before we convert
    local hex = match(s,'^"(.*)$')
    return hex and tonumber(hex,16) or 0
end
452
-- Both get a string that is turned into a number first; the result goes
-- through utfchar or safechar and is then printed.

implement {
    name      = "utfchar",
    arguments = "string",
    actions   = { to_number, utfchar, contextsprint },
}

implement {
    name      = "safechar",
    arguments = "string",
    actions   = { to_number, texcharacters.safechar, contextsprint },
}

-- Here the slot comes as two integers and is calculated as h*256+l.

implement {
    name      = "uchar",
    arguments = { "integer", "integer" },
    actions   = function(h,l)
        local slot = h*256 + l
        context(utfchar(slot))
    end,
}

tex.uprint = commands.utfchar
474
-- in context we don't use lc and uc codes (in fact in luatex we should have a hf code)
476-- so at some point we might drop this
477
478-- The following get set at the TeX end:
479
-- A set of slots, built with table.tohash. A command-category character that
-- is in this set gets no named macro; see the forbidden[u] tests further down.
-- Each line comment gives a two-letter category, a name and what looks like
-- the definition at the TeX end (<self> presumably means the character itself).
-- The commented-out entries are not part of the set.
local forbidden = tohash {
    0x000A0, -- zs nobreakspace            <self>
    0x000AD, -- cf softhyphen              <self>
 -- 0x00600, -- cf arabicnumber            <self>
 -- 0x00601, -- cf arabicsanah             <self>
 -- 0x00602, -- cf arabicfootnotemarker    <self>
 -- 0x00603, -- cf arabicsafha             <self>
 -- 0x00604, -- cf arabicsamvat            <self>
 -- 0x00605, -- cf arabicnumberabove       <self>
 -- 0x0061C, -- cf arabiclettermark        <self>
 -- 0x006DD, -- cf arabicendofayah         <self>
 -- 0x008E2, -- cf arabicdisputedendofayah <self>
    0x02000, -- zs enquad                  <self>
    0x02001, -- zs emquad                  <self>
    0x02002, -- zs enspace                 \kern .5\emwidth
    0x02003, -- zs emspace                 \hskip \emwidth
    0x02004, -- zs threeperemspace         <self>
    0x02005, -- zs fourperemspace          <self>
    0x02006, -- zs sixperemspace           <self>
    0x02007, -- zs figurespace             <self>
    0x02008, -- zs punctuationspace        <self>
    0x02009, -- zs breakablethinspace      <self>
    0x0200A, -- zs hairspace               <self>
    0x0200B, -- cf zerowidthspace          <self>
    0x0200C, -- cf zwnj                    <self>
    0x0200D, -- cf zwj                     <self>
    0x0202F, -- zs narrownobreakspace      <self>
    0x0205F, -- zs medspace                \textormathspace +\medmuskip 2
 -- 0x03000, -- zs ideographicspace        <self>
 -- 0x0FEFF, -- cf zerowidthnobreakspace   \penalty \plustenthousand \kern \zeropoint
}
511
local csletters = characters.csletters -- also a signal that we have initialized
local activated = { }
local sfstate   = "unset" -- unset, traditional, normal
local block_too = false

directives.register("characters.blockstoo",function(v) block_too = v end)

-- If this is something that is not documentwide and used a lot, then we
-- need a more clever approach (trivial but not now).

-- Records the requested space factor mode 'v'. As long as the state is still
-- "unset" the request is only remembered (the setup code below picks it up);
-- after that sf code 'n' is assigned to all uppercase letters right away.

local function setuppersfcodes(v,n)
    if sfstate ~= "unset" then
        report_defining("setting uppercase sf codes to %a",n)
        for u, chr in next, data do
            if chr.category == "lu" then
                texsetsfcode(u,n)
            end
        end
    end
    sfstate = v
end

directives.register("characters.spaceafteruppercase",function(v)
    if v == "traditional" then
        setuppersfcodes(v,999)
    elseif v == "normal" then
        setuppersfcodes(v,1000)
    end
end)

if not csletters then

    csletters            = allocate()
    characters.csletters = csletters

    report_defining("setting up character related codes and commands")

    if sfstate == "unset" then
        sfstate = "traditional"
    end

    local traditional = sfstate == "traditional"

    -- Sets the lc and uc code of letter 'u', where a missing code (which then
    -- also gets stored) or a table of codes becomes the slot itself. In
    -- traditional mode an uppercase letter gets sf code 999 too.

    local function setlettercodes(u,chr,category)
        local lc = chr.lccode
        local uc = chr.uccode
        if not lc then
            chr.lccode = u
            lc = u
        elseif type(lc) == "table" then
            lc = u
        end
        if not uc then
            chr.uccode = u
            uc = u
        elseif type(uc) == "table" then
            uc = u
        end
        texsetlccode(u,lc,uc)
        if traditional and category == "lu" then
            texsetsfcode(u,999)
        end
    end

    for u, chr in next, data do -- will move up
        local contextname = chr.contextname
        local category    = chr.category
        local isletter    = is_letter[category]
        if contextname then
            if is_character[category] then
                if chr.unicodeslot < 128 then
                    if isletter then
                        csletters[utfchar(u)] = u
                    end
                else
                    if isletter and u >= 32 and u <= 65536 then
                        csletters[utfchar(u)] = u
                    end
                end
                if isletter then
                    setlettercodes(u,chr,category)
                end
            elseif is_command[category] and not forbidden[u] then
                -- skip
            elseif is_mark[category] then
                texsetlccode(u,u,u) -- for hyphenation
            end
        elseif isletter then
            csletters[utfchar(u)] = u
            setlettercodes(u,chr,category)
        elseif is_mark[category] then
            texsetlccode(u,u,u) -- for hyphenation
        end
    end

    if block_too then
        -- this slows down format generation by over 10 percent
        for k, v in next, blocks do
            if v.catcode == "letter" then
                local first = v.first
                local last  = v.last
                local gaps  = v.gaps
                if first and last then
                    for u=first,last do
                        csletters[utfchar(u)] = u
                        --
                     -- texsetlccode(u,u,u) -- self self
                        --
                    end
                end
                if gaps then
                    for i=1,#gaps do
                        local u = gaps[i]
                        csletters[utfchar(u)] = u
                        --
                     -- texsetlccode(u,u,u) -- self self
                        --
                    end
                end
            end
        end
    end

    if storage then
        storage.register("characters/csletters", csletters, "characters.csletters")
    end

    -- Defines the named characters at the TeX end. A character below slot 128
    -- that is not a letter is set with the char setter, all others become a
    -- macro that holds the character. The argument is not used.

    function characters.setcharacternames(ctt)
        for u, chr in next, data do -- will move up
            local contextname = chr.contextname
            local category    = chr.category
            local isletter    = is_letter[category]
            if contextname then
                if is_character[category] then
                    if chr.unicodeslot < 128 then
                        if isletter then
                            texsetmacro(contextname,utfchar(u),"immutable")
                        else
                            texsetchar(contextname,u,"immutable")
                        end
                    else
                        texsetmacro(contextname,utfchar(u),"immutable")
                    end
                elseif is_command[category] and not forbidden[u] then
                    texsetmacro(contextname,utfchar(u),"immutable")
                end
            end
        end
    end

else
    mark(csletters)
    -- The setup code above did not run, so a request that was only recorded
    -- would never be applied: we do that now. In any case we leave the unset
    -- state, so that later requests take effect immediately.
    if sfstate == "unset" then
        sfstate = "traditional" -- assumes the default of the setup code; to be confirmed
    else
        setuppersfcodes(sfstate,sfstate == "normal" and 1000 or 999)
    end
end

lpegpatterns.csletter = utfchartabletopattern(csletters)
681
682-- todo: get rid of activated
683-- todo: move first loop out ,merge with above
684
-- Gives, in catcode table 'cct', catcode 11 to the zero width (non-)joiner and
-- to every slot registered in csletters. The catcode table that was active on
-- entry is restored afterwards.

function characters.setlettercatcodes(cct)
    local letter = 11
    if trace_defining then
        report_defining("assigning letter catcodes to catcode table %a",cct)
    end
    local previous = tex.catcodetable
    tex.catcodetable = cct
    texsetcatcode(0x200C,letter) -- non-joiner
    texsetcatcode(0x200D,letter) -- joiner
    for _, slot in next, csletters do
        texsetcatcode(slot,letter)
    end
 -- for u, chr in next, data do
 --     if not chr.fallback and is_letter[chr.category] and u >= 32 and u <= 65536 then
 --         texsetcatcode(u,11)
 --     end
 --     local range = chr.range
 --     if range then
 --         for i=1,range.first,range.last do -- tricky as not all are letters
 --             texsetcatcode(i,11)
 --         end
 --     end
 -- end
 -- for k, v in next, blocks do
 --     if v.catcode == "letter" then
 --         for u=v.first,v.last do
 --             texsetcatcode(u,11)
 --         end
 --     end
 -- end
    tex.catcodetable = previous
end
716
-- Gives, in catcode table 'cct', catcode 13 to every slot in the activated
-- list, with a report per slot when tracing is on. The catcode table that was
-- active on entry is restored afterwards.

function characters.setactivecatcodes(cct)
    local active   = 13
    local previous = tex.catcodetable
    tex.catcodetable = cct
    for index=1,#activated do
        local slot = activated[index]
        texsetcatcode(slot,active)
        if trace_defining then
            report_defining("character %U (%s) is active in set %a",slot,data[slot].description,cct)
        end
    end
    tex.catcodetable = previous
end
729
730-- -- Setting the lccodes is also done in a loop over the data table.
731
732-- function characters.setcodes() -- we could loop over csletters
733--     if trace_defining then
734--         report_defining("defining lc and uc codes")
735--     end
736--     local traditional = sfstate == "traditional" or sfstate == "unset"
737--     for code, chr in next, data do
738--         local cc = chr.category
739--         if is_letter[cc] then
740--             local range = chr.range
741--             if range then
742--                 for i=range.first,range.last do
743--                     texsetlccode(i,i,i) -- self self
744--                 end
745--             else
746--                 local lc, uc = chr.lccode, chr.uccode
747--                 if not lc then
748--                     chr.lccode, lc = code, code
749--                 elseif type(lc) == "table" then
750--                     lc = code
751--                 end
752--                 if not uc then
753--                     chr.uccode, uc = code, code
754--                 elseif type(uc) == "table" then
755--                     uc = code
756--                 end
757--                 texsetlccode(code,lc,uc)
758--                 if traditional and cc == "lu" then
759--                     texsetsfcode(code,999)
760--                 end
761--             end
762--         elseif is_mark[cc] then
763--             texsetlccode(code,code,code) -- for hyphenation
764--         end
765--     end
766--     if traditional then
767--         sfstate = "traditional"
768--     end
769-- end
770
771-- tex
772
-- Passes the description of the given slot to context; a slot without data
-- gives nothing.

implement {
    name      = "chardescription",
    arguments = "integer",
    actions   = function(slot)
        local entry = data[slot]
        if not entry then
            return
        end
        context(entry.description)
    end,
}
783
784-- xml
785
characters.activeoffset = 0x10000 -- these will be remapped in that byte range

-- Prints a group in which the given slot gets catcode 13 and the character in
-- that slot is globally defined as the stringified 'chr'.

function commands.remapentity(chr,slot) -- not used
    local definition = format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr)
    contextsprint(definition)
end
791
792-- xml.entities = xml.entities or { }
793--
794-- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml
795--
796-- function characters.setmkiventities()
797--     local entities = xml.entities
798--     entities.lt  = "<"
799--     entities.amp = "&"
800--     entities.gt  = ">"
801-- end
802--
803-- function characters.setmkiientities()
804--     local entities = xml.entities
805--     entities.lt  = utfchar(characters.activeoffset + utfbyte("<"))
806--     entities.amp = utfchar(characters.activeoffset + utfbyte("&"))
807--     entities.gt  = utfchar(characters.activeoffset + utfbyte(">"))
808-- end
809
if characters.setcharacternames then -- only in ini mode

    -- three private commands that each take an integer and call the function
    -- with the same name in the characters namespace

    local names = { "setlettercatcodes", "setactivecatcodes", "setcharacternames" }

    for i=1,#names do
        local name = names[i]
        implement { name = name, scope = "private", actions = characters[name], arguments = "integer" }
    end

end
817
818-- experiment (some can move to char-ini.lua)
819
-- Replaces a case code of a character.
--
--   c     : the slot, as number or as string that converts into one
--   u     : comma separated list with the new code(s)
--   code  : the field to set in the character data ("uccode" or "lccode")
--   codes : the name of the table in characters whose entry for this slot is
--           wiped (presumably a cache of these codes)
--
-- One value is stored as a number and several as a table, where entries that
-- are no number are skipped. Nothing happens when the slot is no number, when
-- there is no data for it, or when no usable value is left.

local function overload(c,u,code,codes)
    local slot  = tonumber(c)
    local entry = slot and data[slot]
    if not entry then
        return
    end
    local list = utilities.parsers.settings_to_array(u)
    local n    = #list
    if n == 0 then
        return
    end
    local t = nil
    if n == 1 then
        t = tonumber(list[1])
    else
        t = { }
        for i=1,n do
            t[#t+1] = tonumber(list[i])
        end
        if #t == 0 then
            t = nil -- an empty table is not a valid set of codes
        end
    end
    if t then
        entry[code] = t
        local cached = characters[codes]
        if cached then
            cached[slot] = nil
        end
    end
end
844
-- Both commands take two strings, a slot and a list of codes, and differ only
-- in the field and table that they pass on to overload.

local function overloader(code,codes)
    return function(c,u)
        overload(c,u,code,codes)
    end
end

interfaces.implement {
    name      = "overloaduppercase",
    arguments = "2 strings",
    actions   = overloader("uccode","uccodes"),
}

interfaces.implement {
    name      = "overloadlowercase",
    arguments = "2 strings",
    actions   = overloader("lccode","lccodes"),
}
860