if not modules then modules = { } end modules ['char-tex'] = {
    version   = 1.001,
    comment   = "companion to char-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}

local lpeg = lpeg
local tonumber, next, type = tonumber, next, type
local format, find, gmatch, match, gsub = string.format, string.find, string.gmatch, string.match, string.gsub
local utfchar, utfbyte = utf.char, utf.byte
local concat, tohash = table.concat, table.tohash
local P, C, R, S, V, Cs, Cc = lpeg.P, lpeg.C, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs, lpeg.Cc

local lpegpatterns          = lpeg.patterns
local lpegmatch             = lpeg.match
local utfchartabletopattern = lpeg.utfchartabletopattern

local allocate              = utilities.storage.allocate
local mark                  = utilities.storage.mark

local context               = context
local commands              = commands

local characters            = characters
local texcharacters         = { }
characters.tex              = texcharacters
local utffilters            = characters.filters.utf

local allocate              = utilities.storage.allocate or function() return { } end
local mark                  = utilities.storage.mark     or allocate

local is_character          = characters.is_character
local is_letter             = characters.is_letter
local is_command            = characters.is_command
local is_spacing            = characters.is_spacing
local is_mark               = characters.is_mark
local is_punctuation        = characters.is_punctuation

local data                  = characters.data  if not data then return end
local blocks                = characters.blocks

local trace_defining        = false  trackers.register("characters.defining", function(v) trace_defining = v end)

local report_defining       = logs.reporter("characters")

--[[ldx--
<p>In order to deal with 8-bit output, we need to find a way to go from <l n='utf'/> to
8-bit. This is handled in the <l n='luatex'/> engine itself.</p>

<p>That leaves us with the characters that are specific to <l n='tex'/>, like
<type>{}</type>, <type>$</type> and the like. We can remap the characters that tex input
files are sensitive to into a private area (while writing to a utility file) and revert
them to their original slot when we read such a file back in. Instead of reverting, we
can (when we resolve characters to glyphs) map them onto the right glyph there. For this
purpose we can use the private planes 0x0F0000 and 0x100000.</p>
--ldx]]--

local low     = allocate()
local high    = allocate()
local escapes = allocate()
local special = "~#$%^&_{}\\|" -- "~#$%{}\\|"

local private = {
    low     = low,
    high    = high,
    escapes = escapes,
}

utffilters.private = private

for ch in gmatch(special,".") do
    local cb
    if type(ch) == "number" then
        cb, ch = ch, utfchar(ch)
    else
        cb = utfbyte(ch)
    end
    if cb < 256 then
        escapes[ch] = "\\" .. ch
        low[ch] = utfchar(0x0F0000 + cb)
        if ch == "%" then
            ch = "%%" -- nasty, but we need this as in replacements (also in lpeg) % is interpreted
        end
        high[utfchar(0x0F0000 + cb)] = ch
    end
end

local tohigh = lpeg.replacer(low)   -- frozen, only for basic tex
local tolow  = lpeg.replacer(high)  -- frozen, only for basic tex

lpegpatterns.utftohigh = tohigh
lpegpatterns.utftolow  = tolow

function utffilters.harden(str)
    return lpegmatch(tohigh,str)
end

function utffilters.soften(str)
    return lpegmatch(tolow,str)
end

private.escape  = utf.remapper(escapes) -- maybe: ,"dynamic"
private.replace = utf.remapper(low)     -- maybe: ,"dynamic"
private.revert  = utf.remapper(high)    -- maybe: ,"dynamic"
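
-- A usage sketch (kept commented, like the examples elsewhere in this file):
-- hardening moves the tex sensitive characters into the 0x0F0000 private plane
-- and softening maps them back, so a roundtrip is expected to be the identity.
--
-- local hard = utffilters.harden("10% of $x$") -- "%" and "$" end up in the private plane
-- local soft = utffilters.soften(hard)         -- back to "10% of $x$"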

--[[ldx--
<p>We get a more efficient variant of this when we integrate replacements in the
collapser. This more or less renders the previous private code redundant. The
following lines are equivalent, but the first snippet uses the relocated
dollars.</p>

<typing>
[󰀤x󰀤] [$x$]
</typing>
--ldx]]--

-- Using the tree-lpeg-mapper would be nice but we also need to deal with end-of-string
-- cases like "\"\i", and we don't want "\relax" to be seen as "\r" followed by "elax"
-- (for which we would need to mess with spaces).

local accentmapping = allocate {
    ['"'] = { [""] = "¨",
        A = "Ä", a = "ä",
        E = "Ë", e = "ë",
        I = "Ï", i = "ï", ["ı"] = "ï", ["\\i"] = "ï",
        O = "Ö", o = "ö",
        U = "Ü", u = "ü",
        Y = "Ÿ", y = "ÿ",
    },
    ["'"] = { [""] = "´",
        A = "Á", a = "á",
        C = "Ć", c = "ć",
        E = "É", e = "é",
        I = "Í", i = "í", ["ı"] = "í", ["\\i"] = "í",
        L = "Ĺ", l = "ĺ",
        N = "Ń", n = "ń",
        O = "Ó", o = "ó",
        R = "Ŕ", r = "ŕ",
        S = "Ś", s = "ś",
        U = "Ú", u = "ú",
        Y = "Ý", y = "ý",
        Z = "Ź", z = "ź",
    },
    ["."] = { [""] = "˙",
        C = "Ċ", c = "ċ",
        E = "Ė", e = "ė",
        G = "Ġ", g = "ġ",
        I = "İ", i = "i", ["ı"] = "i", ["\\i"] = "i",
        Z = "Ż", z = "ż",
    },
    ["="] = { [""] = "¯",
        A = "Ā", a = "ā",
        E = "Ē", e = "ē",
        I = "Ī", i = "ī", ["ı"] = "ī", ["\\i"] = "ī",
        O = "Ō", o = "ō",
        U = "Ū", u = "ū",
    },
    ["H"] = { [""] = "˝",
        O = "Ő", o = "ő",
        U = "Ű", u = "ű",
    },
    ["^"] = { [""] = "ˆ",
        A = "Â", a = "â",
        C = "Ĉ", c = "ĉ",
        E = "Ê", e = "ê",
        G = "Ĝ", g = "ĝ",
        H = "Ĥ", h = "ĥ",
        I = "Î", i = "î", ["ı"] = "î", ["\\i"] = "î",
        J = "Ĵ", j = "ĵ",
        O = "Ô", o = "ô",
        S = "Ŝ", s = "ŝ",
        U = "Û", u = "û",
        W = "Ŵ", w = "ŵ",
        Y = "Ŷ", y = "ŷ",
    },
    ["`"] = { [""] = "`",
        A = "À", a = "à",
        E = "È", e = "è",
        I = "Ì", i = "ì", ["ı"] = "ì", ["\\i"] = "ì",
        O = "Ò", o = "ò",
        U = "Ù", u = "ù",
        Y = "Ỳ", y = "ỳ",
    },
    ["c"] = { [""] = "¸",
        C = "Ç", c = "ç",
        K = "Ķ", k = "ķ",
        L = "Ļ", l = "ļ",
        N = "Ņ", n = "ņ",
        R = "Ŗ", r = "ŗ",
        S = "Ş", s = "ş",
        T = "Ţ", t = "ţ",
    },
    ["k"] = { [""] = "˛",
        A = "Ą", a = "ą",
        E = "Ę", e = "ę",
        I = "Į", i = "į",
        U = "Ų", u = "ų",
    },
    ["r"] = { [""] = "˚",
        A = "Å", a = "å",
        U = "Ů", u = "ů",
    },
    ["u"] = { [""] = "˘",
        A = "Ă", a = "ă",
        E = "Ĕ", e = "ĕ",
        G = "Ğ", g = "ğ",
        I = "Ĭ", i = "ĭ", ["ı"] = "ĭ", ["\\i"] = "ĭ",
        O = "Ŏ", o = "ŏ",
        U = "Ŭ", u = "ŭ",
    },
    ["v"] = { [""] = "ˇ",
        C = "Č", c = "č",
        D = "Ď", d = "ď",
        E = "Ě", e = "ě",
        L = "Ľ", l = "ľ",
        N = "Ň", n = "ň",
        R = "Ř", r = "ř",
        S = "Š", s = "š",
        T = "Ť", t = "ť",
        Z = "Ž", z = "ž",
    },
    ["~"] = { [""] = "˜",
        A = "Ã", a = "ã",
        I = "Ĩ", i = "ĩ", ["ı"] = "ĩ", ["\\i"] = "ĩ",
        N = "Ñ", n = "ñ",
        O = "Õ", o = "õ",
        U = "Ũ", u = "ũ",
    },
}

texcharacters.accentmapping = accentmapping

local accent_map = allocate { -- incomplete
   ['~'] = "̃" , --  ̃ Ẽ
   ['"'] = "̈" , --  ̈ Ë
   ["`"] = "̀" , --  ̀ È
   ["'"] = "́" , --  ́ É
   ["^"] = "̂" , --  ̂ Ê
    --  ̄ Ē
    --  ̆ Ĕ
    --  ̇ Ė
    --  ̉ Ẻ
    --  ̌ Ě
    --  ̏ Ȅ
    --  ̑ Ȇ
    --  ̣ Ẹ
    --  ̧ Ȩ
    --  ̨ Ę
    --  ̭ Ḙ
    --  ̰ Ḛ
}

-- local accents = concat(table.keys(accentmapping)) -- was _map

local function remap_accent(a,c,braced)
    local m = accentmapping[a]
    if m then
        local n = m[c]
        if n then
            return n
        end
    end
--     local m = accent_map[a]
--     if m then
--         return c .. m
--     elseif braced then -- or #c > 0
    if braced then -- or #c > 0
        return "\\" .. a .. "{" .. c .. "}"
    else
        return "\\" .. a .. " " .. c
    end
end
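
-- A small commented illustration of the fallback behaviour: known combinations
-- resolve to the precomposed character, unknown ones return the original accent
-- command (braced or spaced).
--
-- print(remap_accent('"',"e",true))  -- ë
-- print(remap_accent("v","q",true))  -- \v{q} (no precomposed character known)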

local commandmapping = allocate {
    ["aa"] = "å", ["AA"] = "Å",
    ["ae"] = "æ", ["AE"] = "Æ",
    ["cc"] = "ç", ["CC"] = "Ç",
    ["i"]  = "ı", ["j"]  = "ȷ",
    ["ij"] = "ij", ["IJ"] = "IJ",
    ["l"]  = "ł", ["L"]  = "Ł",
    ["o"]  = "ø", ["O"]  = "Ø",
    ["oe"] = "œ", ["OE"] = "Œ",
    ["sz"] = "ß", ["SZ"] = "SZ", ["ss"] = "ß", ["SS"] = "ß", -- uppercase: ẞ
}

texcharacters.commandmapping = commandmapping

local ligaturemapping = allocate {
    ["''"]  = "”",
    ["``"]  = "“",
    ["--"]  = "–",
    ["---"] = "—",
}

-- Older accent handling code can be found in char-def.lua but in the meantime
-- we moved on. First the one with commands:

local untex, pattern

local function toutfpattern()
    if not untex then
        local hash = { }
        for k, v in next, accentmapping do
            for kk, vv in next, v do
                if (k >= "a" and k <= "z") or (k >= "A" and k <= "Z") then
                    hash[ "\\"..k.." "..kk     ] = vv
                    hash["{\\"..k.." "..kk.."}"] = vv
                else
                    hash["\\" ..k     ..kk     ] = vv
                    hash["{\\"..k     ..kk.."}"] = vv
                end
                hash["\\" ..k.."{"..kk.."}" ] = vv
                hash["{\\"..k.."{"..kk.."}}"] = vv
            end
        end
        for k, v in next, commandmapping do
            hash["\\"..k.." "] = v
            hash["{\\"..k.."}"] = v
            hash["{\\"..k.." }"] = v
        end
        for k, v in next, ligaturemapping do
            hash[k] = v
        end
        untex = utfchartabletopattern(hash) / hash
    end
    return untex
end

local function prepare()
    pattern = Cs((toutfpattern() + P(1))^0)
    return pattern
end

local function textoutf(str,strip)
    if str == "" then
        return str
    elseif not find(str,"\\",1,true) then
        return str
 -- elseif strip then
    else
        return lpegmatch(pattern or prepare(),str)
    end
end

texcharacters.toutfpattern = toutfpattern
texcharacters.toutf        = textoutf

-- print(texcharacters.toutf([[\~{Z}]],true))
-- print(texcharacters.toutf([[\'\i]],true))
-- print(texcharacters.toutf([[\'{\i}]],true))
-- print(texcharacters.toutf([[\"{e}]],true))
-- print(texcharacters.toutf([[\" {e}]],true))
-- print(texcharacters.toutf([[{\"{e}}]],true))
-- print(texcharacters.toutf([[{\" {e}}]],true))
-- print(texcharacters.toutf([[{\l}]],true))
-- print(texcharacters.toutf([[{\l }]],true))
-- print(texcharacters.toutf([[\v{r}]],true))
-- print(texcharacters.toutf([[fo{\"o}{\ss}ar]],true))
-- print(texcharacters.toutf([[H{\'a}n Th\^e\llap{\raise 0.5ex\hbox{\'{\relax}}} Th{\'a}nh]],true))

-- Next the ones without backslash

local untex, pattern

local function toutfpattern()
    if not untex then
        local hash = { }
        for k, v in next, accentmapping do
            for kk, vv in next, v do
                hash[k..kk] = vv
            end
        end
        for k, v in next, commandmapping do
            hash[k] = v
        end
        for k, v in next, ligaturemapping do
            hash[k] = v
        end
        untex = utfchartabletopattern(hash) / hash
    end
    return untex
end

local function prepare()
    pattern = Cs((toutfpattern() + P(1))^0)
    return pattern
end

local function textoutf(str)
    return lpegmatch(pattern or prepare(),str)
end

texcharacters.strtoutfpattern = toutfpattern
texcharacters.strtextoutf     = textoutf
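
-- Commented example; this assumes that utfchartabletopattern prefers the longest
-- match, so the accent plus letter combination wins over the bare accent:
--
-- print(texcharacters.strtextoutf([["e]])) -- ë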

local collapse = utffilters.collapse
local combine  = utffilters.combine

if not interfaces then return end

local implement = interfaces.implement

local pattern

local verbosemarks = characters.verbosemarks

if verbosemarks then

    mark(verbosemarks)

else

    verbosemarks = allocate {
        ["stroke"]               = utfchar(0x02F), ["slash"]        = utfchar(0x02F),
        ["middle dot"]           = utfchar(0x0B7),

        ["grave"]                = utfchar(0x300),
        ["acute"]                = utfchar(0x301),
        ["circumflex"]           = utfchar(0x302),
        ["tilde"]                = utfchar(0x303),
        ["macron"]               = utfchar(0x304), ["line"]         = utfchar(0x304),
        ["overline"]             = utfchar(0x305),
        ["breve"]                = utfchar(0x306),
        ["dot"]                  = utfchar(0x307),
        ["dieresis"]             = utfchar(0x308), ["diaeresis"]    = utfchar(0x308),
        ["hook"]                 = utfchar(0x309),
        ["ring"]                 = utfchar(0x30A),
        ["double acute"]         = utfchar(0x30B), ["hungarumlaut"] = utfchar(0x30B), -- tex speak
        ["caron"]                = utfchar(0x30C),
        ["vertical line"]        = utfchar(0x30D),
        ["double vertical line"] = utfchar(0x30E),
        ["double grave"]         = utfchar(0x30F),
        ["inverted breve"]       = utfchar(0x311),
        ["dot below"]            = utfchar(0x323),
        ["ring below"]           = utfchar(0x325),
        ["cedilla"]              = utfchar(0x327), ["comma below"]  = utfchar(0x327),
        ["ogonek"]               = utfchar(0x328),
        ["caron below"]          = utfchar(0x32C),
        ["circumflex below"]     = utfchar(0x32D),
        ["tilde below"]          = utfchar(0x330),
        ["macron below"]         = utfchar(0x331), ["line below"]   = utfchar(0x331),

        ["hook below"]           = utfchar(0x1FA9D),
    }

    characters.verbosemarks = verbosemarks

    if storage then
        storage.register("characters/verbosemarks", verbosemarks, "characters.verbosemarks")
    end

end

local function prepare()
    pattern = Cs((utfchartabletopattern(verbosemarks) / verbosemarks + lpegpatterns.space/"" + lpegpatterns.utf8character)^0)
    return pattern
end
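
-- What the pattern does (a commented sketch): verbose mark names become combining
-- characters and the separating spaces are stripped, so "e acute" turns into the
-- sequence e plus U+0301, which the collapse below then combines into é.
--
-- print(lpegmatch(prepare(),"e acute")) -- "e" .. utfchar(0x301)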

local hash = table.setmetatableindex(function(t,k)
    local f = ""
    k = lpegmatch(pattern or prepare(),k) or k
    local v = collapse(k) or k -- char specials
-- print("collapse",k,v)
    if k ~= v then
        goto DONE
    end
    v = combine(k) or k -- with specials
-- print("combine",k,v)
    if k ~= v then
        goto DONE
    end
    v = commandmapping[k] or k
-- print("command",k,v)
    if k ~= v then
        f = "\\"
        goto DONE
    end
    v = textoutf(k) or k
-- print("utf",k,v)
    if k ~= v then
        f = "\\"
        goto DONE
    end
  ::DONE::
    report_defining("instead of old school '%s%s' you can input the utf sequence %s",f,k,v)
    t[k] = v
    return v
end)

implement {
    name      = "chr",
    arguments = "argument",
    public    = true,
    actions   = function(str)
        context(hash[str]) -- expandable
    end
}
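
-- At the tex end this provides a public \chr command; a hedged example of its
-- intended use (the names come from the verbose marks and command mapping above):
--
--   \chr {e acute}  % é
--   \chr {ae}       % æ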

function texcharacters.safechar(n) -- was characters.safechar
    local c = data[n]
    if c and c.contextname then
        return "\\" .. c.contextname
    else
        return utfchar(n)
    end
end
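
-- Characters with a contextname come back as a command, others as themselves; a
-- commented example (the name "ampersand" is assumed to be set in char-def.lua):
--
-- print(texcharacters.safechar(0x26))   -- \ampersand
-- print(texcharacters.safechar(0x4E00)) -- 一 (no contextname, so the character itself)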

if not context or not commands then
    -- used in e.g. mtx-bibtex
    return
end

-- all kinds of initializations

local tex           = tex
local texsetlccode  = tex.setlccode
local texsetsfcode  = tex.setsfcode
local texsetcatcode = tex.setcatcode

local contextsprint = context.sprint
local ctxcatcodes   = catcodes.numbers.ctxcatcodes

local texsetmacro   = tokens.setters.macro
local texsetchar    = tokens.setters.char

-- function texcharacters.defineaccents()
--     local ctx_dodefineaccentcommand  = context.dodefineaccent
--     local ctx_dodefineaccent         = context.dodefineaccent
--     local ctx_dodefinecommand        = context.dodefinecommand
--     for accent, group in next, accentmapping do
--         ctx_dodefineaccentcommand(accent)
--         for character, mapping in next, group do
--             ctx_dodefineaccent(accent,character,mapping)
--         end
--     end
--     for command, mapping in next, commandmapping do
--         ctx_dodefinecommand(command,mapping)
--     end
--     os.exit()
-- end

function texcharacters.defineaccents()
    local ctx_dodefinecombine = context.dodefinecombine
    local ctx_dodefinecommand = context.dodefinecommand
    for verbose, mark in next, verbosemarks do
        ctx_dodefinecombine((gsub(verbose," ","")),mark)
    end
    for command, mapping in next, commandmapping do
        ctx_dodefinecommand(command,mapping)
    end
end

implement { -- a waste of scanner but consistent
    name    = "defineaccents",
    actions = texcharacters.defineaccents
}

--[[ldx--
<p>Instead of using a <l n='tex'/> file to define the named glyphs, we
use the table. After all, we have this information available anyway.</p>
--ldx]]--

local function to_number(s)
    local n = tonumber(s)
    if n then
        return n
    end
    return tonumber(match(s,'^"(.*)$'),16) or 0
end
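
-- Both plain decimal and tex style hexadecimal (a leading ") input are accepted:
--
-- to_number("214") -- 214
-- to_number('"D6') -- 214 (0xD6)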

implement {
    name      = "utfchar",
    actions   = { to_number, utfchar, contextsprint },
    arguments = "string"
}

implement {
    name      = "safechar",
    actions   = { to_number, texcharacters.safechar, contextsprint },
    arguments = "string"
}

implement {
    name      = "uchar",
    arguments = { "integer", "integer" },
    actions   = function(h,l)
        context(utfchar(h*256+l))
    end
}
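
-- The two integers are the high and low byte, so at the tex end (a commented
-- sketch) \uchar {1}{226} pipes utfchar(1*256+226) = utfchar(0x1E2) = Ǣ back in.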

tex.uprint = commands.utfchar

-- in context we don't use lc and uc codes (in fact in luatex we should have a hf code)
-- so at some point we might drop this

-- The following get set at the TeX end:

local forbidden = tohash {
    0x000A0, -- zs nobreakspace            <self>
    0x000AD, -- cf softhyphen              <self>
 -- 0x00600, -- cf arabicnumber            <self>
 -- 0x00601, -- cf arabicsanah             <self>
 -- 0x00602, -- cf arabicfootnotemarker    <self>
 -- 0x00603, -- cf arabicsafha             <self>
 -- 0x00604, -- cf arabicsamvat            <self>
 -- 0x00605, -- cf arabicnumberabove       <self>
 -- 0x0061C, -- cf arabiclettermark        <self>
 -- 0x006DD, -- cf arabicendofayah         <self>
 -- 0x008E2, -- cf arabicdisputedendofayah <self>
    0x02000, -- zs enquad                  <self>
    0x02001, -- zs emquad                  <self>
    0x02002, -- zs enspace                 \kern .5\emwidth
    0x02003, -- zs emspace                 \hskip \emwidth
    0x02004, -- zs threeperemspace         <self>
    0x02005, -- zs fourperemspace          <self>
    0x02006, -- zs sixperemspace           <self>
    0x02007, -- zs figurespace             <self>
    0x02008, -- zs punctuationspace        <self>
    0x02009, -- zs breakablethinspace      <self>
    0x0200A, -- zs hairspace               <self>
    0x0200B, -- cf zerowidthspace          <self>
    0x0200C, -- cf zwnj                    <self>
    0x0200D, -- cf zwj                     <self>
    0x0202F, -- zs narrownobreakspace      <self>
    0x0205F, -- zs medspace                \textormathspace +\medmuskip 2
 -- 0x03000, -- zs ideographicspace        <self>
 -- 0x0FEFF, -- cf zerowidthnobreakspace   \penalty \plustenthousand \kern \zeropoint
}

local csletters  = characters.csletters -- also a signal that we have initialized
local activated  = { }
local sfstate    = "unset" -- unset, traditional, normal
local blocks_too = false

directives.register("characters.blockstoo",function(v) blocks_too = v end)

-- If this is something that is not documentwide and used a lot, then we
-- need a more clever approach (trivial but not now).

local function setuppersfcodes(v,n)
    if sfstate ~= "unset" then
        report_defining("setting uppercase sf codes to %a",n)
        for u, chr in next, data do
            if chr.category == "lu" then
                texsetsfcode(u,n)
            end
        end
    end
    sfstate = v
end

directives.register("characters.spaceafteruppercase",function(v)
    if v == "traditional" then
        setuppersfcodes(v,999)
    elseif v == "normal" then
        setuppersfcodes(v,1000)
    end
end)
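
-- A hedged usage example from the tex end:
--
--   \enabledirectives[characters.spaceafteruppercase=traditional]
--
-- sets the space factor code of uppercase letters to 999, the traditional plain
-- tex way of not treating a period after an uppercase letter as end of sentence.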

if not csletters then

    csletters            = allocate()
    characters.csletters = csletters

    report_defining("setting up character related codes and commands")

    if sfstate == "unset" then
        sfstate = "traditional"
    end

    local traditional = sfstate == "traditional"

    for u, chr in next, data do -- will move up
        local contextname = chr.contextname
        local category    = chr.category
        local isletter    = is_letter[category]
        if contextname then
            if is_character[category] then
                if chr.unicodeslot < 128 then
                    if isletter then
                        local c = utfchar(u)
                        csletters[c] = u
                    end
                else
                    local c = utfchar(u)
                    if isletter and u >= 32 and u <= 65536 then
                        csletters[c] = u
                    end
                end
                if isletter then
                    local lc = chr.lccode
                    local uc = chr.uccode
                    if not lc then
                        chr.lccode = u
                        lc = u
                    elseif type(lc) == "table" then
                        lc = u
                    end
                    if not uc then
                        chr.uccode = u
                        uc = u
                    elseif type(uc) == "table" then
                        uc = u
                    end
                    texsetlccode(u,lc,uc)
                    if traditional and category == "lu" then
                        texsetsfcode(u,999)
                    end
                end
            elseif is_command[category] and not forbidden[u] then
                -- skip
            elseif is_mark[category] then
                texsetlccode(u,u,u) -- for hyphenation
            end
        elseif isletter then
            csletters[utfchar(u)] = u
            local lc, uc = chr.lccode, chr.uccode
            if not lc then
                chr.lccode = u
                lc = u
            elseif type(lc) == "table" then
                lc = u
            end
            if not uc then
                chr.uccode = u
                uc = u
            elseif type(uc) == "table" then
                uc = u
            end
            texsetlccode(u,lc,uc)
            if traditional and category == "lu" then
                texsetsfcode(u,999)
            end
        elseif is_mark[category] then
            texsetlccode(u,u,u) -- for hyphenation
        end
    end

    if blocks_too then
        -- this slows down format generation by over 10 percent
        for k, v in next, blocks do
            if v.catcode == "letter" then
                local first = v.first
                local last  = v.last
                local gaps  = v.gaps
                if first and last then
                    for u=first,last do
                        csletters[utfchar(u)] = u
                        --
                     -- texsetlccode(u,u,u) -- self self
                        --
                    end
                end
                if gaps then
                    for i=1,#gaps do
                        local u = gaps[i]
                        csletters[utfchar(u)] = u
                        --
                     -- texsetlccode(u,u,u) -- self self
                        --
                    end
                end
            end
        end
    end

    if storage then
        storage.register("characters/csletters", csletters, "characters.csletters")
    end

    function characters.setcharacternames(ctt)
        for u, chr in next, data do -- will move up
            local contextname = chr.contextname
            local category    = chr.category
            local isletter    = is_letter[category]
            if contextname then
                if is_character[category] then
                    if chr.unicodeslot < 128 then
                        if isletter then
                            texsetmacro(contextname,utfchar(u),"immutable")
                        else
                            texsetchar(contextname,u,"immutable")
                        end
                    else
                        texsetmacro(contextname,utfchar(u),"immutable")
                    end
                elseif is_command[category] and not forbidden[u] then
                    texsetmacro(contextname,utfchar(u),"immutable")
                end
            end
        end
    end

else
    mark(csletters)
end

lpegpatterns.csletter = utfchartabletopattern(csletters)
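
-- The resulting pattern matches any single utf character that we know as a
-- letter; a commented sketch (assuming a regular format generation):
--
-- print(lpegmatch(lpegpatterns.csletter,"ø")) -- a position (match)
-- print(lpegmatch(lpegpatterns.csletter,"1")) -- nil (no match)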

-- The engine presets the letters to 11 (always).

function characters.setlettercatcodes(cct)
    if trace_defining then
        report_defining("assigning letter catcodes to catcode table %a",cct)
    end
    local saved = tex.catcodetable
    tex.catcodetable = cct
    texsetcatcode(0x200C,11) -- non-joiner
    texsetcatcode(0x200D,11) -- joiner
    for c, u in next, csletters do
        texsetcatcode(u,11)
    end
    tex.catcodetable = saved
end

function characters.setothercatcodes(cct)
    if trace_defining then
        report_defining("assigning other catcodes to catcode table %a",cct)
    end
    local saved = tex.catcodetable
    tex.catcodetable = cct
    for u=65,90 do
        texsetcatcode(u,12)
    end
    for u=97,122 do
        texsetcatcode(u,12)
    end
    tex.catcodetable = saved
end

function characters.setactivecatcodes(cct)
    local saved = tex.catcodetable
    tex.catcodetable = cct
    for i=1,#activated do
        local u = activated[i]
        texsetcatcode(u,13)
        if trace_defining then
            report_defining("character %U (%s) is active in set %a",u,data[u].description,cct)
        end
    end
    tex.catcodetable = saved
end

--[[ldx--
<p>Setting the lccodes is also done in a loop over the data table.</p>
--ldx]]--

implement {
    name      = "chardescription",
    arguments = "integer",
    actions   = function(slot)
        local d = data[slot]
        if d then
            context(d.description)
        end
    end,
}

if characters.setcharacternames then -- only in ini mode

    implement { name = "setlettercatcodes", scope = "private", actions = characters.setlettercatcodes, arguments = "integer" }
    implement { name = "setothercatcodes",  scope = "private", actions = characters.setothercatcodes,  arguments = "integer" }
    implement { name = "setactivecatcodes", scope = "private", actions = characters.setactivecatcodes, arguments = "integer" }
    implement { name = "setcharacternames", scope = "private", actions = characters.setcharacternames, arguments = "integer" }

end

-- experiment (some can move to char-ini.lua)

local function overload(c,u,code,codes)
    local c = tonumber(c)
    if not c then
        return
    end
    local u = utilities.parsers.settings_to_array(u)
    local n = #u
    if n == 0 then
        return
    end
    local t = nil
    if n == 1 then
        t = tonumber(u[1])
    else
        t = { }
        for i=1,n do
            t[#t+1] = tonumber(u[i])
        end
    end
    if t then
        data[c][code] = t
        characters[codes][c] = nil
    end
end

implement {
    name      = "overloaduppercase",
    arguments = "2 strings",
    actions   = function(c,u)
        overload(c,u,"uccode","uccodes")
    end
}

implement {
    name      = "overloadlowercase",
    arguments = "2 strings",
    actions   = function(c,u)
        overload(c,u,"lccode","lccodes")
    end
}
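
-- These are meant to be called from the tex end; a hedged sketch of such a call,
-- assuming the usual \clf_ prefix for implemented functions:
--
--   \clf_overloaduppercase{0xDF}{0x1E9E}
--
-- which makes ß uppercase to ẞ (one slot) instead of the multi character default
-- and flushes the cached uccodes entry for that character.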

-- Just for fun we support keywords:
--
-- \startTEXpage[offset=10pt]
--     abg"
--     \sl \showboxes
--     \accent               `" h%
--     \accent               `" x%
--     \accent yoffset  .2ex `" x
--     \accent yoffset 1.1ex `x x%
-- \stopTEXpage
--
-- We could do this:
--
-- \startTEXpage[offset=10pt]
--     abg"
--     \sl \showboxes
--     \withaccent               `" h%
--     \withaccent               `" x%
--     \withaccent yoffset  .2ex `" x
--     \withaccent yoffset 1.1ex accent `x base `x%
-- \stopTEXpage
--
-- But only when users demand it:
--
-- do
--
--     local new_glyph = nodes.pool.glyph
--
--     local scankeyword   = tokens.scanners.keyword
--     local scaninteger   = tokens.scanners.integer
--     local scandimension = tokens.scanners.dimension
--     local scantoken     = tokens.scanners.token
--
--     implement {
--         name      = "withaccent",
--         public    = true,
--         protected = true,
--         actions   = function()
--             local xoffset = 0
--             local yoffset = 0
--             local accent  = false
--             local base    = false
--             local zwj     = 0x200D
--             while true do
--                 if scankeyword("xoffset") then
--                     xoffset = scandimension()
--                 elseif scankeyword("yoffset") then
--                     yoffset = scandimension()
--                 elseif scankeyword("accent") then
--                     accent = scaninteger()
--                 elseif scankeyword("base") then
--                     base = scaninteger()
--                 else
--                     break
--                 end
--             end
--             if not accent then
--                 accent = scaninteger()
--             end
--             if not base then
--                 local nxttok = scantoken()
--                 base = nxttok.cmdname == "char_number" and scaninteger() or nxttok.index
--             end
--             if base and accent and base > 0 and accent > 0 then
--                 base   = new_glyph(true,base)
--                 zwj    = new_glyph(true,zwj)
--                 accent = new_glyph(true,accent)
--                 local slant   = fonts.hashes.parameters[true].slant / 65536 -- a la tex
--                 local xheight = fonts.hashes.parameters[true].xheight -- hm, compensated for glyphscale?
--                 accent.xoffset = xoffset - .5*(base.width -accent.width) + .5*(base.height-accent.height) * slant
--                 accent.yoffset = yoffset - (xheight - accent.height)
--                 accent.left    = accent.width
--                 accent.options = accent.options | 0x40 | 0x80
--                 context.dontleavehmode()
--                 context(base)
--                 context(zwj)
--                 context(accent)
--             end
--         end,
--     }
--
-- end