-- char-tex.lua  (size: 26 Kb, last modification: 2020-07-01 14:35)

if not modules then modules = { } end modules ['char-tex'] = {
    version   = 1.001,
    comment   = "companion to char-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}

local lpeg = lpeg

local tonumber, next, type = tonumber, next, type

local format, find, gmatch, match = string.format, string.find, string.gmatch, string.match
local utfchar, utfbyte = utf.char, utf.byte
local concat, tohash = table.concat, table.tohash
local P, C, R, S, V, Cs, Cc = lpeg.P, lpeg.C, lpeg.R, lpeg.S, lpeg.V, lpeg.Cs, lpeg.Cc

local lpegpatterns          = lpeg.patterns
local lpegmatch             = lpeg.match
local utfchartabletopattern = lpeg.utfchartabletopattern

local allocate              = utilities.storage.allocate
local mark                  = utilities.storage.mark

local context               = context
local commands              = commands

local characters            = characters
local texcharacters         = { }
characters.tex              = texcharacters
local utffilters            = characters.filters.utf

local is_character          = characters.is_character
local is_letter             = characters.is_letter
local is_command            = characters.is_command
local is_spacing            = characters.is_spacing
local is_mark               = characters.is_mark
local is_punctuation        = characters.is_punctuation

local data                  = characters.data  if not data then return end
local blocks                = characters.blocks

local trace_defining        = false  trackers.register("characters.defining", function(v) trace_defining = v end)

local report_defining       = logs.reporter("characters")

--[[ldx--
<p>In order to deal with 8-bit output, we need to find a way to go from <l n='utf'/> to
8-bit. This is handled in the <l n='luatex'/> engine itself.</p>

<p>This leaves us problems with characters that are specific to <l n='tex'/> like
<type>{}</type>, <type>$</type> and alike. We can remap some chars that tex input files
are sensitive for to a private area (while writing to a utility file) and revert then
to their original slot when we read in such a file. Instead of reverting, we can (when
we resolve characters to glyphs) map them to their right glyph there. For this purpose
we can use the private planes 0x0F0000 and 0x100000.</p>
--ldx]]--

local low     = allocate()
local high    = allocate()
local escapes = allocate()
local special = "~#$%^&_{}\\|" -- "~#$%{}\\|"

local private = {
    low     = low,
    high    = high,
    escapes = escapes,
}

utffilters.private = private

for ch in gmatch(special,".") do
    local cb
    if type(ch) == "number" then
        cb, ch = ch, utfchar(ch)
    else
        cb = utfbyte(ch)
    end
    if cb < 256 then
        escapes[ch] = "\\" .. ch
        low[ch] = utfchar(0x0F0000 + cb)
        if ch == "%" then
            ch = "%%" -- nasty, but we need this as in replacements (also in lpeg) % is interpreted
        end
        high[utfchar(0x0F0000 + cb)] = ch
    end
end

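-- Added illustration (not part of the original file): after the loop above, for the
-- dollar sign we end up with escapes["$"] == "\\$", low["$"] == utfchar(0x0F0024) and
-- high[utfchar(0x0F0024)] == "$"; only "%" is stored as "%%" because of the replacer
-- quirk mentioned in the comment above.
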
local tohigh = lpeg.replacer(low)  -- frozen, only for basic tex
local tolow  = lpeg.replacer(high) -- frozen, only for basic tex

lpegpatterns.utftohigh = tohigh
lpegpatterns.utftolow  = tolow

function utffilters.harden(str)
    return lpegmatch(tohigh,str)
end

function utffilters.soften(str)
    return lpegmatch(tolow,str)
end

private.escape  = utf.remapper(escapes) -- maybe: ,"dynamic"
private.replace = utf.remapper(low)     -- maybe: ,"dynamic"
private.revert  = utf.remapper(high)    -- maybe: ,"dynamic"

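-- A minimal usage sketch (added, not in the original): harden moves the basic tex
-- specials into the 0x0F0000 private plane and soften brings them back, so for plain
-- strings the round trip should be the identity:
--
-- local hardened = utffilters.harden("$x$ and {y}") -- specials now sit in the private plane
-- local softened = utffilters.soften(hardened)      -- back to "$x$ and {y}"
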
--[[ldx--
<p>We get a more efficient variant of this when we integrate
replacements in collapser. This more or less renders the previous
private code redundant. The following code is equivalent but the
first snippet uses the relocated dollars.</p>

<typing>
[󰀤x󰀤] [$x$]
</typing>
--ldx]]--

-- using the tree-lpeg-mapper would be nice but we also need to deal with end-of-string
-- cases: "\"\i" and don't want "\relax" to be seen as \r e lax" (for which we need to mess
-- with spaces

local accentmapping = allocate {
    ['"'] = { [""] = "¨",
        A = "Ä", a = "ä",
        E = "Ë", e = "ë",
        I = "Ï", i = "ï", ["ı"] = "ï", ["\\i"] = "ï",
        O = "Ö", o = "ö",
        U = "Ü", u = "ü",
        Y = "Ÿ", y = "ÿ",
    },
    ["'"] = { [""] = "´",
        A = "Á", a = "á",
        C = "Ć", c = "ć",
        E = "É", e = "é",
        I = "Í", i = "í", ["ı"] = "í", ["\\i"] = "í",
        L = "Ĺ", l = "ĺ",
        N = "Ń", n = "ń",
        O = "Ó", o = "ó",
        R = "Ŕ", r = "ŕ",
        S = "Ś", s = "ś",
        U = "Ú", u = "ú",
        Y = "Ý", y = "ý",
        Z = "Ź", z = "ź",
    },
    ["."] = { [""] = "˙",
        C = "Ċ", c = "ċ",
        E = "Ė", e = "ė",
        G = "Ġ", g = "ġ",
        I = "İ", i = "i", ["ı"] = "i", ["\\i"] = "i",
        Z = "Ż", z = "ż",
    },
    ["="] = { [""] = "¯",
        A = "Ā", a = "ā",
        E = "Ē", e = "ē",
        I = "Ī", i = "ī", ["ı"] = "ī", ["\\i"] = "ī",
        O = "Ō", o = "ō",
        U = "Ū", u = "ū",
    },
    ["H"] = { [""] = "˝",
        O = "Ő", o = "ő",
        U = "Ű", u = "ű",
    },
    ["^"] = { [""] = "ˆ",
        A = "Â", a = "â",
        C = "Ĉ", c = "ĉ",
        E = "Ê", e = "ê",
        G = "Ĝ", g = "ĝ",
        H = "Ĥ", h = "ĥ",
        I = "Î", i = "î", ["ı"] = "î", ["\\i"] = "î",
        J = "Ĵ", j = "ĵ",
        O = "Ô", o = "ô",
        S = "Ŝ", s = "ŝ",
        U = "Û", u = "û",
        W = "Ŵ", w = "ŵ",
        Y = "Ŷ", y = "ŷ",
    },
    ["`"] = { [""] = "`",
        A = "À", a = "à",
        E = "È", e = "è",
        I = "Ì", i = "ì", ["ı"] = "ì", ["\\i"] = "ì",
        O = "Ò", o = "ò",
        U = "Ù", u = "ù",
        Y = "Ỳ", y = "ỳ",
    },
    ["c"] = { [""] = "¸",
        C = "Ç", c = "ç",
        K = "Ķ", k = "ķ",
        L = "Ļ", l = "ļ",
        N = "Ņ", n = "ņ",
        R = "Ŗ", r = "ŗ",
        S = "Ş", s = "ş",
        T = "Ţ", t = "ţ",
    },
    ["k"] = { [""] = "˛",
        A = "Ą", a = "ą",
        E = "Ę", e = "ę",
        I = "Į", i = "į",
        U = "Ų", u = "ų",
    },
    ["r"] = { [""] = "˚",
        A = "Å", a = "å",
        U = "Ů", u = "ů",
    },
    ["u"] = { [""] = "˘",
        A = "Ă", a = "ă",
        E = "Ĕ", e = "ĕ",
        G = "Ğ", g = "ğ",
        I = "Ĭ", i = "ĭ", ["ı"] = "ĭ", ["\\i"] = "ĭ",
        O = "Ŏ", o = "ŏ",
        U = "Ŭ", u = "ŭ",
    },
    ["v"] = { [""] = "ˇ",
        C = "Č", c = "č",
        D = "Ď", d = "ď",
        E = "Ě", e = "ě",
        L = "Ľ", l = "ľ",
        N = "Ň", n = "ň",
        R = "Ř", r = "ř",
        S = "Š", s = "š",
        T = "Ť", t = "ť",
        Z = "Ž", z = "ž",
    },
    ["~"] = { [""] = "˜",
        A = "Ã", a = "ã",
        I = "Ĩ", i = "ĩ", ["ı"] = "ĩ", ["\\i"] = "ĩ",
        N = "Ñ", n = "ñ",
        O = "Õ", o = "õ",
        U = "Ũ", u = "ũ",
    },
}

texcharacters.accentmapping = accentmapping

local accent_map = allocate { -- incomplete
   ['~'] = "̃", --  ̃ Ẽ
   ['"'] = "̈", --  ̈ Ë
   ["`"] = "̀", --  ̀ È
   ["'"] = "́", --  ́ É
   ["^"] = "̂", --  ̂ Ê
    --  ̄ Ē
    --  ̆ Ĕ
    --  ̇ Ė
    --  ̉ Ẻ
    --  ̌ Ě
    --  ̏ Ȅ
    --  ̑ Ȇ
    --  ̣ Ẹ
    --  ̧ Ȩ
    --  ̨ Ę
    --  ̭ Ḙ
    --  ̰ Ḛ
}

-- local accents = concat(table.keys(accentmapping)) -- was _map

local function remap_accent(a,c,braced)
    local m = accentmapping[a]
    if m then
        local n = m[c]
        if n then
            return n
        end
    end
 -- local m = accent_map[a]
 -- if m then
 --     return c .. m
 -- elseif braced then -- or #c > 0
    if braced then -- or #c > 0
        return "\\" .. a .. "{" .. c .. "}"
    else
        return "\\" .. a .. " " .. c
    end
end

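-- Added illustration (not in the original): with the tables above, remap_accent('"',"e",true)
-- returns "ë", while an unmapped combination like remap_accent('"',"q",true) falls through
-- and returns the reconstructed input form \"{q}.
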
local commandmapping = allocate {
    ["aa"] = "å", ["AA"] = "Å",
    ["ae"] = "æ", ["AE"] = "Æ",
    ["cc"] = "ç", ["CC"] = "Ç",
    ["i"]  = "ı", ["j"]  = "ȷ",
    ["ij"] = "ĳ", ["IJ"] = "Ĳ",
    ["l"]  = "ł", ["L"]  = "Ł",
    ["o"]  = "ø", ["O"]  = "Ø",
    ["oe"] = "œ", ["OE"] = "Œ",
    ["sz"] = "ß", ["SZ"] = "SZ", ["ss"] = "ß", ["SS"] = "ß", -- uppercase: ẞ
}

texcharacters.commandmapping = commandmapping

local ligaturemapping = allocate {
    ["''"]  = "”",
    ["``"]  = "“",
    ["--"]  = "–",
    ["---"] = "—",
}

-- local achar = R("az","AZ") + P("ı") + P("\\i")
--
-- local spaces = P(" ")^0
-- local no_l = P("{") / ""
-- local no_r = P("}") / ""
-- local no_b = P('\\') / ""
--
-- local lUr = P("{") * C(achar) * P("}")
--
-- local accents_1 = [["'.=^`~]]
-- local accents_2 = [[Hckruv]]
--
-- local accent = P('\\') * (
--     C(S(accents_1)) * (lUr * Cc(true) + C(achar) * Cc(false)) + -- we need achar for ı etc, could be sped up
--     C(S(accents_2)) * lUr * Cc(true)
-- ) / remap_accent
--
-- local csname = P('\\') * C(R("az","AZ")^1)
--
-- local command = (
--     csname +
--     P("{") * csname * spaces * P("}")
-- ) / commandmapping -- remap_commands
--
-- local both_1 = Cs { "run",
--     accent  = accent,
--     command = command,
--     run     = (V("accent") + no_l * V("accent") * no_r + V("command") + P(1))^0,
-- }
--
-- local both_2 = Cs { "run",
--     accent  = accent,
--     command = command,
--     run     = (V("accent") + V("command") + no_l * ( V("accent") + V("command") ) * no_r + P(1))^0,
-- }
--
-- function texcharacters.toutf(str,strip)
--     if not find(str,"\\",1,true) then
--         return str
--     elseif strip then
--         return lpegmatch(both_1,str)
--     else
--         return lpegmatch(both_2,str)
--     end
-- end

local untex

local function toutfpattern()
    if not untex then
        local hash = { }
        for k, v in next, accentmapping do
            for kk, vv in next, v do
                if (k >= "a" and k <= "z") or (k >= "A" and k <= "Z") then
                    hash["\\"..k.." "..kk]       = vv
                    hash["{\\"..k.." "..kk.."}"] = vv
                else
                    hash["\\"..k..kk]            = vv
                    hash["{\\"..k..kk.."}"]      = vv
                end
                hash["\\"..k.."{"..kk.."}"]      = vv
                hash["{\\"..k.."{"..kk.."}}"]    = vv
            end
        end
        for k, v in next, commandmapping do
            hash["\\"..k.." "]   = v
            hash["{\\"..k.."}"]  = v
            hash["{\\"..k.." }"] = v
        end
        for k, v in next, ligaturemapping do
            hash[k] = v
        end
        untex = utfchartabletopattern(hash) / hash
    end
    return untex
end

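-- Added note (not in the original): for every accent/character pair the hash gets the
-- unbraced, braced and group-wrapped spellings, e.g. \"e, {\"e}, \"{e} and {\"{e}} all
-- map to "ë", and every command gets the spaced and braced forms, e.g. \ss , {\ss} and
-- {\ss } map to "ß"; the ligature keys ('' `` -- ---) are taken over as-is.
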
texcharacters.toutfpattern = toutfpattern

local pattern = nil

local function prepare()
    pattern = Cs((toutfpattern() + P(1))^0)
    return pattern
end

function texcharacters.toutf(str,strip)
    if str == "" then
        return str
    elseif not find(str,"\\",1,true) then
        return str
 -- elseif strip then
    else
        return lpegmatch(pattern or prepare(),str)
    end
end

-- print(texcharacters.toutf([[\~{Z}]],true))
-- print(texcharacters.toutf([[\'\i]],true))
-- print(texcharacters.toutf([[\'{\i}]],true))
-- print(texcharacters.toutf([[\"{e}]],true))
-- print(texcharacters.toutf([[\" {e}]],true))
-- print(texcharacters.toutf([[{\"{e}}]],true))
-- print(texcharacters.toutf([[{\" {e}}]],true))
-- print(texcharacters.toutf([[{\l}]],true))
-- print(texcharacters.toutf([[{\l }]],true))
-- print(texcharacters.toutf([[\v{r}]],true))
-- print(texcharacters.toutf([[fo{\"o}{\ss}ar]],true))
-- print(texcharacters.toutf([[H{\'a}n Th\^e\llap{\raise 0.5ex\hbox{\'{\relax}}} Th{\'a}nh]],true))

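-- Added note (not in the original): the replacement pattern is built lazily by prepare()
-- on the first call and reused afterwards; with the mappings above, for instance,
-- texcharacters.toutf([[fo{\"o}{\ss}ar]]) gives "foößar" and [[{\l}]] gives "ł", while
-- strings without a backslash are returned untouched.
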
function texcharacters.safechar(n) -- was characters.safechar
    local c = data[n]
    if c and c.contextname then
        return "\\" .. c.contextname
    else
        return utfchar(n)
    end
end

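-- Added example (slot chosen for illustration, not from the original): for a character
-- that has a contextname in characters.data, say 0x00E4, safechar(0x00E4) should return
-- the command "\adiaeresis", while slots without a contextname simply give utfchar(n).
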
if not context or not commands then
    -- used in e.g. mtx-bibtex
    return
end

-- all kind of initializations

if not interfaces then return end

local implement     = interfaces.implement

local tex           = tex
local texsetlccode  = tex.setlccode
local texsetsfcode  = tex.setsfcode
local texsetcatcode = tex.setcatcode

local contextsprint = context.sprint
local ctxcatcodes   = catcodes.numbers.ctxcatcodes

local texsetmacro   = tokens.setters.macro
local texsetchar    = tokens.setters.char

function texcharacters.defineaccents()
    local ctx_dodefineaccentcommand = context.dodefineaccentcommand
    local ctx_dodefineaccent        = context.dodefineaccent
    local ctx_dodefinecommand       = context.dodefinecommand
    for accent, group in next, accentmapping do
        ctx_dodefineaccentcommand(accent)
        for character, mapping in next, group do
            ctx_dodefineaccent(accent,character,mapping)
        end
    end
    for command, mapping in next, commandmapping do
        ctx_dodefinecommand(command,mapping)
    end
end

implement { -- a waste of scanner but consistent
    name    = "defineaccents",
    actions = texcharacters.defineaccents
}

--[[ldx--
<p>Instead of using a <l n='tex'/> file to define the named glyphs, we
use the table. After all, we have this information available anyway.</p>
--ldx]]--

function commands.makeactive(n,name) -- not used
    contextsprint(ctxcatcodes,format("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name))
 -- context("\\catcode%s=13\\unexpanded\\def %s{\\%s}",n,utfchar(n),name)
end

local function to_number(s)
    local n = tonumber(s)
    if n then
        return n
    end
    return tonumber(match(s,'^"(.*)$'),16) or 0
end

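-- Added note (not in the original): to_number accepts plain decimal strings as well as
-- the TeX-style hex form with a leading double quote, so to_number("228") and
-- to_number('"E4') both yield 228; anything else falls back to 0.
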
implement {
    name      = "utfchar",
    actions   = { to_number, utfchar, contextsprint },
    arguments = "string"
}

implement {
    name      = "safechar",
    actions   = { to_number, texcharacters.safechar, contextsprint },
    arguments = "string"
}

implement {
    name      = "uchar",
    arguments = { "integer", "integer" },
    actions   = function(h,l)
        context(utfchar(h*256+l))
    end
}

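-- Added note (not in the original): "uchar" combines a high and a low byte, so h=0x01
-- and l=0x02 produce utfchar(0x0102), i.e. "Ă".
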
tex.uprint = commands.utfchar

-- in context we don't use lc and uc codes (in fact in luatex we should have a hf code)
-- so at some point we might drop this

-- The following get set at the TeX end:

local forbidden = tohash {
    0x000A0, -- zs nobreakspace <self>
    0x000AD, -- cf softhyphen <self>
 -- 0x00600, -- cf arabicnumber <self>
 -- 0x00601, -- cf arabicsanah <self>
 -- 0x00602, -- cf arabicfootnotemarker <self>
 -- 0x00603, -- cf arabicsafha <self>
 -- 0x00604, -- cf arabicsamvat <self>
 -- 0x00605, -- cf arabicnumberabove <self>
 -- 0x0061C, -- cf arabiclettermark <self>
 -- 0x006DD, -- cf arabicendofayah <self>
 -- 0x008E2, -- cf arabicdisputedendofayah <self>
    0x02000, -- zs enquad <self>
    0x02001, -- zs emquad <self>
    0x02002, -- zs enspace \kern .5\emwidth
    0x02003, -- zs emspace \hskip \emwidth
    0x02004, -- zs threeperemspace <self>
    0x02005, -- zs fourperemspace <self>
    0x02006, -- zs sixperemspace <self>
    0x02007, -- zs figurespace <self>
    0x02008, -- zs punctuationspace <self>
    0x02009, -- zs breakablethinspace <self>
    0x0200A, -- zs hairspace <self>
    0x0200B, -- cf zerowidthspace <self>
    0x0200C, -- cf zwnj <self>
    0x0200D, -- cf zwj <self>
    0x0202F, -- zs narrownobreakspace <self>
    0x0205F, -- zs medspace \textormathspace +\medmuskip 2
 -- 0x03000, -- zs ideographicspace <self>
 -- 0x0FEFF, -- cf zerowidthnobreakspace \penalty \plustenthousand \kern \zeropoint
}

local csletters  = characters.csletters -- also a signal that we have initialized
local activated  = { }
local sfstate    = "unset" -- unset, traditional, normal
local blocks_too = false

directives.register("characters.blockstoo",function(v) blocks_too = v end)

-- If this is something that is not documentwide and used a lot, then we
-- need a more clever approach (trivial but not now).

local function setuppersfcodes(v,n)
    if sfstate ~= "unset" then
        report_defining("setting uppercase sf codes to %a",n)
        for u, chr in next, data do
            if chr.category == "lu" then
                texsetsfcode(u,n)
            end
        end
    end
    sfstate = v
end

directives.register("characters.spaceafteruppercase",function(v)
    if v == "traditional" then
        setuppersfcodes(v,999)
    elseif v == "normal" then
        setuppersfcodes(v,1000)
    end
end)

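-- Added note (an assumption about usage, not in the original): this directive is meant
-- to be driven from the TeX end, for instance via
-- \enabledirectives[characters.spaceafteruppercase=traditional], which gives uppercase
-- letters sfcode 999 the traditional plain TeX way ("normal" resets them to 1000).
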
if not csletters then

    csletters            = allocate()
    characters.csletters = csletters

    report_defining("setting up character related codes and commands")

    if sfstate == "unset" then
        sfstate = "traditional"
    end

    local traditional = sfstate == "traditional"

    for u, chr in next, data do -- will move up
        local fallback = chr.fallback
        if fallback then
            contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\checkedchar{",u,"}{",fallback,"}}}")
            activated[#activated+1] = u
        else
            local contextname = chr.contextname
            local category    = chr.category
            local isletter    = is_letter[category]
            if contextname then
                if is_character[category] then
                    if chr.unicodeslot < 128 then
                        if isletter then
                            local c = utfchar(u)
                            texsetmacro(contextname,c)
                            csletters[c] = u
                        else
                            texsetchar(contextname,u)
                        end
                    else
                        local c = utfchar(u)
                        texsetmacro(contextname,c)
                        if isletter and u >= 32 and u <= 65536 then
                            csletters[c] = u
                        end
                    end
                    --
                    if isletter then
                        local lc = chr.lccode
                        local uc = chr.uccode
                        if not lc then
                            chr.lccode = u
                            lc = u
                        elseif type(lc) == "table" then
                            lc = u
                        end
                        if not uc then
                            chr.uccode = u
                            uc = u
                        elseif type(uc) == "table" then
                            uc = u
                        end
                        texsetlccode(u,lc,uc)
                        if traditional and category == "lu" then
                            texsetsfcode(u,999)
                        end
                    end
                    --
                elseif is_command[category] and not forbidden[u] then
                 -- contextsprint("{\\catcode",u,"=13\\unexpanded\\gdef ",utfchar(u),"{\\",contextname,"}}")
                 -- activated[#activated+1] = u
                    local c = utfchar(u)
                    texsetmacro(contextname,c)
                elseif is_mark[category] then
                    texsetlccode(u,u,u) -- for hyphenation
                end
         -- elseif isletter and u >= 32 and u <= 65536 then
            elseif isletter then
                csletters[utfchar(u)] = u
                --
                local lc, uc = chr.lccode, chr.uccode
                if not lc then
                    chr.lccode = u
                    lc = u
                elseif type(lc) == "table" then
                    lc = u
                end
                if not uc then
                    chr.uccode = u
                    uc = u
                elseif type(uc) == "table" then
                    uc = u
                end
                texsetlccode(u,lc,uc)
                if traditional and category == "lu" then
                    texsetsfcode(u,999)
                end
                --
            elseif is_mark[category] then
                --
                texsetlccode(u,u,u) -- for hyphenation
                --
            end
        end
    end

    if blocks_too then
        -- this slows down format generation by over 10 percent
        for k, v in next, blocks do
            if v.catcode == "letter" then
                local first = v.first
                local last  = v.last
                local gaps  = v.gaps
                if first and last then
                    for u=first,last do
                        csletters[utfchar(u)] = u
                        --
                     -- texsetlccode(u,u,u) -- self self
                        --
                    end
                end
                if gaps then
                    for i=1,#gaps do
                        local u = gaps[i]
                        csletters[utfchar(u)] = u
                        --
                     -- texsetlccode(u,u,u) -- self self
                        --
                    end
                end
            end
        end
    end

    if storage then
        storage.register("characters/csletters", csletters, "characters.csletters")
    end

else
    mark(csletters)
end

lpegpatterns.csletter = utfchartabletopattern(csletters)

-- todo: get rid of activated
-- todo: move first loop out ,merge with above

function characters.setlettercatcodes(cct)
    if trace_defining then
        report_defining("assigning letter catcodes to catcode table %a",cct)
    end
    local saved = tex.catcodetable
    tex.catcodetable = cct
    texsetcatcode(0x200C,11) -- non-joiner
    texsetcatcode(0x200D,11) -- joiner
    for c, u in next, csletters do
        texsetcatcode(u,11)
    end
 -- for u, chr in next, data do
 --     if not chr.fallback and is_letter[chr.category] and u >= 32 and u <= 65536 then
 --         texsetcatcode(u,11)
 --     end
 --     local range = chr.range
 --     if range then
 --         for i=1,range.first,range.last do -- tricky as not all are letters
 --             texsetcatcode(i,11)
 --         end
 --     end
 -- end
 -- for k, v in next, blocks do
 --     if v.catcode == "letter" then
 --         for u=v.first,v.last do
 --             texsetcatcode(u,11)
 --         end
 --     end
 -- end
    tex.catcodetable = saved
end

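-- Added note (not in the original): the function temporarily switches tex.catcodetable
-- to the given table, assigns catcode 11 (letter) to the zero width (non)joiners and to
-- every collected cs letter, and then restores the previously active table, so callers
-- never observe a changed catcode regime.
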
function characters.setactivecatcodes(cct)
    local saved = tex.catcodetable
    tex.catcodetable = cct
    for i=1,#activated do
        local u = activated[i]
        texsetcatcode(u,13)
        if trace_defining then
            report_defining("character %U (%s) is active in set %a",u,data[u].description,cct)
        end
    end
    tex.catcodetable = saved
end

--[[ldx--
<p>Setting the lccodes is also done in a loop over the data table.</p>
--ldx]]--

-- function characters.setcodes() -- we could loop over csletters
--     if trace_defining then
--         report_defining("defining lc and uc codes")
--     end
--     local traditional = sfstate == "traditional" or sfstate == "unset"
--     for code, chr in next, data do
--         local cc = chr.category
--         if is_letter[cc] then
--             local range = chr.range
--             if range then
--                 for i=range.first,range.last do
--                     texsetlccode(i,i,i) -- self self
--                 end
--             else
--                 local lc, uc = chr.lccode, chr.uccode
--                 if not lc then
--                     chr.lccode, lc = code, code
--                 elseif type(lc) == "table" then
--                     lc = code
--                 end
--                 if not uc then
--                     chr.uccode, uc = code, code
--                 elseif type(uc) == "table" then
--                     uc = code
--                 end
--                 texsetlccode(code,lc,uc)
--                 if traditional and cc == "lu" then
--                     texsetsfcode(code,999)
--                 end
--             end
--         elseif is_mark[cc] then
--             texsetlccode(code,code,code) -- for hyphenation
--         end
--     end
--     if traditional then
--         sfstate = "traditional"
--     end
-- end

-- tex

implement {
    name      = "chardescription",
    arguments = "integer",
    actions   = function(slot)
        local d = data[slot]
        if d then
            context(d.description)
        end
    end,
}

-- xml

characters.activeoffset = 0x10000 -- there will be remapped in that byte range

function commands.remapentity(chr,slot) -- not used
    contextsprint(format("{\\catcode%s=13\\xdef%s{\\string%s}}",slot,utfchar(slot),chr))
end

-- xml.entities = xml.entities or { }
--
-- storage.register("xml/entities",xml.entities,"xml.entities") -- this will move to lxml
--
-- function characters.setmkiventities()
--     local entities = xml.entities
--     entities.lt  = "<"
--     entities.amp = "&"
--     entities.gt  = ">"
-- end
--
-- function characters.setmkiientities()
--     local entities = xml.entities
--     entities.lt  = utfchar(characters.activeoffset + utfbyte("<"))
--     entities.amp = utfchar(characters.activeoffset + utfbyte("&"))
--     entities.gt  = utfchar(characters.activeoffset + utfbyte(">"))
-- end

implement { name = "setlettercatcodes", scope = "private", actions = characters.setlettercatcodes, arguments = "integer" }
implement { name = "setactivecatcodes", scope = "private", actions = characters.setactivecatcodes, arguments = "integer" }
--------- { name = "setcharactercodes", scope = "private", actions = characters.setcodes }

-- experiment (some can move to char-ini.lua)

local function overload(c,u,code,codes)
    local c = tonumber(c)
    if not c then
        return
    end
    local u = utilities.parsers.settings_to_array(u)
    local n = #u
    if n == 0 then
        return
    end
    local t = nil
    if n == 1 then
        t = tonumber(u[1])
    else
        t = { }
        for i=1,n do
            t[#t+1] = tonumber(u[i])
        end
    end
    if t then
        data[c][code] = t
        characters[codes][c] = nil
    end
end

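-- Added example (values are illustrative, not from the original): overloading the
-- uppercase mapping of ß (0x00DF) to ẞ (0x1E9E) would go through this helper as
-- overload("223","7838","uccode","uccodes"), which stores 7838 in data[223].uccode and
-- clears the cached entry in characters.uccodes.
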
interfaces.implement {
    name      = "overloaduppercase",
    arguments = "2 strings",
    actions   = function(c,u)
        overload(c,u,"uccode","uccodes")
    end
}

interfaces.implement {
    name      = "overloadlowercase",
    arguments = "2 strings",
    actions   = function(c,u)
        overload(c,u,"lccode","lccodes")
    end
}