char-utf.lua /size: 13 Kb    last modification: 2021-10-28 13:50
1
-- Register this module's metadata in the global `modules` table, creating
-- that table first when no earlier module has done so.
modules = modules or { }

modules['char-utf'] = {
    version   = 1.001,
    comment   = "companion to char-utf.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}
8 9
--[[ldx--
<p>When a sequence of <l n='utf'/> characters enters the application, it may be
necessary to collapse subsequences into their composed variant.</p>

<p>This module implements methods for collapsing and expanding <l n='utf'/>
sequences. We also provide means to deal with characters that are special to
<l n='tex'/> as well as 8-bit characters that need to end up in special kinds
of output (for instance <l n='pdf'/>).</p>

<p>We implement these manipulations as filters. One can run multiple filters
over a string.</p>

<p>The old code has now been moved to char-obs.lua which we keep around for
educational purposes.</p>
--ldx]]--
24 25
local
next
,
type
=
next
,
type
26
local
gsub
,
find
=
string
.
gsub
,
string
.
find
27
local
concat
,
sortedhash
,
keys
,
sort
=
table
.
concat
,
table
.
sortedhash
,
table
.
keys
,
table
.
sort
28
local
utfchar
,
utfbyte
,
utfcharacters
=
utf
.
char
,
utf
.
byte
,
utf
.
characters
29
local
P
,
Cs
,
Cmt
,
Ct
=
lpeg
.
P
,
lpeg
.
Cs
,
lpeg
.
Cmt
,
lpeg
.
Ct
30 31
if
not
characters
then
require
(
"
char-def
"
)
end
32
if
not
characters
.
blocks
then
require
(
"
char-ini
"
)
end
33 34
local
lpegmatch
=
lpeg
.
match
35
local
lpegpatterns
=
lpeg
.
patterns
36
local
p_utf8character
=
lpegpatterns
.
utf8character
37
local
p_utf8byte
=
lpegpatterns
.
utf8byte
38
local
utfchartabletopattern
=
lpeg
.
utfchartabletopattern
39 40
local
formatters
=
string
.
formatters
41 42
local
allocate
=
utilities
.
storage
.
allocate
or
function
(
)
return
{
}
end
43
local
mark
=
utilities
.
storage
.
mark
or
allocate
44 45
local
charfromnumber
=
characters
.
fromnumber
46 47
-- Make sure the global `characters` namespace exists and keep a local handle
-- to it for the rest of this file.
characters = characters or { }
local characters = characters

-- characters.filters collects string filters; the filters defined in this
-- file are published under characters.filters.utf.
local utffilters = { }
local filters    = allocate()

characters.filters = filters
filters.utf        = utffilters

-- Character records; further down they are indexed by the numeric code
-- point and read for .specials, .mathlist, .combining and .description.
local data = characters.data
57 58
--[[ldx--
<p>It only makes sense to collapse at runtime, since we don't expect source code
to depend on collapsing.</p>
--ldx]]--
62 63
-- For the moment this table lives here; eventually these will become entries
-- in char-def.lua. It is only the subset that we want to have split for
-- typographic (font) reasons: if we decompose everything, we get problems
-- with fonts.
66 67
-- Decomposition table used by utffilters.decompose. Every key is ONE
-- precomposed character (a digraph or ligature) and every value is the
-- sequence of characters it is expanded into. The keys must stay single
-- characters: an empty key or a key that already equals its value turns the
-- entry into a no-op (or a duplicate), so the code point of each key is
-- spelled out in the trailing comment to make accidental normalisation of
-- this source easy to spot.
local decomposed = allocate {
    ["Ĳ"] = "IJ",  -- U+0132 latin capital ligature ij
    ["ĳ"] = "ij",  -- U+0133 latin small ligature ij
    ["և"] = "եւ",  -- U+0587 armenian small ligature ech yiwn
    ["ﬀ"] = "ff",  -- U+FB00
    ["ﬁ"] = "fi",  -- U+FB01
    ["ﬂ"] = "fl",  -- U+FB02
    ["ﬃ"] = "ffi", -- U+FB03
    ["ﬄ"] = "ffl", -- U+FB04
    ["ﬅ"] = "ſt",  -- U+FB05 long s + t
    ["ﬆ"] = "st",  -- U+FB06
    ["ﬓ"] = "մն",  -- U+FB13 armenian men now
    ["ﬔ"] = "մե",  -- U+FB14 armenian men ech
    ["ﬕ"] = "մի",  -- U+FB15 armenian men ini
    ["ﬖ"] = "վն",  -- U+FB16 armenian vew now
    ["ﬗ"] = "մխ",  -- U+FB17 armenian men xeh
}

-- Publish the table so other modules can inspect or extend it.
characters.decomposed = decomposed
86 87
-- Lookup tables derived from the character data:
--
--   graphemes : first char -> { second char -> composed char }  (sort of obsolete)
--   collapsed : "first..second[..more]" -> composed char  (used by utffilters.collapse)
--   combined  : "first..second"         -> composed char  (used by utffilters.combine)
--   mathlists : nested tables keyed by the successive entries of a list, with
--               the code point stored under "specials" or "mathlist" at the leaf
--
-- When characters.graphemes already exists the four tables are only passed
-- to mark(); otherwise they are built here from data and, when a storage
-- table is available, registered with it.
local graphemes = characters.graphemes
local collapsed = characters.collapsed
local combined  = characters.combined
local mathlists = characters.mathlists

if graphemes then

    mark(graphemes)
    mark(collapsed)
    mark(combined)
    mark(mathlists)

else

    graphemes = allocate()
    collapsed = allocate()
    combined  = allocate()
    mathlists = allocate()

    characters.graphemes = graphemes
    characters.collapsed = collapsed
    characters.combined  = combined
    characters.mathlists = mathlists

    -- Walk down the decomposition chain: when the base character `v` itself
    -- is a two-part "char"/"with" special, also register the fully expanded
    -- sequence (base of base .. accent .. last) as collapsing to `target`.
    -- A missing record (v == nil) simply ends the recursion.
    local function backtrack(v,last,target)
        local vs = v and v.specials
        if vs and #vs == 3 then
            local kind = vs[1]
            if kind == "char" or kind == "with" then
                local one    = vs[2]
                local two    = vs[3]
                local first  = utfchar(one)
                local second = utfchar(two) .. last
                collapsed[first..second] = target
                backtrack(data[one],second,target)
            end
        end
    end

    -- Store `unicode` in the mathlists tree under the path formed by
    -- list[start], list[start+1], ... with `category` as the leaf key.
    -- Lists whose first relevant entry is a space (0x20) are ignored.
    local function setlist(unicode,list,start,category)
        if list[start] ~= 0x20 then
            local t = mathlists
            for i=start,#list do
                local l = list[i]
                local f = t[l]
                if not f then
                    f = { }
                    t[l] = f
                end
                t = f
            end
            t[category] = unicode
        end
    end

    -- mathlist entries are collected first and applied after the loop so
    -- that they can add their own leaf next to a "specials" one.
    local mlists = { }

    for unicode, v in next, data do
        local vs = v.specials
        if vs then
            local kind = vs[1]
            local size = #vs
            -- This test used to read `char == "with"`, which compares an
            -- undefined global and therefore never matched; "with" specials
            -- are meant to be collapsible too (backtrack already treats
            -- them that way).
            if kind == "char" or kind == "with" then -- with added
                if size == 3 then
                    local one         = vs[2]
                    local two         = vs[3]
                    local first       = utfchar(one)
                    local second      = utfchar(two)
                    local combination = utfchar(unicode)
                    --
                    if kind == "with" then
                        -- keep feeding utffilters.combine as before
                        combined[first..second] = combination
                    end
                    collapsed[first..second] = combination
                    backtrack(data[one],second,combination)
                    -- sort of obsolete:
                    local cgf = graphemes[first]
                    if not cgf then
                        cgf = { [second] = combination }
                        graphemes[first] = cgf
                    else
                        cgf[second] = combination
                    end
                end
                if size > 2 and (v.mathclass or v.mathspec) then
                    setlist(unicode,vs,2,"specials")
                end
            elseif kind == "compat" then
                if size == 3 then
                    combined[utfchar(vs[2])..utfchar(vs[3])] = utfchar(unicode)
                end
                if size > 2 and (v.mathclass or v.mathspec) then
                    setlist(unicode,vs,2,"specials")
                end
            end
        end
        local ml = v.mathlist
        if ml then
            mlists[unicode] = ml
        end
    end

    -- these win:

    for unicode, ml in next, mlists do
        setlist(unicode,ml,1,"mathlist")
    end

    mlists = nil

    if storage then
        storage.register("characters/graphemes", graphemes, "characters.graphemes")
        storage.register("characters/collapsed", collapsed, "characters.collapsed")
        storage.register("characters/combined",  combined,  "characters.combined")
        storage.register("characters/mathlists", mathlists, "characters.mathlists")
    end

end
208 209
-- Dummy: intentionally does nothing and returns nothing.
characters.initialize = function() end
210 211
-- Suffixes for which the file-aware filters hand back their input untouched;
-- they look up skippable[filesuffix(filename)].
local skippable  = { }
local filesuffix = file.suffix

--- Flag one suffix, or every suffix in a list, as skippable (or not).
-- @param suffix  a single suffix, or a table holding a sequence of suffixes
-- @param value   the flag stored for the suffix(es); nil is treated as true
function utffilters.setskippable(suffix,value)
    local flag = value
    if flag == nil then
        flag = true
    end
    if type(suffix) ~= "table" then
        skippable[suffix] = flag
        return
    end
    for index=1,#suffix do
        skippable[suffix[index]] = flag
    end
end
226 227
-- Compiled collapse pattern; starts out unset so that it is built on first
-- use (and so we can reset it if needed).
local p_collapse = nil

-- Compile p_collapse: every key of `collapsed` found in the subject is
-- substituted by its value, any other utf8 character is copied as is.
local function prepare()
    local replacer = utfchartabletopattern(collapsed) / collapsed
    -- p_collapse = Cs((tree/collapsed + p_utf8character)^0 * P(-1))
    p_collapse = Cs((replacer + p_utf8character)^0)
end
234 235
--- Replace the sequences listed in `collapsed` by their composed character.
-- @param str       the string to filter; nil, "" and one-byte strings are
--                  returned unchanged
-- @param filename  optional; when its suffix is flagged as skippable the
--                  string is returned unchanged
-- @return the filtered string, or `str` itself when nothing was done or the
--         pattern did not match
function utffilters.collapse(str,filename)
    if not p_collapse then
        prepare()
    end
    if not str or str == "" or #str == 1 then
        return str
    end
    -- we could hash the collapsables or do a quicker test
    if filename and skippable[filesuffix(filename)] then
        return str
    end
    return lpegmatch(p_collapse,str) or str
end
247 248
-- Compiled combine pattern, built on first use (only for tex).
local p_combine = nil

-- Compile p_combine: every key of `combined` found in the subject is
-- substituted by its value, any other utf8 character is copied as is.
local function prepare()
    local replacer = utfchartabletopattern(combined) / combined
    p_combine = Cs((replacer + p_utf8character)^0)
end
254 255
--- Replace the sequences listed in `combined` by their single character
-- (not in files).
-- @param str  the string to filter; nil, "" and one-byte strings are
--             returned unchanged
-- @return the filtered string, or `str` itself when the pattern did not match
function utffilters.combine(str)
    -- we could merge collapse into combine ... maybe
    if not p_combine then
        prepare()
    end
    if not str or str == "" or #str == 1 then
        return str
    end
    return lpegmatch(p_combine,str) or str
end
266 267
-- Compiled decompose pattern, built on first use.
local p_decompose = nil

-- Compile p_decompose: every key of `decomposed` found in the subject is
-- substituted by its value, any other utf8 character is copied as is. The
-- trailing P(-1) requires the whole subject to be consumed, so the match
-- fails (yields nil) when that is not possible.
local function prepare()
    local replacer = utfchartabletopattern(decomposed) / decomposed
    p_decompose = Cs((replacer + p_utf8character)^0 * P(-1))
end
273 274
--- Expand the characters listed in `decomposed` into their replacement
-- sequence.
--
-- A leftover early `return lpegmatch(p_decompose,str)` used to sit in front
-- of the checks below. It made the skippable test unreachable and returned
-- nil whenever the (end-anchored) pattern failed to match; both are handled
-- here in the same way as in collapse and reorder.
--
-- @param str       the string to filter; nil, "" and one-byte strings are
--                  returned unchanged
-- @param filename  optional; when its suffix is flagged as skippable the
--                  string is returned unchanged
-- @return the filtered string, or `str` itself when nothing was done or the
--         pattern did not match
function utffilters.decompose(str,filename)
    if not p_decompose then
        prepare()
    end
    if not str or str == "" or #str < 2 then
        return str
    elseif filename and skippable[filesuffix(filename)] then
        return str
    else
        return lpegmatch(p_decompose,str) or str
    end
end
290 291
-- utffilters.addgrapheme(utfchar(318),'l','\string~')
292
-- utffilters.addgrapheme('c','a','b')
293 294
--- Register that `first` followed by `second` composes into `result`.
--
-- Each argument is passed through characters.fromnumber first (can be U+ 0x
-- string or utf or number). The pair is always stored in `graphemes`; it is
-- added to `collapsed` only when that pair has no entry yet, and in that
-- case the compiled collapse pattern is dropped so that the next call to
-- utffilters.collapse rebuilds it with the new pair included.
--
-- @param result  the composed character
-- @param first   the first character of the sequence
-- @param second  the second character of the sequence
function utffilters.addgrapheme(result,first,second)
    result = charfromnumber(result)
    first  = charfromnumber(first)
    second = charfromnumber(second)
    local cgf = graphemes[first]
    if cgf then
        cgf[second] = result
    else
        graphemes[first] = { [second] = result }
    end
    local pair = first .. second
    if not collapsed[pair] then
        collapsed[pair] = result
        -- was `p_composed = nil`: that created a stray global and left the
        -- already compiled pattern unaware of the new pair
        p_collapse = nil
    end
end
309 310
-- Expose addgrapheme through the interfaces mechanism when that is present
-- (eventually this goes to char-ctx.lua).
if interfaces then

    local specification = {
        name      = "addgrapheme",
        actions   = utffilters.addgrapheme,
        arguments = "3 strings",
    }

    interfaces.implement(specification)

end
319 320
-- --
321 322
-- Compiled reorder pattern; stays unset until the first reorder call.
local p_reorder
323 324
-- local sorter = function(a,b) return b[2] < a[2] end
325
--
326
-- local function swapper(s,p,t)
327
-- local old = { }
328
-- for i=1,#t do
329
-- old[i] = t[i][1]
330
-- end
331
-- old = concat(old)
332
-- sort(t,sorter)
333
-- for i=1,#t do
334
-- t[i] = t[i][1]
335
-- end
336
-- local new = concat(t)
337
-- if old ~= new then
338
-- print("reordered",old,"->",new)
339
-- end
340
-- return p, new
341
-- end
342 343
-- -- the next one into stable for similar weights
344 345
-- Order function for sort(): entries are { char, weight, ... } and the one
-- with the larger weight (slot 2) comes first. Equal weights are not "less"
-- in either direction.
local function sorter(a,b)
    return a[2] > b[2]
end
348 349
-- Match-time function used by the reorder pattern.
-- s: the subject (unused), p: the position after the captured run,
-- t: the list of captured { char, weight, ... } entries.
-- The list is sorted in place with `sorter`, each entry is then replaced by
-- its character, and the position plus the joined characters are returned.
local function swapper(s,p,t)
    sort(t,sorter)
    for index=1,#t do
        local entry = t[index]
        t[index] = entry[1]
    end
    local reordered = concat(t)
    return p, reordered
end
356 357
-- -- the next one keeps similar weights in the original order
358
--
359
-- local sorter = function(a,b)
360
-- local b2, a2 = b[2], a[2]
361
-- if a2 == b2 then
362
-- return b[3] > a[3]
363
-- else
364
-- return b2 < a2
365
-- end
366
-- end
367
--
368
-- local function swapper(s,p,t)
369
-- for i=1,#t do
370
-- t[i][3] = i
371
-- end
372
-- sort(t,sorter)
373
-- for i=1,#t do
374
-- t[i] = t[i][1]
375
-- end
376
-- return p, concat(t)
377
-- end
378 379
-- At some point exceptions will become an option; for now it's an experiment
-- to overcome bugs (that have become features) in unicode. Alternatively we
-- might decide on an extra ordering key in char-def that takes precedence
-- over combining.
382 383
-- Sequences that get a fixed replacement before the generic reordering is
-- tried. Key and value are written as explicit utf-8 bytes because they
-- consist of the same two combining marks in a different order: as literal
-- text the two strings are visually indistinguishable and any normalising
-- editor or tool silently turns the entry into an identity mapping.
local exceptions = {
    -- frozen unicode bug: U+64E .. U+651 => U+651 .. U+64E
    -- (U+064E = "\217\142", U+0651 = "\217\145")
    ["\217\142\217\145"] = "\217\145\217\142",
}
387 388
-- Compile p_reorder. Every character whose data record has a `combining`
-- value gets an entry { char, combining, 0 } in a lookup table. The pattern
-- then tries, at each position and in this order:
--   1. an `exceptions` key, replaced by its value;
--   2. a run of two or more such combining characters, collected in a table
--      and handed to `swapper` for sorting;
--   3. any single utf8 character, copied as is.
-- The whole subject has to be consumed (P(-1)).
local function prepare()
    local weights = { }
    for unicode, entry in sortedhash(characters.data) do
        local combining = entry.combining -- v.ordering or v.combining
        if combining then
            local character = utfchar(unicode)
            weights[character] = { character, combining, 0 } -- slot 3 can be used in sort
        end
    end
    local p_exception = utfchartabletopattern(exceptions) / exceptions
    local p_weighted  = utfchartabletopattern(weights) / weights
    local p_sequence  = Cmt(Ct(p_weighted^2),swapper)
    p_reorder = Cs((p_exception + p_sequence + p_utf8character)^0) * P(-1)
end
401 402
--- Reorder runs of combining characters (and apply the fixed exceptions).
-- @param str       the string to filter; nil, "" and one-byte strings are
--                  returned unchanged
-- @param filename  optional; when its suffix is flagged as skippable the
--                  string is returned unchanged
-- @return the filtered string, or `str` itself when nothing was done or the
--         pattern did not match
function utffilters.reorder(str,filename)
    if not p_reorder then
        prepare()
    end
    if not str or str == "" or #str < 2 then
        return str
    end
    if filename and skippable[filesuffix(filename)] then
        return str
    end
    return lpegmatch(p_reorder,str) or str
end
415 416
-- local collapse = utffilters.collapse
417
-- local decompose = utffilters.decompose
418
-- local reorder = utffilters.reorder
419
--
420
-- local c1, c2, c3 = "a", "̂", "̃"
421
-- local r2, r3 = "â", "ẫ"
422
-- local l1 = "ffl"
423
--
424
-- local str = c1..c2..c3 .. " " .. c1..c2 .. " " .. l1
425
-- local res = r3 .. " " .. r2 .. " " .. "ffl"
426
--
427
-- local text = io.loaddata("t:/sources/tufte.tex")
428
--
429
-- local function test(n)
430
-- local data = text .. string.rep(str,100) .. text
431
-- local okay = text .. string.rep(res,100) .. text
432
-- local t = os.clock()
433
-- for i=1,10000 do
434
-- collapse(data)
435
-- decompose(data)
436
-- -- reorder(data)
437
-- end
438
-- print(os.clock()-t,decompose(collapse(data))==okay,decompose(collapse(str)))
439
-- end
440
--
441
-- test(050)
442
-- test(150)
443
--
444
-- local old = "foo" .. string.char(0xE1) .. "bar"
445
-- local new = collapse(old)
446
-- print(old,new)
447 448
-- local one_old = "فَأَصَّدَّقَ دَّ" local one_new = utffilters.reorder(one_old)
449
-- local two_old = "فَأَصَّدَّقَ دَّ" local two_new = utffilters.reorder(two_old)
450
--
451
-- print(one_old,two_old,one_old==two_old,false)
452
-- print(one_new,two_new,one_new==two_new,true)
453
--
454
-- local test = "foo" .. utf.reverse("ؚ" .. "ً" .. "ٌ" .. "ٍ" .. "َ" .. "ُ" .. "ِ" .. "ّ" .. "ْ" ) .. "bar"
455
-- local done = utffilters.reorder(test)
456
--
457
-- print(test,done,test==done,false)
458 459
-- Formatters for the verbose filter: the fallback formats the number itself
-- with %U, the other one wraps a description string in brackets.
local f_default     = formatters["[%U]"]
local f_description = formatters["[%s]"]

-- Turn the number `n` into a bracketed label: the description from its data
-- record when there is one, the %U rendering of the number otherwise.
local function convert(n)
    local entry       = data[n]
    local description = entry and entry.description
    if not description then
        return f_default(n)
    end
    return f_description(description)
end
471 472
-- One or more utf8 characters, each captured as a number by p_utf8byte and
-- substituted by the label that `convert` produces for it.
local pattern = Cs((p_utf8byte/convert)^1)

--- Render every character of a string as a bracketed label.
-- @param data  the string to render (this parameter shadows the file-level
--              `data` table inside the function)
-- @return the rendered string, or "" when `data` is nil/false or the
--         pattern does not match
function utffilters.verbose(data)
    if data then
        local rendered = lpegmatch(pattern,data)
        if rendered then
            return rendered
        end
    end
    return ""
end
477 478
return
characters
479