regi-ini.lua /size: 13 Kb    last modification: 2020-07-01 14:35
1
-- Registration record for this file in the global module table.
modules = modules or { }

modules['regi-ini'] = {
    version   = 1.001,
    comment   = "companion to regi-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8 9
--[[ldx--
<p>Regimes take care of converting the input characters into
<l n='utf'/> sequences. The conversion tables are loaded at
runtime.</p>
--ldx]]--
14 15
-- Todo: use regi-imp*.lua instead
16 17
local
commands
,
context
=
commands
,
context
18 19 20
local
tostring
=
tostring
21
local
utfchar
=
utf
.
char
22
local
P
,
Cs
,
Cc
,
lpegmatch
=
lpeg
.
P
,
lpeg
.
Cs
,
lpeg
.
Cc
,
lpeg
.
match
23
local
char
,
gsub
,
format
,
gmatch
,
byte
,
match
,
lower
=
string
.
char
,
string
.
gsub
,
string
.
format
,
string
.
gmatch
,
string
.
byte
,
string
.
match
,
string
.
lower
24
local
next
=
next
25
local
insert
,
remove
,
fastcopy
=
table
.
insert
,
table
.
remove
,
table
.
fastcopy
26
local
concat
=
table
.
concat
27
local
totable
=
string
.
totable
28 29
local
allocate
=
utilities
.
storage
.
allocate
30
local
sequencers
=
utilities
.
sequencers
31
local
textlineactions
=
resolvers
.
openers
.
helpers
.
textlineactions
32
local
setmetatableindex
=
table
.
setmetatableindex
33 34
--[[ldx--
<p>We will hook regime handling code into the input methods.</p>
--ldx]]--
37 38
-- Tracing is off by default; the "regimes.translating" tracker callback
-- stores whatever value it receives in this flag.
local trace_translating = false

trackers.register("regimes.translating", function(enabled)
    trace_translating = enabled
end)

-- Reporters for the two log categories used in this file.
local report_loading     = logs.reporter("regimes","loading")
local report_translating = logs.reporter("regimes","translating")
42 43
-- Global namespace table (reused when it already exists) plus a local alias.
regimes = regimes or { }

local regimes = regimes

-- Regime name -> conversion vector. The "utf" entry is preset to false,
-- which the rest of this file treats as "no translation".
local mapping = allocate {
    utf = false
}

-- Regime name -> reversed vector; starts out empty.
local backmapping = allocate {
}
52 53
-- regimes.mapping = mapping
54 55
-- Alias -> canonical regime name (backward compatibility list).
local synonyms = {

    -- windows code pages

    ["windows-1250"] = "cp1250",
    ["windows-1251"] = "cp1251",
    ["windows-1252"] = "cp1252",
    ["windows-1253"] = "cp1253",
    ["windows-1254"] = "cp1254",
    ["windows-1255"] = "cp1255",
    ["windows-1256"] = "cp1256",
    ["windows-1257"] = "cp1257",
    ["windows-1258"] = "cp1258",

    -- short iso latin names

    ["il1"]  = "8859-1",
    ["il2"]  = "8859-2",
    ["il3"]  = "8859-3",
    ["il4"]  = "8859-4",
    ["il5"]  = "8859-9",
    ["il6"]  = "8859-10",
    ["il7"]  = "8859-13",
    ["il8"]  = "8859-14",
    ["il9"]  = "8859-15",
    ["il10"] = "8859-16",

    -- full iso names

    ["iso-8859-1"]  = "8859-1",
    ["iso-8859-2"]  = "8859-2",
    ["iso-8859-3"]  = "8859-3",
    ["iso-8859-4"]  = "8859-4",
    ["iso-8859-9"]  = "8859-9",
    ["iso-8859-10"] = "8859-10",
    ["iso-8859-13"] = "8859-13",
    ["iso-8859-14"] = "8859-14",
    ["iso-8859-15"] = "8859-15",
    ["iso-8859-16"] = "8859-16",

    -- latin names

    ["latin1"]  = "8859-1",
    ["latin2"]  = "8859-2",
    ["latin3"]  = "8859-3",
    ["latin4"]  = "8859-4",
    ["latin5"]  = "8859-9",
    ["latin6"]  = "8859-10",
    ["latin7"]  = "8859-13",
    ["latin8"]  = "8859-14",
    ["latin9"]  = "8859-15",
    ["latin10"] = "8859-16",

    -- utf variants (the empty name also means utf)

    ["utf-8"] = "utf",
    ["utf8"]  = "utf",
    [""]      = "utf",

    -- miscellaneous

    ["windows"] = "cp1252",

    ["pdf"] = "pdfdoc",

    ["437"] = "ibm",

}

-- Name of the regime that is currently active.
local currentregime = "utf"
112 113
-- Loader for missing entries of `mapping` (hooked in further down with
-- setmetatableindex; presumably that installs it as the __index handler).
--
-- mapping : the table that gets the loaded vector
-- regime  : the requested key; it is stringified, lowercased and resolved
--           via `synonyms` (also with a "windows-" prefix) before use
--
-- Returns a table mapping single byte strings to utf strings, or false
-- when no "regi-<regime>.lua" file can be found. The result is stored in
-- `mapping` under the resolved name.
local function loadregime(mapping,regime)
    regime = lower(tostring(regime))
    regime = synonyms[regime] or synonyms["windows-"..regime] or regime
    -- The key that brought us here can be an alias or differ in case from
    -- the resolved name. If the resolved name is already present (this
    -- includes the preset utf = false) we reuse it instead of searching
    -- and loading the file again.
    local known = rawget(mapping,regime)
    if known ~= nil then
        return known
    end
    local name = resolvers.findfile(format("regi-%s.lua",regime)) or ""
    local data = name ~= "" and dofile(name)
    -- must be local: the original assigned a global here
    local vector
    if data then
        vector = { }
        for eightbit, unicode in next, data do
            vector[char(eightbit)] = utfchar(unicode)
        end
        report_loading("vector %a is loaded",regime)
    else
        vector = false
        report_loading("vector %a is unknown",regime)
    end
    mapping[regime] = vector
    return vector
end
131 132
-- Loader for missing entries of `backmapping`: builds the reverse of
-- mapping[k], that is, a table from mapped value back to its key. When
-- mapping[k] is false or nil the result is an empty table. The result is
-- stored in backmapping[k] and returned.
local function loadreverse(t,k)
    local reverse = { }
    local forward = mapping[k]
    if forward then
        for eightbit, sequence in next, forward do
            reverse[sequence] = eightbit
        end
    end
    backmapping[k] = reverse
    return reverse
end
143 144
-- Hook the loaders into both tables and make the tables available in
-- the regimes namespace.
setmetatableindex(mapping,    loadregime)
setmetatableindex(backmapping,loadreverse)

regimes.mapping     = mapping
regimes.backmapping = backmapping
149 150
-- Converts `line` using the vector of `regime` (or of the current regime
-- when `regime` is not given). Every character is looked up in the vector.
-- A nil or empty line, or a regime without a vector, returns `line` as is.
local function fromregime(regime,line)
    if not line or #line == 0 then
        return line
    end
 -- local map = mapping[regime and synonyms[regime] or regime or currentregime]
    local map = mapping[regime or currentregime]
    if not map then
        return line
    end
    local converted = gsub(line,".",map)
    return converted
end
160 161
-- local remappers = { }
162
--
163
-- local function toregime(vector,str,default) -- toregime('8859-1',"abcde Ä","?")
164
-- local t = backmapping[vector]
165
-- local remapper = remappers[vector]
166
-- if not remapper then
167
-- remapper = utf.remapper(t)
168
-- remappers[t] = remapper
169
-- end
170
-- local m = getmetatable(t)
171
-- setmetatableindex(t, function(t,k)
172
-- local v = default or "?"
173
-- t[k] = v
174
-- return v
175
-- end)
176
-- str = remapper(str)
177
-- setmetatable(t,m)
178
-- return str
179
-- end
180
--
181
-- -- much faster (but only matters when we have > 10K calls
182 183
-- Per vector cache; an entry is created on first access and holds a
-- `remappers` table (filled by toregime, keyed by default string).
local cache = { } -- if really needed we can copy vectors and hash defaults

setmetatableindex(cache, function(store,vector)
    local entry = { remappers = { } }
    store[vector] = entry
    return entry
end)
190 191
-- Converts the utf string `str` to the regime `vector`, for example
-- toregime('8859-1',"abcde Ä","?"). Anything that has no entry in the
-- reverse vector is replaced by `default` ("?" when not given). The
-- remapper function is built once per vector and default and then reused.
-- A nil or empty `str` gives an empty string.
local function toregime(vector,str,default)
    local fallback  = default or "?"
    local remappers = cache[vector].remappers
    local remapper  = remappers[fallback]
    if remapper then
        return remapper(str)
    end
    local reverse  = fastcopy(backmapping[vector])
 -- remapper = utf.remapper(reverse) -- not good for defaults here
    local known    = lpeg.utfchartabletopattern(reverse) / reverse
    local unknown  = lpeg.patterns.utf8character / fallback
    local leftover = P(1) / fallback
    local pattern  = Cs((known + unknown + leftover)^0)
    remapper = function(s)
        if s and s ~= "" then
            return lpegmatch(pattern,s)
        end
        return ""
    end
    remappers[fallback] = remapper
    return remapper(str)
end
210 211
-- Switches back to "utf" and disables the "regimes.process" text line
-- action. Returns the (new) current regime name.
local function disable()
    currentregime = "utf"
    sequencers.disableaction(textlineactions,"regimes.process")
    return currentregime
end
216 217
-- Makes `regime` (after synonym resolution) the current regime and enables
-- the "regimes.process" text line action. When the regime has no vector
-- (its mapping entry is false) translation is disabled instead. Returns
-- the resulting current regime name.
local function enable(regime)
    local resolved = synonyms[regime] or regime
    if mapping[resolved] == false then
        disable()
        return currentregime
    end
    currentregime = resolved
    sequencers.enableaction(textlineactions,"regimes.process")
    return currentregime
end
227 228
-- Public interface.
regimes.toregime   = toregime
regimes.fromregime = fromregime
regimes.enable     = enable
regimes.disable    = disable

-- Same as fromregime but with the arguments swapped.
function regimes.translate(str,regime)
    return fromregime(regime,str)
end
233 234
-- The following function can be used when we want to make sure that
235
-- utf gets passed unharmed. This is needed for modules.
236 237
-- Nesting depth maintained by push/pop; translation only happens at
-- level zero.
local level = 0

-- Text line action: translates `str` from the current regime unless we are
-- inside a push/pop pair or `coding` equals "utf-8". The arguments
-- filename, currentline and noflines are not used here.
function regimes.process(str,filename,currentline,noflines,coding)
    if level ~= 0 or coding == "utf-8" then
        return str
    end
    str = fromregime(currentregime,str)
    if trace_translating then
        report_translating("utf: %s",str)
    end
    return str
end
248 249
-- Increments the nesting level; regimes.process leaves lines untouched
-- as long as the level is not zero.
local function push()
    level = level + 1
    if not trace_translating then
        return
    end
    report_translating("pushing level %s",level)
end
255 256
-- Decrements the nesting level, but never below zero. The trace message
-- shows the level before it is decremented.
local function pop()
    if level <= 0 then
        return
    end
    if trace_translating then
        report_translating("popping level %s",level)
    end
    level = level - 1
end

regimes.push = push
regimes.pop  = pop
267 268
-- Returns a sorted list with the names of the regimes for which a
-- "regi-<name>.lua" file sits next to "regi-ini.lua". This file itself
-- ("ini") is not part of the list. When "regi-ini.lua" cannot be located
-- the list is empty.
function regimes.list()
    local name = resolvers.findfile("regi-ini.lua") or ""
    local okay = { }
    if name ~= "" then
        local list = dir.glob(file.join(file.dirname(name),"regi-*.lua"))
        for i=1,#list do
            -- the entries can carry a path, so we only look at the last
            -- path component (no slashes or backslashes in the capture)
            local regime = match(list[i],"regi%-([^/\\]-)%.lua$")
            if regime and regime ~= "ini" then
                okay[#okay+1] = regime
            end
        end
        -- sorting once, after collecting, is enough
        table.sort(okay)
    end
    return okay
end
283 284
-- Register the processor in the "system" group of the text line actions;
-- it stays disabled until a regime gets enabled.
if sequencers then
    sequencers.prependaction(textlineactions,"system","regimes.process")
    sequencers.disableaction(textlineactions,"regimes.process")
end
290 291
-- Next we provide some hacks. Unfortunately we run into badly encoded
-- (read: mixed encoding) xml files that have sequences like Ã« Ã¤ Ã¶ Ã¼
-- instead of ë ä ö ü
294 295
-- Cleanup patterns built by regimes.cleanup, stored per regime name; an
-- entry is false when there is nothing to clean up for that regime.
local patterns = { }
296 297
-- function regimes.cleanup(regime,str)
298
-- local p = patterns[regime]
299
-- if p == nil then
300
-- regime = regime and synonyms[regime] or regime or currentregime
301
-- local vector = regime ~= "utf" and mapping[regime]
302
-- if vector then
303
-- local list = { }
304
-- for k, uchar in next, vector do
305
-- local stream = totable(uchar)
306
-- for i=1,#stream do
307
-- stream[i] = vector[stream[i]]
308
-- end
309
-- list[concat(stream)] = uchar
310
-- end
311
-- p = lpeg.append(list,nil,true)
312
-- p = Cs((p+1)^0)
313
-- -- lpeg.print(p) -- size 1604
314
-- else
315
-- p = false
316
-- end
317
-- patterns[vector] = p
318
-- end
319
-- return p and lpegmatch(p,str) or str
320
-- end
321
--
322
-- twice as fast and much less lpeg bytecode
323 324
-- function regimes.cleanup(regime,str)
325
-- if not str or str == "" then
326
-- return str
327
-- end
328
-- local p = patterns[regime]
329
-- if p == nil then
330
-- regime = regime and synonyms[regime] or regime or currentregime
331
-- local vector = regime ~= "utf" and regime ~= "utf-8" and mapping[regime]
332
-- if vector then
333
-- local utfchars = { }
334
-- local firsts = { }
335
-- for k, uchar in next, vector do
336
-- local stream = { }
337
-- local split = totable(uchar)
338
-- local nofsplits = #split
339
-- if nofsplits > 1 then
340
-- local first
341
-- for i=1,nofsplits do
342
-- local u = vector[split[i]]
343
-- if not first then
344
-- first = firsts[u]
345
-- if not first then
346
-- first = { }
347
-- firsts[u] = first
348
-- end
349
-- end
350
-- stream[i] = u
351
-- end
352
-- local nofstream = #stream
353
-- if nofstream > 1 then
354
-- first[#first+1] = concat(stream,2,nofstream)
355
-- utfchars[concat(stream)] = uchar
356
-- end
357
-- end
358
-- end
359
-- p = P(false)
360
-- for k, v in next, firsts do
361
-- local q = P(false)
362
-- for i=1,#v do
363
-- q = q + P(v[i])
364
-- end
365
-- p = p + P(k) * q
366
-- end
367
-- p = Cs(((p+1)/utfchars)^1)
368
-- -- lpeg.print(p) -- size: 1042
369
-- else
370
-- p = false
371
-- end
372
-- patterns[regime] = p
373
-- end
374
-- return p and lpegmatch(p,str) or str
375
-- end
376
--
377
-- 5 times faster:
378 379
-- Repairs strings in which the bytes of a utf sequence have each been
-- turned into a utf character of their own. For every value in the vector
-- of `regime` the broken form is computed (each byte of the value passed
-- through utfchar) and a pattern is built that maps that broken form back
-- to the value. The pattern is cached per regime name.
--
-- regime : regime name or synonym; the current regime is used when it is
--          not given
-- str    : the string to clean up
--
-- Returns the cleaned string, or `str` itself when it is nil or empty or
-- when the regime is utf or has no vector.
function regimes.cleanup(regime,str)
    if not str or str == "" then
        return str
    end
    -- Resolve before the cache lookup: the pattern is stored under the
    -- resolved name, so looking it up under the name as given would miss
    -- for synonyms and for a nil regime and rebuild the pattern each time.
    regime = regime and synonyms[regime] or regime or currentregime
    local p = patterns[regime]
    if p == nil then
        local vector = regime ~= "utf" and regime ~= "utf-8" and mapping[regime]
        if vector then
            local repairs = { }
            for _, sequence in next, vector do
                local split = totable(sequence)
                for i=1,#split do
                    split[i] = utfchar(byte(split[i]))
                end
                local broken = concat(split)
                if sequence ~= broken then
                    repairs[broken] = sequence
                end
            end
            p = Cs((lpeg.utfchartabletopattern(repairs)/repairs + P(1))^0)
        else
            p = false
        end
        patterns[regime] = p
    end
    return p and lpegmatch(p,str) or str
end
407 408
-- local old = [[test Ã« Ã¤ Ã¶ Ã¼ crap]]
409
-- local new = regimes.cleanup("cp1252",old)
410
-- report_translating("%s -> %s",old,new)
411
-- local old = "Pozn" .. char(0xE1) .. "mky"
412
-- local new = fromregime("cp1250",old)
413
-- report_translating("%s -> %s",old,new)
414 415
-- interface (might move to regi-tex.lua)
416 417
-- Interface to the TeX end: every action that changes the regime also
-- stores the resulting regime name in the macro "currentregime".
if interfaces then

    local implement = interfaces.implement
    local setmacro  = interfaces.setmacro

    local function enableregime(regime)
        setmacro("currentregime",enable(regime))
    end

    local function disableregime()
        setmacro("currentregime",disable())
    end

    implement { name = "enableregime",  actions = enableregime, arguments = "string" }
    implement { name = "disableregime", actions = disableregime }
    implement { name = "pushregime",    actions = push }
    implement { name = "popregime",     actions = pop }

    -- regimes that were active when a start was issued

    local stack = { }

    local function startregime(regime)
        insert(stack,currentregime)
        if trace_translating then
            report_translating("start using %a",regime)
        end
        setmacro("currentregime",enable(regime))
    end

    -- a stop without a matching start does nothing

    local function stopregime()
        if #stack == 0 then
            return
        end
        local regime = remove(stack)
        if trace_translating then
            report_translating("stop using %a",regime)
        end
        setmacro("currentregime",enable(regime))
    end

    implement { name = "startregime", actions = startregime, arguments = "string" }
    implement { name = "stopregime",  actions = stopregime }

end
471 472
-- Actually we can have a function returned from the lookup but we don't
473
-- really use this code so I'm in no hurry.
474 475
-- if os.getcodepage then
476
-- local cod, acp, map
477
-- function os.tocodepage(name)
478
-- if map == nil then
479
-- cod, acp = os.getcodepage()
480
-- map = cod and cod ~= 65001 and regimes.toregime
481
-- end
482
-- return map and map(cod,name) or name
483
-- end
484
-- else
485
-- function os.tocodepage(name)
486
-- return name
487
-- end
488
-- end
489