l-unicode.lua /size: 40 Kb    last modification: 2020-07-01 14:35
1
-- Register the bookkeeping record of this module in the global 'modules' table,
-- creating that table when this is the first module that gets loaded.

modules = modules or { }

modules['l-unicode'] = {
    version   = 1.001,
    comment   = "companion to luat-lib.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8 9
-- floor(b/256) => rshift(b, 8)
10
-- floor(b/1024) => rshift(b,10)
11 12
-- in lua 5.3:
13 14
-- utf8.char(···) : concatenated
15
-- utf8.charpatt : "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
16
-- utf8.codes(s) : for p, c in utf8.codes(s) do body end
17
-- utf8.codepoint(s [, i [, j]])
18
-- utf8.len(s [, i])
19
-- utf8.offset(s, n [, i])
20 21
-- todo: utf.sub replacement (used in syst-aux)
22
-- we put these in the utf namespace:
23 24
-- used : byte char len lower sub upper
25
-- not used : dump find format gmatch gfind gsub match rep reverse
26 27
-- utf = utf or (unicode and unicode.utf8) or { }
28 29
-- not supported:
30
--
31
-- dump, find, format, gfind, gmatch, gsub, lower, match, rep, reverse, upper
32 33
-- The utf namespace is shared with other modules, so an existing table is reused.
-- The global 'unicode' is cleared here.

utf     = utf or { }
unicode = nil

if not string.utfcharacters then

    -- New: this gmatch hack is taken from the Lua 5.2 book. It's about two times
    -- slower than the built-in string.utfcharacters.

    local gmatch = string.gmatch

    -- Returns an iterator over str that yields one lead byte plus all continuation
    -- bytes (0x80-0xBF) that follow it.
    function string.characters(str)
        return gmatch(str,".[\128-\191]*")
    end

end

-- When there is no built-in string.utfcharacters the fallback above is stored as
-- string.characters, so that one has to be picked up here; otherwise utf.characters
-- would end up being nil in exactly the case where the fallback was installed.

utf.characters = string.utfcharacters or string.characters
51 52
-- string.utfvalues
53
-- string.utfcharacters
54
-- string.characters
55
-- string.characterpairs
56
-- string.bytes
57
-- string.bytepairs
58
-- string.utflength
59
-- string.utfvalues
60
-- string.utfcharacters
61 62
local
type
=
type
63
local
char
,
byte
,
format
,
sub
,
gmatch
=
string
.
char
,
string
.
byte
,
string
.
format
,
string
.
sub
,
string
.
gmatch
64
local
concat
=
table
.
concat
65
local
P
,
C
,
R
,
Cs
,
Ct
,
Cmt
,
Cc
,
Carg
,
Cp
=
lpeg
.
P
,
lpeg
.
C
,
lpeg
.
R
,
lpeg
.
Cs
,
lpeg
.
Ct
,
lpeg
.
Cmt
,
lpeg
.
Cc
,
lpeg
.
Carg
,
lpeg
.
Cp
66 67
local
lpegmatch
=
lpeg
.
match
68
local
patterns
=
lpeg
.
patterns
69
local
tabletopattern
=
lpeg
.
utfchartabletopattern
70 71
local
bytepairs
=
string
.
bytepairs
72 73
local
finder
=
lpeg
.
finder
74
local
replacer
=
lpeg
.
replacer
75 76
local
p_utftype
=
patterns
.
utftype
77
local
p_utfstricttype
=
patterns
.
utfstricttype
78
local
p_utfoffset
=
patterns
.
utfoffset
79
local
p_utf8character
=
patterns
.
utf8character
80
local
p_utf8char
=
patterns
.
utf8char
81
local
p_utf8byte
=
patterns
.
utf8byte
82
local
p_utfbom
=
patterns
.
utfbom
83
local
p_newline
=
patterns
.
newline
84
local
p_whitespace
=
patterns
.
whitespace
85 86
-- if not unicode then
87
-- unicode = { utf = utf } -- for a while
88
-- end
89 90
-- Make sure that utf.char exists. A native implementation is preferred
-- (string.utfcharacter, then utf8.char); only when neither is available a pure
-- Lua encoder is installed. The fallback handles exactly one code point per call.

if not utf.char then

    utf.char = string.utfcharacter or (utf8 and utf8.char)

    if not utf.char then

        -- no multiples

        local char = string.char

        -- shift(n,bits) drops the lowest 'bits' bits of n; we use bit32.rshift
        -- when that library is present and a floored division otherwise

        local shift

        if bit32 then

            shift = bit32.rshift

        else

            local floor = math.floor

            shift = function(n,bits)
                return floor(n/2^bits)
            end

        end

        -- Encodes code point n as UTF-8; values of 0x200000 and up give "".
        function utf.char(n)
            if n < 0x80 then
                -- 0aaaaaaa
                return char(n)
            end
            if n < 0x800 then
                -- 110bbbaa 10aaaaaa
                local b1 = 0xC0 + shift(n,6)
                local b2 = 0x80 + (n % 0x40)
                return char(b1,b2)
            end
            if n < 0x10000 then
                -- 1110bbbb 10bbbbaa 10aaaaaa
                local b1 = 0xE0 + shift(n,12)
                local b2 = 0x80 + (shift(n,6) % 0x40)
                local b3 = 0x80 + (n % 0x40)
                return char(b1,b2,b3)
            end
            if n < 0x200000 then
                -- 11110ccc 10ccbbbb 10bbbbaa 10aaaaaa
                local b1 = 0xF0 + shift(n,18)
                local b2 = 0x80 + (shift(n,12) % 0x40)
                local b3 = 0x80 + (shift(n,6) % 0x40)
                local b4 = 0x80 + (n % 0x40)
                return char(b1,b2,b3,b4)
            end
            return ""
        end

    end

end
176 177
-- Make sure that utf.byte exists: a native function when there is one
-- (string.utfvalue, then utf8.codepoint), otherwise a match against the
-- p_utf8byte pattern.

if not utf.byte then

    local native = string.utfvalue or (utf8 and utf8.codepoint)

    if native then
        utf.byte = native
    else
        utf.byte = function(c)
            return lpegmatch(p_utf8byte,c)
        end
    end

end
190 191
local
utfchar
,
utfbyte
=
utf
.
char
,
utf
.
byte
192 193
-- As we want to get rid of the (unmaintained) utf library we implement our own
194
-- variants (in due time an independent module):
195 196
-- Returns whatever the p_utftype pattern reports for data; when data is missing or
-- the pattern gives no result the string "unknown" is returned.
function utf.filetype(data)
    if data then
        local detected = lpegmatch(p_utftype,data)
        if detected then
            return detected
        end
    end
    return "unknown"
end
199 200
-- A substitution pattern that keeps what patterns.utf8one matches and replaces
-- whatever the two, three and four byte patterns match by a hexadecimal entity.

local toentities

do

    -- s is one multi-byte sequence; its code point decides the replacement
    local function entity(s)
        local b = utfbyte(s)
        if b < 127 then
            return s
        end
        return format("&#%X;",b)
    end

    local p_multi = patterns.utf8two + patterns.utf8three + patterns.utf8four

    toentities = Cs((patterns.utf8one + p_multi/entity)^0)

end

patterns.toentities = toentities

-- Applies the toentities pattern to str and returns the result of the match.
function utf.toentities(str)
    return lpegmatch(toentities,str)
end
216 217
-- local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin)
218
--
219
-- setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } )
220
--
221
-- collectgarbage("collect")
222
-- local u = collectgarbage("count")*1024
223
-- local t = os.clock()
224
-- for i=1,1000 do
225
-- for i=1,600 do
226
-- local a = utfchr[i]
227
-- end
228
-- end
229
-- print(os.clock()-t,collectgarbage("count")*1024-u)
230 231
-- collectgarbage("collect")
232
-- local t = os.clock()
233
-- for i=1,1000 do
234
-- for i=1,600 do
235
-- local a = utfchar(i)
236
-- end
237
-- end
238
-- print(os.clock()-t,collectgarbage("count")*1024-u)
239 240
-- local byte = string.byte
241
-- local utfchar = utf.char
242 243
-- Conversion of a string that starts with a UTF-16 byte order mark into UTF-8.
--
-- Three things were wrong in the previous version:
--
-- * the surrogate test was R(utfchar(0xD8),utfchar(0xFF)); utfchar gives two byte
--   strings, and lpeg.R reads each two byte string as one range, so both ranges
--   were empty and surrogate pairs were never combined
-- * a 16 bit unit is 0x100 * first byte + second byte, not 0xFF * first byte
-- * in little endian data the surrogate lead byte is the second byte of a unit

local one  = P(1)
local two  = C(1) * C(1)

-- a big endian surrogate pair: high surrogate (lead byte 0xD8-0xDB) followed by a
-- low surrogate (lead byte 0xDC-0xDF)

local four = C(R("\216\219")) * C(1) * C(R("\220\223")) * C(1)

local pattern

do

    -- the same pair with the bytes of each unit swapped
    local four_le = C(1) * C(R("\216\219")) * C(1) * C(R("\220\223"))

    -- a,b are the most and least significant byte of the high surrogate and c,d
    -- those of the low surrogate
    local function pair(a,b,c,d)
        local ab = 0x100 * byte(a) + byte(b)
        local cd = 0x100 * byte(c) + byte(d)
        return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
    end

    -- a,b are the most and least significant byte of a single unit
    local function unit(a,b)
        return utfchar(byte(a)*256 + byte(b))
    end

    pattern = P("\254\255") * Cs( (
            four / pair
          + two  / unit
          + one
        )^1 )
      + P("\255\254") * Cs( (
            four_le / function(b,a,d,c) return pair(a,b,c,d) end
          + two     / function(b,a)     return unit(a,b)     end
          + one
        )^1 )

end

-- Returns s converted to UTF-8 when it starts with a UTF-16 byte order mark (FE FF
-- or FF FE) and s itself otherwise.
function string.toutf(s) -- in string namespace
    return lpegmatch(pattern,s) or s -- todo: utf32
end
273 274
-- A sanitizer: everything matched by the one to four byte patterns is kept and
-- every other byte is dropped. It is also exported as patterns.validatedutf.

local validatedutf = Cs (
    (
        patterns.utf8one
      + patterns.utf8two
      + patterns.utf8three
      + patterns.utf8four
      + P(1) / ""
    )^0
)

patterns.validatedutf = validatedutf

-- Returns str when it is a string that the sanitizer leaves untouched and false
-- otherwise.
--
-- The previous version returned the sanitized string, which is always a true value
-- in Lua (even when empty), so invalid input could never be detected. For valid
-- input the return value is the same as before. Use patterns.validatedutf when the
-- cleaned up string is wanted.
function utf.is_valid(str)
    if type(str) ~= "string" then
        return false
    end
    -- nothing was dropped if and only if the result equals the input
    if lpegmatch(validatedutf,str) == str then
        return str
    end
    return false
end
289 290
if
not
utf
.
len
then
291 292
utf
.
len
=
string
.
utflength
or
(
utf8
and
utf8
.
len
)
293 294
if
not
utf
.
len
then
295 296
-- -- alternative 1: 0.77
297
--
298
-- local utfcharcounter = utfbom^-1 * Cs((p_utf8character/'!')^0)
299
--
300
-- function utf.len(str)
301
-- return #lpegmatch(utfcharcounter,str or "")
302
-- end
303
--
304
-- -- alternative 2: 1.70
305
--
306
-- local n = 0
307
--
308
-- local utfcharcounter = utfbom^-1 * (p_utf8character/function() n = n + 1 end)^0 -- slow
309
--
310
-- function utf.length(str)
311
-- n = 0
312
-- lpegmatch(utfcharcounter,str or "")
313
-- return n
314
-- end
315
--
316
-- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047)
317 318
-- local n = 0
319
--
320
-- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( Cp() * (
321
-- -- patterns.utf8one ^1 * Cc(1)
322
-- -- + patterns.utf8two ^1 * Cc(2)
323
-- -- + patterns.utf8three^1 * Cc(3)
324
-- -- + patterns.utf8four ^1 * Cc(4) ) * Cp() / function(f,d,t) n = n + (t - f)/d end
325
-- -- )^0 ) -- just as many captures as below
326
--
327
-- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( (
328
-- -- (Cmt(patterns.utf8one ^1,function(_,_,s) n = n + #s return true end))
329
-- -- + (Cmt(patterns.utf8two ^1,function(_,_,s) n = n + #s/2 return true end))
330
-- -- + (Cmt(patterns.utf8three^1,function(_,_,s) n = n + #s/3 return true end))
331
-- -- + (Cmt(patterns.utf8four ^1,function(_,_,s) n = n + #s/4 return true end))
332
-- -- )^0 ) -- not interesting as it creates strings but sometimes faster
333
--
334
-- -- The best so far:
335
--
336
-- local utfcharcounter = utfbom^-1 * P ( (
337
-- Cp() * (patterns.utf8one )^1 * Cp() / function(f,t) n = n + t - f end
338
-- + Cp() * (patterns.utf8two )^1 * Cp() / function(f,t) n = n + (t - f)/2 end
339
-- + Cp() * (patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end
340
-- + Cp() * (patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end
341
-- )^0 )
342 343
-- function utf.len(str)
344
-- n = 0
345
-- lpegmatch(utfcharcounter,str or "")
346
-- return n
347
-- end
348 349
-- The counter works on runs: a run of sequences of the same width is matched in
-- one go and the number of characters in it is the byte length of the run divided
-- by that width. The start of each run comes from Cp(), so bytes that were skipped
-- before the run (the optional bom) are not counted. The previous version kept the
-- start in a variable that was initialized to 1, which made a leading bom count as
-- characters.

local n = 0

local utfcharcounter = patterns.utfbom^-1 * Cmt (
    Cp() * (
        Cc(1) * patterns.utf8one  ^1
      + Cc(2) * patterns.utf8two  ^1
      + Cc(3) * patterns.utf8three^1
      + Cc(4) * patterns.utf8four ^1
    ),
    function(_,stop,start,width) -- due to Cp and Cc no string captures, so faster
        n = n + (stop - start)/width
        return true
    end
)^0

-- Returns the number of characters that the counter could match from the start of
-- str; a nil str counts as an empty string. The count lives in a shared upvalue
-- that is reset on each call.
function utf.len(str)
    n = 0
    lpegmatch(utfcharcounter,str or "")
    return n
end
368 369
-- -- these are quite a bit slower:
370 371
-- utfcharcounter = utfbom^-1 * (Cmt(P(1) * R("\128\191")^0, function() n = n + 1 return true end))^0 -- 50+ times slower
372
-- utfcharcounter = utfbom^-1 * (Cmt(P(1), function() n = n + 1 return true end) * R("\128\191")^0)^0 -- 50- times slower
373 374
end
375 376
end
377 378
utf
.
length
=
utf
.
len
379 380
if
not
utf
.
sub
then
381 382
-- inefficient as lpeg just copies ^n
383 384
-- local function sub(str,start,stop)
385
-- local pattern = p_utf8character^-(start-1) * C(p_utf8character^-(stop-start+1))
386
-- inspect(pattern)
387
-- return lpegmatch(pattern,str) or ""
388
-- end
389 390
-- local b, e, n, first, last = 0, 0, 0, 0, 0
391
--
392
-- local function slide(s,p)
393
-- n = n + 1
394
-- if n == first then
395
-- b = p
396
-- if not last then
397
-- return nil
398
-- end
399
-- end
400
-- if n == last then
401
-- e = p
402
-- return nil
403
-- else
404
-- return p
405
-- end
406
-- end
407
--
408
-- local pattern = Cmt(p_utf8character,slide)^0
409
--
410
-- function utf.sub(str,start,stop) -- todo: from the end
411
-- if not start then
412
-- return str
413
-- end
414
-- b, e, n, first, last = 0, 0, 0, start, stop
415
-- lpegmatch(pattern,str)
416
-- if not stop then
417
-- return sub(str,b)
418
-- else
419
-- return sub(str,b,e-1)
420
-- end
421
-- end
422 423
-- print(utf.sub("Hans Hagen is my name"))
424
-- print(utf.sub("Hans Hagen is my name",5))
425
-- print(utf.sub("Hans Hagen is my name",5,10))
426 427
local utflength = utf.length

-- also negative indices, upto 10 times slower than a c variant

-- Scratch state shared by the match time callbacks below: n counts the characters
-- seen so far, first and last are the character counts we are looking for and b
-- and e receive the byte positions that are handed to string.sub afterwards.

local b, e, n, first, last = 0, 0, 0, 0, 0

-- Each callback gets the subject and the position right after the character that
-- was just matched. Returning nothing ends the repetition.

-- only looks for the end
local function slide_zero(s,p)
    n = n + 1
    if n >= last then
        e = p - 1
        return
    end
    return p
end

-- looks for the begin as well as the end
local function slide_one(s,p)
    n = n + 1
    if n == first then
        b = p
    end
    if n >= last then
        e = p - 1
        return
    end
    return p
end

-- only looks for the begin
local function slide_two(s,p)
    n = n + 1
    if n ~= first then
        return true
    end
    b = p
end

local pattern_zero  = Cmt(p_utf8character,slide_zero)^0
local pattern_one   = Cmt(p_utf8character,slide_one )^0
local pattern_two   = Cmt(p_utf8character,slide_two )^0

local pattern_first = C(p_utf8character)

-- A character based string.sub: start and stop count characters and negative
-- values count from the end. Without start the string itself is returned and a
-- start of zero is treated as one.
function utf.sub(str,start,stop)
    if not start then
        return str
    end
    if start == 0 then
        start = 1
    end
    if not stop then
        -- open ended: we only need the byte position where the slice begins
        if start < 0 then
            start = utflength(str) + start -- we can inline this function if needed
        else
            start = start - 1
        end
        b, n, first = 0, 0, start
        lpegmatch(pattern_two,str)
        if n >= first then
            return sub(str,b)
        end
        return ""
    end
    if start < 0 or stop < 0 then
        -- turn indices from the end into indices from the begin
        local l = utf.length(str)
        if start < 0 then
            start = l + start
            start = start <= 0 and 1 or start + 1
        end
        if stop < 0 then
            stop = l + stop
            stop = stop == 0 and 1 or stop + 1
        end
    end
    if start == 1 and stop == 1 then
        return lpegmatch(pattern_first,str) or ""
    end
    if start > stop then
        return ""
    end
    if start > 1 then
        b, e, n, first, last = 0, 0, 0, start - 1, stop
        lpegmatch(pattern_one,str)
        if n >= first and e == 0 then
            -- the string ended before we reached stop
            e = #str
        end
        return sub(str,b,e)
    end
    b, e, n, last = 1, 0, 0, stop
    lpegmatch(pattern_zero,str)
    if e == 0 then
        e = #str
    end
    return sub(str,b,e)
end
530 531
-- local n = 100000
532
-- local str = string.rep("123456àáâãäå",100)
533
--
534
-- for i=-15,15,1 do
535
-- for j=-15,15,1 do
536
-- if utf.xsub(str,i,j) ~= utf.sub(str,i,j) then
537
-- print("error",i,j,"l>"..utf.xsub(str,i,j),"s>"..utf.sub(str,i,j))
538
-- end
539
-- end
540
-- if utf.xsub(str,i) ~= utf.sub(str,i) then
541
-- print("error",i,"l>"..utf.xsub(str,i),"s>"..utf.sub(str,i))
542
-- end
543
-- end
544 545
-- print(" 1, 7",utf.xsub(str, 1, 7),utf.sub(str, 1, 7))
546
-- print(" 0, 7",utf.xsub(str, 0, 7),utf.sub(str, 0, 7))
547
-- print(" 0, 9",utf.xsub(str, 0, 9),utf.sub(str, 0, 9))
548
-- print(" 4 ",utf.xsub(str, 4 ),utf.sub(str, 4 ))
549
-- print(" 0 ",utf.xsub(str, 0 ),utf.sub(str, 0 ))
550
-- print(" 0, 0",utf.xsub(str, 0, 0),utf.sub(str, 0, 0))
551
-- print(" 4, 4",utf.xsub(str, 4, 4),utf.sub(str, 4, 4))
552
-- print(" 4, 0",utf.xsub(str, 4, 0),utf.sub(str, 4, 0))
553
-- print("-3, 0",utf.xsub(str,-3, 0),utf.sub(str,-3, 0))
554
-- print(" 0,-3",utf.xsub(str, 0,-3),utf.sub(str, 0,-3))
555
-- print(" 5,-3",utf.xsub(str,-5,-3),utf.sub(str,-5,-3))
556
-- print("-3 ",utf.xsub(str,-3 ),utf.sub(str,-3 ))
557 558
end
559 560
-- a replacement for simple gsubs:
561 562
-- function utf.remapper(mapping)
563
-- local pattern = Cs((p_utf8character/mapping)^0)
564
-- return function(str)
565
-- if not str or str == "" then
566
-- return ""
567
-- else
568
-- return lpegmatch(pattern,str)
569
-- end
570
-- end, pattern
571
-- end
572 573
-- Creates a remapper. With a table as mapping the keys are turned into a pattern
-- and each match is handed to action (which defaults to the mapping itself); with
-- a function as mapping each character is handed to that function.
--
-- option "pattern" : only the substitution pattern is returned
-- option "dynamic" : (tables only) a function is returned and the pattern is
--                    rebuilt after a new key has been added to the mapping
-- otherwise        : a function plus the pattern are returned (static)
--
-- Any other kind of mapping gives a function that returns its argument, or an
-- empty string when there is none.
function utf.remapper(mapping,option,action) -- static also returns a pattern
    local variant = type(mapping)
    local compile
    if variant == "table" then
        action  = action or mapping
        compile = function()
            return Cs((tabletopattern(mapping)/action + p_utf8character)^0)
        end
    elseif variant == "function" then
        compile = function()
            return Cs((p_utf8character/mapping + p_utf8character)^0)
        end
    else
        -- is actually an error
        return function(str)
            return str or ""
        end
    end
    if option == "pattern" then
        return compile()
    end
    if variant == "table" and option == "dynamic" then
        local compiled = false
        -- a new key invalidates the pattern that we have built so far
        table.setmetatablenewindex(mapping,function(t,k,v)
            rawset(t,k,v)
            compiled = false
        end)
        return function(str)
            if not str or str == "" then
                return ""
            end
            if not compiled then
                compiled = compile()
            end
            return lpegmatch(compiled,str)
        end
    end
    -- elseif option == "static" then
    local compiled = compile()
    return function(str)
        if not str or str == "" then
            return ""
        end
        return lpegmatch(compiled,str)
    end, compiled
end
623 624
-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }
625
-- print(remap("abcd 1234 abcd"))
626 627
-- Returns a function that runs the pattern made by lpeg.replacer for t over a
-- string. There is no check beforehand for whether anything will be replaced.
function utf.replacer(t) -- no precheck, always string builder
    local substitution = replacer(t,false,false,true)
    local function apply(str)
        return lpegmatch(substitution,str)
    end
    return apply
end
633 634
-- Returns a function that first runs the pattern made by lpeg.finder for t over a
-- string; only when that gives a position inside the string the replacer pattern
-- is applied, otherwise the string itself is returned.
function utf.subtituter(t) -- with precheck and no building if no match
    local f = finder  (t)
    local r = replacer(t,false,false,true)
    return function(str)
        local i = lpegmatch(f,str)
        if not i or i > #str then
            return str
        end
        -- return sub(str,1,i-2) .. lpegmatch(r,str,i-1) -- slower
        return lpegmatch(r,str)
    end
end

-- The name above is misspelled but it is the public one, so we keep it and make
-- the function available under the intended name too.

utf.substituter = utf.subtituter
649 650
-- inspect(utf.split("a b c d"))
651
-- inspect(utf.split("a b c d",true))
652 653
local
utflinesplitter
=
p_utfbom
^
-1
*
lpeg
.
tsplitat
(
p_newline
)
654
local
utfcharsplitter_ows
=
p_utfbom
^
-1
*
Ct
(
C
(
p_utf8character
)
^
0
)
655
local
utfcharsplitter_iws
=
p_utfbom
^
-1
*
Ct
(
(
p_whitespace
^
1
+
C
(
p_utf8character
)
)
^
0
)
656
local
utfcharsplitter_raw
=
Ct
(
C
(
p_utf8character
)
^
0
)
657 658
patterns
.
utflinesplitter
=
utflinesplitter
659 660
-- Applies the utflinesplitter pattern (optional bom, then a split at newlines) to
-- str; a missing str is treated as an empty string.
function utf.splitlines(str)
    if not str then
        str = ""
    end
    return lpegmatch(utflinesplitter,str)
end
663 664
-- Returns a table with the characters of str (a missing str is treated as an empty
-- string); an optional bom is skipped. When ignorewhitespace is set, runs of
-- whitespace are skipped instead of collected.
function utf.split(str,ignorewhitespace) -- new
    local splitter = ignorewhitespace and utfcharsplitter_iws or utfcharsplitter_ows
    return lpegmatch(splitter,str or "")
end
671 672
-- Returns a table with the characters of str. Unlike utf.split a bom is not
-- skipped. A missing str is treated as an empty string, just as utf.split and
-- utf.splitlines do; before, a nil made lpeg.match raise an error.
function utf.totable(str) -- keeps bom
    return lpegmatch(utfcharsplitter_raw,str or "")
end
675 676
-- 0 EF BB BF UTF-8
677
-- 1 FF FE UTF-16-little-endian
678
-- 2 FE FF UTF-16-big-endian
679
-- 3 FF FE 00 00 UTF-32-little-endian
680
-- 4 00 00 FE FF UTF-32-big-endian
681
--
682
-- \000 fails in <= 5.0 but is valid in >=5.1 where %z is deprecated
683 684
-- utf.name = {
685
-- [0] = 'utf-8',
686
-- [1] = 'utf-16-le',
687
-- [2] = 'utf-16-be',
688
-- [3] = 'utf-32-le',
689
-- [4] = 'utf-32-be'
690
-- }
691
--
692
-- function utf.magic(f)
693
-- local str = f:read(4)
694
-- if not str then
695
-- f:seek('set')
696
-- return 0
697
-- -- elseif find(str,"^%z%z\254\255") then -- depricated
698
-- -- elseif find(str,"^\000\000\254\255") then -- not permitted and bugged
699
-- elseif find(str,"\000\000\254\255",1,true) then -- seems to work okay (TH)
700
-- return 4
701
-- -- elseif find(str,"^\255\254%z%z") then -- depricated
702
-- -- elseif find(str,"^\255\254\000\000") then -- not permitted and bugged
703
-- elseif find(str,"\255\254\000\000",1,true) then -- seems to work okay (TH)
704
-- return 3
705
-- elseif find(str,"^\254\255") then
706
-- f:seek('set',2)
707
-- return 2
708
-- elseif find(str,"^\255\254") then
709
-- f:seek('set',2)
710
-- return 1
711
-- elseif find(str,"^\239\187\191") then
712
-- f:seek('set',3)
713
-- return 0
714
-- else
715
-- f:seek('set')
716
-- return 0
717
-- end
718
-- end
719 720
-- Reads up to four bytes from file handle f and returns what the p_utftype pattern
-- reports for them. When the p_utfoffset pattern gives an offset below four the
-- file is repositioned at that offset (counted from the start).
function utf.magic(f) -- not used
    local header = f:read(4) or ""
    local offset = lpegmatch(p_utfoffset,header)
    if offset < 4 then
        f:seek('set',offset)
    end
    return lpegmatch(p_utftype,header)
end
728 729
local
utf16_to_utf8_be
,
utf16_to_utf8_le
730
local
utf32_to_utf8_be
,
utf32_to_utf8_le
731 732
local
utf_16_be_getbom
=
patterns
.
utfbom_16_be
^
-1
733
local
utf_16_le_getbom
=
patterns
.
utfbom_16_le
^
-1
734
local
utf_32_be_getbom
=
patterns
.
utfbom_32_be
^
-1
735
local
utf_32_le_getbom
=
patterns
.
utfbom_32_le
^
-1
736 737
local
utf_16_be_linesplitter
=
utf_16_be_getbom
*
lpeg
.
tsplitat
(
patterns
.
utf_16_be_nl
)
738
local
utf_16_le_linesplitter
=
utf_16_le_getbom
*
lpeg
.
tsplitat
(
patterns
.
utf_16_le_nl
)
739
local
utf_32_be_linesplitter
=
utf_32_be_getbom
*
lpeg
.
tsplitat
(
patterns
.
utf_32_be_nl
)
740
local
utf_32_le_linesplitter
=
utf_32_le_getbom
*
lpeg
.
tsplitat
(
patterns
.
utf_32_le_nl
)
741 742
-- we have three possibilities: bytepairs (using tables), gmatch (using tables), gsub and
743
-- lpeg. Bytepairs are the fastest but as soon as we need to remove boms and so on the gain
744
-- is less due to more testing. Also, we seldom have to convert utf16 so we don't care too
745
-- much about a few milliseconds more runtime. The lpeg variant is up to 20% slower but
746
-- still pretty fast.
747
--
748
-- for historic reasons we keep the bytepairs variants around .. beware they don't grab the
749
-- bom like the lpegs do so they're not dropins in the functions that follow
750
--
751
-- utf16_to_utf8_be = function(s)
752
-- if not s then
753
-- return nil
754
-- elseif s == "" then
755
-- return ""
756
-- end
757
-- local result, r, more = { }, 0, 0
758
-- for left, right in bytepairs(s) do
759
-- if right then
760
-- local now = 256*left + right
761
-- if more > 0 then
762
-- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
763
-- more = 0
764
-- r = r + 1
765
-- result[r] = utfchar(now)
766
-- elseif now >= 0xD800 and now <= 0xDBFF then
767
-- more = now
768
-- else
769
-- r = r + 1
770
-- result[r] = utfchar(now)
771
-- end
772
-- end
773
-- end
774
-- return concat(result)
775
-- end
776
--
777
-- local utf16_to_utf8_be_t = function(t)
778
-- if not t then
779
-- return nil
780
-- elseif type(t) == "string" then
781
-- t = lpegmatch(utf_16_be_linesplitter,t)
782
-- end
783
-- local result = { } -- we reuse result
784
-- for i=1,#t do
785
-- local s = t[i]
786
-- if s ~= "" then
787
-- local r, more = 0, 0
788
-- for left, right in bytepairs(s) do
789
-- if right then
790
-- local now = 256*left + right
791
-- if more > 0 then
792
-- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
793
-- more = 0
794
-- r = r + 1
795
-- result[r] = utfchar(now)
796
-- elseif now >= 0xD800 and now <= 0xDBFF then
797
-- more = now
798
-- else
799
-- r = r + 1
800
-- result[r] = utfchar(now)
801
-- end
802
-- end
803
-- end
804
-- t[i] = concat(result,"",1,r) -- we reused tmp, hence t
805
-- end
806
-- end
807
-- return t
808
-- end
809
--
810
-- utf16_to_utf8_le = function(s)
811
-- if not s then
812
-- return nil
813
-- elseif s == "" then
814
-- return ""
815
-- end
816
-- local result, r, more = { }, 0, 0
817
-- for left, right in bytepairs(s) do
818
-- if right then
819
-- local now = 256*right + left
820
-- if more > 0 then
821
-- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
822
-- more = 0
823
-- r = r + 1
824
-- result[r] = utfchar(now)
825
-- elseif now >= 0xD800 and now <= 0xDBFF then
826
-- more = now
827
-- else
828
-- r = r + 1
829
-- result[r] = utfchar(now)
830
-- end
831
-- end
832
-- end
833
-- return concat(result)
834
-- end
835
--
836
-- local utf16_to_utf8_le_t = function(t)
837
-- if not t then
838
-- return nil
839
-- elseif type(t) == "string" then
840
-- t = lpegmatch(utf_16_le_linesplitter,t)
841
-- end
842
-- local result = { } -- we reuse result
843
-- for i=1,#t do
844
-- local s = t[i]
845
-- if s ~= "" then
846
-- local r, more = 0, 0
847
-- for left, right in bytepairs(s) do
848
-- if right then
849
-- local now = 256*right + left
850
-- if more > 0 then
851
-- now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
852
-- more = 0
853
-- r = r + 1
854
-- result[r] = utfchar(now)
855
-- elseif now >= 0xD800 and now <= 0xDBFF then
856
-- more = now
857
-- else
858
-- r = r + 1
859
-- result[r] = utfchar(now)
860
-- end
861
-- end
862
-- end
863
-- t[i] = concat(result,"",1,r) -- we reused tmp, hence t
864
-- end
865
-- end
866
-- return t
867
-- end
868
--
869
-- local utf32_to_utf8_be_t = function(t)
870
-- if not t then
871
-- return nil
872
-- elseif type(t) == "string" then
873
-- t = lpegmatch(utflinesplitter,t)
874
-- end
875
-- local result = { } -- we reuse result
876
-- for i=1,#t do
877
-- local r, more = 0, -1
878
-- for a,b in bytepairs(t[i]) do
879
-- if a and b then
880
-- if more < 0 then
881
-- more = 256*256*256*a + 256*256*b
882
-- else
883
-- r = r + 1
884
-- result[t] = utfchar(more + 256*a + b)
885
-- more = -1
886
-- end
887
-- else
888
-- break
889
-- end
890
-- end
891
-- t[i] = concat(result,"",1,r)
892
-- end
893
-- return t
894
-- end
895
--
896
-- local utf32_to_utf8_le_t = function(t)
897
-- if not t then
898
-- return nil
899
-- elseif type(t) == "string" then
900
-- t = lpegmatch(utflinesplitter,t)
901
-- end
902
-- local result = { } -- we reuse result
903
-- for i=1,#t do
904
-- local r, more = 0, -1
905
-- for a,b in bytepairs(t[i]) do
906
-- if a and b then
907
-- if more < 0 then
908
-- more = 256*b + a
909
-- else
910
-- r = r + 1
911
-- result[t] = utfchar(more + 256*256*256*b + 256*256*a)
912
-- more = -1
913
-- end
914
-- else
915
-- break
916
-- end
917
-- end
918
-- t[i] = concat(result,"",1,r)
919
-- end
920
-- return t
921
-- end
922 923
-- The lpeg based converters from utf16 and utf32 to utf8. A high surrogate is kept
-- in 'more' until the unit that follows it has been seen; it is reset to zero at
-- the start of each conversion.

local more = 0

local p_utf16_to_utf8_be, p_utf16_to_utf8_le
local p_utf32_to_utf8_be, p_utf32_to_utf8_le

do

    -- now is the value of one 16 bit unit
    local function utf16unit(now)
        if more > 0 then
            local high = more
            more = 0
            return utfchar((high-0xD800)*0x400 + (now-0xDC00) + 0x10000)
        end
        if now >= 0xD800 and now <= 0xDBFF then
            more = now
            return "" -- else the c's end up in the stream
        end
        return utfchar(now)
    end

    -- a is the most and d the least significant byte
    local function utf32unit(a,b,c,d)
        return utfchar(256*256*256*byte(a) + 256*256*byte(b) + 256*byte(c) + byte(d))
    end

    local p_two   = C(1) * C(1)
    local p_four  = C(1) * C(1) * C(1) * C(1)
    local p_reset = P(true) / function() more = 0 end

    local p_16_be = p_two / function(left,right)
        return utf16unit(256*byte(left) + byte(right))
    end

    local p_16_le = p_two / function(right,left)
        return utf16unit(256*byte(left) + byte(right))
    end

    local p_32_be = p_four / utf32unit

    local p_32_le = p_four / function(a,b,c,d)
        return utf32unit(d,c,b,a)
    end

    -- the complete patterns: reset the state, skip an optional bom, convert the rest

    p_utf16_to_utf8_be = p_reset * utf_16_be_getbom * Cs(p_16_be^0)
    p_utf16_to_utf8_le = p_reset * utf_16_le_getbom * Cs(p_16_le^0)
    p_utf32_to_utf8_be = p_reset * utf_32_be_getbom * Cs(p_32_be^0)
    p_utf32_to_utf8_le = p_reset * utf_32_le_getbom * Cs(p_32_le^0)

end

patterns.utf16_to_utf8_be = p_utf16_to_utf8_be
patterns.utf16_to_utf8_le = p_utf16_to_utf8_le
patterns.utf32_to_utf8_be = p_utf32_to_utf8_be
patterns.utf32_to_utf8_le = p_utf32_to_utf8_le
969 970
-- The converters come in two flavours: one that takes a string and returns the
-- converted string, and one (with the _t suffix) that takes a string or a table of
-- lines and returns a table of converted lines.

local utf16_to_utf8_be_t, utf16_to_utf8_le_t
local utf32_to_utf8_be_t, utf32_to_utf8_le_t

do

    -- nil and the empty string are returned as they are
    local function stringconverter(pattern)
        return function(s)
            if s and s ~= "" then
                return lpegmatch(pattern,s)
            end
            return s
        end
    end

    -- a string is first split into lines; a table is converted in place and empty
    -- lines are left alone
    local function tableconverter(pattern,linesplitter)
        return function(t)
            if not t then
                return nil
            end
            if type(t) == "string" then
                t = lpegmatch(linesplitter,t)
            end
            for i=1,#t do
                local s = t[i]
                if s ~= "" then
                    t[i] = lpegmatch(pattern,s)
                end
            end
            return t
        end
    end

    utf16_to_utf8_be   = stringconverter(p_utf16_to_utf8_be)
    utf16_to_utf8_be_t = tableconverter (p_utf16_to_utf8_be,utf_16_be_linesplitter)

    utf16_to_utf8_le   = stringconverter(p_utf16_to_utf8_le)
    utf16_to_utf8_le_t = tableconverter (p_utf16_to_utf8_le,utf_16_le_linesplitter)

    utf32_to_utf8_be   = stringconverter(p_utf32_to_utf8_be)
    utf32_to_utf8_be_t = tableconverter (p_utf32_to_utf8_be,utf_32_be_linesplitter)

    utf32_to_utf8_le   = stringconverter(p_utf32_to_utf8_le)
    utf32_to_utf8_le_t = tableconverter (p_utf32_to_utf8_le,utf_32_le_linesplitter)

end

utf.utf16_to_utf8_le_t = utf16_to_utf8_le_t
utf.utf16_to_utf8_be_t = utf16_to_utf8_be_t
utf.utf32_to_utf8_le_t = utf32_to_utf8_le_t
utf.utf32_to_utf8_be_t = utf32_to_utf8_be_t

utf.utf16_to_utf8_le   = utf16_to_utf8_le
utf.utf16_to_utf8_be   = utf16_to_utf8_be
utf.utf32_to_utf8_le   = utf32_to_utf8_le
utf.utf32_to_utf8_be   = utf32_to_utf8_be
1071 1072
-- Splits a string into lines with the utflinesplitter pattern; anything that is
-- not a string is returned as it is.
function utf.utf8_to_utf8_t(t)
    if type(t) == "string" then
        local lines = lpegmatch(utflinesplitter,t)
        if lines then
            return lines
        end
    end
    return t
end
1075 1076
-- Converts utf16 lines to utf8: the big endian converter is tried when endian is
-- set, the little endian one is used otherwise (or when the first one gave no
-- result) and t itself is returned when there is still no result.
function utf.utf16_to_utf8_t(t,endian)
    local result
    if endian then
        result = utf16_to_utf8_be_t(t)
    end
    if not result then
        result = utf16_to_utf8_le_t(t)
    end
    return result or t
end
1079 1080
-- Converts utf32 lines to utf8: the big endian converter is tried when endian is
-- set, the little endian one is used otherwise (or when the first one gave no
-- result) and t itself is returned when there is still no result.
function utf.utf32_to_utf8_t(t,endian)
    local result
    if endian then
        result = utf32_to_utf8_be_t(t)
    end
    if not result then
        result = utf32_to_utf8_le_t(t)
    end
    return result or t
end
1083 1084
-- Conversion from utf8 to utf16. These functions used to be defined only when the
-- bit32 library was around; as the shift is the only thing we need from it we now
-- fall back on a floored division, so that the functions are always there.

do

    -- rshift(b,n) gives floor(b/2^n) for the non negative values that we feed it
    local rshift

    if bit32 then
        rshift = bit32.rshift
    else
        local floor = math.floor
        rshift = function(b,n)
            return floor(b/2^n)
        end
    end

    -- b is a code point; the result is one utf16 unit, or a surrogate pair for
    -- values of 0x10000 and up, with the least significant byte of each unit first
    local function little(b)
        if b < 0x10000 then
            return char(b%256,rshift(b,8))
        else
            b = b - 0x10000
            local b1 = rshift(b,10) + 0xD800
            local b2 = b%1024 + 0xDC00
            return char(b1%256,rshift(b1,8),b2%256,rshift(b2,8))
        end
    end

    -- as little, but with the most significant byte of each unit first
    local function big(b)
        if b < 0x10000 then
            return char(rshift(b,8),b%256)
        else
            b = b - 0x10000
            local b1 = rshift(b,10) + 0xD800
            local b2 = b%1024 + 0xDC00
            return char(rshift(b1,8),b1%256,rshift(b2,8),b2%256)
        end
    end

    -- bytes that p_utf8byte cannot handle are dropped

    local l_remap = Cs((p_utf8byte/little+P(1)/"")^0)
    local b_remap = Cs((p_utf8byte/big   +P(1)/"")^0)

    -- Returns str as big endian utf16, preceded by the bom FE FF unless nobom is
    -- set.
    local function utf8_to_utf16_be(str,nobom)
        if nobom then
            return lpegmatch(b_remap,str)
        else
            return char(254,255) .. lpegmatch(b_remap,str)
        end
    end

    -- Returns str as little endian utf16, preceded by the bom FF FE unless nobom
    -- is set.
    local function utf8_to_utf16_le(str,nobom)
        if nobom then
            return lpegmatch(l_remap,str)
        else
            return char(255,254) .. lpegmatch(l_remap,str)
        end
    end

    utf.utf8_to_utf16_be = utf8_to_utf16_be
    utf.utf8_to_utf16_le = utf8_to_utf16_le

    -- Converts to little endian when littleendian is set and to big endian
    -- otherwise.
    function utf.utf8_to_utf16(str,littleendian,nobom)
        if littleendian then
            return utf8_to_utf16_le(str,nobom)
        else
            return utf8_to_utf16_be(str,nobom)
        end
    end

end
1141 1142
-- Substitution pattern used by utf.tocodes: the first p_utf8byte match is
-- formatted as 0xXXXX, every following one gets the extra match argument (the
-- separator) in front of it.
local pattern do

    local function first(code)
        return format("0x%04X",code)
    end

    local function following(code,separator)
        return format("%s0x%04X",separator,code)
    end

    pattern = Cs((p_utf8byte / first) * ((p_utf8byte * Carg(1)) / following)^0)

end

-- Returns str as a list of hexadecimal codes, joined by separator (a single
-- space when no separator is given).
function utf.tocodes(str,separator)
    if not separator then
        separator = " "
    end
    return lpegmatch(pattern,str,1,separator)
end
1150 1151
-- Formats s as U+XXXXX (at least five uppercase hex digits). A number is used
-- as is, anything else goes through utfbyte first.
function utf.ustring(s)
    local code = s
    if type(s) ~= "number" then
        code = utfbyte(s)
    end
    return format("U+%05X",code)
end
1154 1155
-- Formats s as 0xXXXXX (at least five uppercase hex digits). A number is used
-- as is, anything else goes through utfbyte first.
function utf.xstring(s)
    local code = s
    if type(s) ~= "number" then
        code = utfbyte(s)
    end
    return format("0x%05X",code)
end
1158 1159
-- Returns str as utf-8. The type reported by p_utfstricttype decides what
-- happens: utf-16 input is converted (which removes the bom), utf-8 input
-- loses its first three bytes (the bom) and anything else is returned as is.
-- A missing or empty string gives nil.
function utf.toeight(str)
    if str and str ~= "" then
        local utftype = lpegmatch(p_utfstricttype,str)
        if utftype == "utf-16-be" then
            return utf16_to_utf8_be(str) -- bom gets removed
        elseif utftype == "utf-16-le" then
            return utf16_to_utf8_le(str) -- bom gets removed
        elseif utftype == "utf-8" then
            return sub(str,4) -- remove the bom
        end
        return str
    end
    return nil
end
1174 1175
--
1176 1177
do

    -- Any character that is not part of a match is replaced by nothing.
    local p_nany = p_utf8character / ""
    -- Compiled counting patterns, keyed by the string that is counted.
    local cache  = { }

    -- Counts the occurrences of what in str. Each occurrence is replaced by a
    -- single space and every other character is dropped, so the length of the
    -- substituted string is the number of occurrences.
    --
    -- str  : the string to search
    -- what : a string (its pattern is compiled once and cached) or anything
    --        else that lpeg.P accepts (compiled on each call)
    function utf.count(str,what)
        if type(what) == "string" then
            local p = cache[what]
            if not p then
                p = Cs((P(what)/" " + p_nany)^0)
                -- store under the lookup key; keying on the pattern itself
                -- meant the cache never hit and grew on every call
                cache[what] = p
            end
            return #lpegmatch(p,str)
        else -- 4 times slower but still faster than / function
            return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
        end
    end

end
1196 1197
if
not
string
.
utfvalues
then
1198 1199
-- So, a logical next step is to check for the values variant. It is over five times
1200
-- slower than the built-in string.utfvalues. I optimized it a bit for n=0,1.
1201 1202
----- wrap, yield, gmatch = coroutine.wrap, coroutine.yield, string.gmatch
1203
local
find
=
string
.
find
1204 1205
local
dummy
=
function
(
)
1206
-- we share this one
1207
end
1208 1209
-- function string.utfvalues(str)
1210
-- local n = #str
1211
-- if n == 0 then
1212
-- return wrap(dummy)
1213
-- elseif n == 1 then
1214
-- return wrap(function() yield(utfbyte(str)) end)
1215
-- else
1216
-- return wrap(function() for s in gmatch(str,".[\128-\191]*") do
1217
-- yield(utfbyte(s))
1218
-- end end)
1219
-- end
1220
-- end
1221
--
1222
-- faster:
1223 1224
-- Returns an iterator over the values that utfbyte gives for the characters
-- of str, for use in a generic for loop:
--
--   for c in string.utfvalues(str) do ... end
--
-- The empty string and a one byte string are special cased because they need
-- no scanning.
function string.utfvalues(str)
    local n = #str
    if n == 0 then
        return dummy
    elseif n == 1 then
        -- The iterator has to return nil on its second call, otherwise a
        -- generic for loop over a one byte string never ends.
        local done = false
        return function()
            if not done then
                done = true
                return utfbyte(str)
            end
        end
    else
        local p = 1
        -- local n = #str
        return function()
            -- if p <= n then -- slower than the last find
                -- a lead byte followed by its continuation bytes
                local b, e = find(str,".[\128-\191]*",p)
                if b then
                    p = e + 1
                    return utfbyte(sub(str,b,e))
                end
            -- end
        end
    end
end
1244 1245
-- slower:
1246
--
1247
-- local pattern = C(p_utf8character) * Cp()
1248
-- ----- pattern = p_utf8character/utfbyte * Cp()
1249
-- ----- pattern = p_utf8byte * Cp()
1250
--
1251
-- function string.utfvalues(str) -- one of the cases where a find is faster than an lpeg
1252
-- local n = #str
1253
-- if n == 0 then
1254
-- return dummy
1255
-- elseif n == 1 then
1256
-- return function() return utfbyte(str) end
1257
-- else
1258
-- local p = 1
1259
-- return function()
1260
-- local s, e = lpegmatch(pattern,str,p)
1261
-- if e then
1262
-- p = e
1263
-- return utfbyte(s)
1264
-- -- return s
1265
-- end
1266
-- end
1267
-- end
1268
-- end
1269 1270
end
1271 1272
-- Make the iterator (built in or the fallback) available as utf.values too.
utf.values = string.utfvalues
1273 1274
-- Maps a number onto a length by comparing it with the thresholds 0x80, 0xE0,
-- 0xF0, 0xF8, 0xFC and 0xFE: the first threshold that u stays below selects
-- 1 up to 6, and 0 is returned when u is 0xFE or more.
function utf.chrlen(u) -- u is number
    if u < 0x80 then
        return 1
    elseif u < 0xE0 then
        return 2
    elseif u < 0xF0 then
        return 3
    elseif u < 0xF8 then
        return 4
    elseif u < 0xFC then
        return 5
    elseif u < 0xFE then
        return 6
    end
    return 0
end
1283 1284
-- hashing saves a little but not that much in practice
1285
--
1286
-- local utf32 = table.setmetatableindex(function(t,k) local v = toutf32(k) t[k] = v return v end)
1287 1288
if bit32 then

    local extract = bit32.extract
    local char    = string.char

    -- Returns n as a four byte string with the least significant byte first.
    -- Bytes above the magnitude of n are written as literal zero bytes, so
    -- only the bytes that can be non zero are extracted.
    function utf.toutf32string(n)
        if n <= 0xFF then
            return char(n) .. "\000\000\000"
        end
        local two = char(extract(n,0,8)) .. char(extract(n,8,8))
        if n <= 0xFFFF then
            return two .. "\000\000"
        end
        local three = two .. char(extract(n,16,8))
        if n <= 0xFFFFFF then
            return three .. "\000"
        end
        return three .. char(extract(n,24,8))
    end

end
1319 1320
-- goodie:
1321 1322
local len = utf.len
local rep = string.rep -- was "rep = rep", which only works when rep is already defined

-- Pads s to a width of |n| characters, measured with utf.len.
--
-- s : the string to pad
-- n : target width; a positive value pads on the left, a negative value pads
--     on the right, nil or 0 leaves s untouched
-- c : optional padding string, a single space when omitted (this used to be
--     read from a global with that name)
--
-- Returns s unchanged when it is already wide enough.
function string.utfpadd(s,n,c)
    if n and n ~= 0 then
        local l = len(s)
        if n > 0 then
            local d = n - l
            if d > 0 then
                return rep(c or " ",d) .. s
            end
        else
            local d = - n - l
            if d > 0 then
                return s .. rep(c or " ",d)
            end
        end
    end
    return s
end
1342 1343
-- goodies
1344 1345
do
1346 1347
-- Take the helpers from the utf table when they are set there, otherwise from
-- the string table; both can still be nil here, which is checked further on.
local utfcharacters = utf.characters or string.utfcharacters
local utfchar       = utf.char       or string.utfcharacter

-- lpeg.UP is the same function as P.
lpeg.UP = P
1351 1352
-- lpeg.US(str) returns an ordered choice with one literal alternative for
-- each character of str; an empty str gives P(false), which never matches.
if not utfcharacters then

    -- No character iterator: let an lpeg match over str collect the
    -- alternatives, one call per p_utf8char match.
    function lpeg.US(str)
        local choice = P(false)
        local function add(character)
            choice = choice + P(character)
        end
        lpegmatch((p_utf8char/add)^0,str)
        return choice
    end

else

    function lpeg.US(str)
        local choice = P(false)
        for character in utfcharacters(str) do
            choice = choice + P(character)
        end
        return choice
    end

end
1374 1375
-- Used by lpeg.UR on a string argument: two consecutive p_utf8byte matches give
-- the two range boundaries, otherwise the single capture false is produced (so
-- the second result is nil).
-- NOTE(review): p_utf8byte is defined elsewhere; presumably it captures the
-- code point of one utf-8 character -- confirm.
local range = p_utf8byte * p_utf8byte + Cc(false) -- utf8byte is already a capture
1376 1377
-- Returns a pattern for a range of characters.
--
-- str  : either a number (the first code point) or a string of which the
--        first two characters give the boundaries
-- more : with a numeric str, the last code point (defaults to str itself)
--
-- A string that does not provide two boundaries is matched literally, and so
-- is a string with two equal boundaries. A numeric range of one code point
-- matches exactly that character.
function lpeg.UR(str,more)
    local first, last
    local numeric = type(str) == "number"
    if numeric then
        first = str
        last  = more or first
    else
        first, last = lpegmatch(range,str)
        if not last then
            return P(str)
        end
    end
    if not utfchar then
        utfchar = utf.char -- maybe delayed
    end
    if first == last then
        if not numeric then
            return P(str)
        elseif utfchar then
            -- P(number) would match that many arbitrary characters and not the
            -- character with that code point, so convert it first
            return P(utfchar(first))
        end
    end
    if utfchar and (last - first < 8) then -- a somewhat arbitrary criterion
        local p = P(false)
        for i=first,last do
            p = p + P(utfchar(i))
        end
        return p -- stays P(false), which never matches, when last < first
    else
        local f = function(b)
            return b >= first and b <= last
        end
        -- tricky, these nested captures
        -- NOTE(review): a function capture only replaces the captured value by
        -- the boolean that f returns, it does not make the match fail the way
        -- lpeg.Cmt would -- confirm that callers test the capture.
        return p_utf8byte / f
    end
end
1408 1409
-- print(lpeg.match(lpeg.Cs((C(lpeg.UR("αω"))/{ ["χ"] = "OEPS" })^0),"αωχαω"))
1410 1411
end
1412