-- Register this module's metadata in the global `modules` registry, creating
-- the registry first when it does not exist yet.
modules = modules or { }

modules['l-unicode'] = {
    version   = 1.001,
    optimize  = true,
    comment   = "companion to luat-lib.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
-- The global `utf` table collects the helpers defined in this file; the
-- global `unicode` is explicitly cleared.
utf     = utf or { }
unicode = nil

if not string.utfcharacters then

    local gmatch = string.gmatch

    -- Fallback iterator over the UTF-8 characters of `str`: every step yields
    -- one byte plus all continuation bytes (0x80-0xBF) that follow it.
    local function utfcharacters(str)
        return gmatch(str,".[\128-\191]*")
    end

    -- The fallback used to be stored only as string.characters, which left
    -- string.utfcharacters (and therefore utf.characters below) undefined.
    string.utfcharacters = utfcharacters

    -- Kept so that code relying on the old name keeps working.
    string.characters = utfcharacters

end

utf.characters = string.utfcharacters
52
53
54
55
56
57
58
59
60
61
62
63local type = type
64local char, byte, format, sub, gmatch = string.char, string.byte, string.format, string.sub, string.gmatch
65local concat = table.concat
66local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp
67
68local lpegmatch = lpeg.match
69local patterns = lpeg.patterns
70local tabletopattern = lpeg.utfchartabletopattern
71
72local bytepairs = string.bytepairs
73
74local finder = lpeg.finder
75local replacer = lpeg.replacer
76
77local p_utftype = patterns.utftype
78local p_utfstricttype = patterns.utfstricttype
79local p_utfoffset = patterns.utfoffset
80local p_utf8character = patterns.utf8character
81local p_utf8char = patterns.utf8char
82local p_utf8byte = patterns.utf8byte
83local p_utfbom = patterns.utfbom
84local p_newline = patterns.newline
85local p_whitespace = patterns.whitespace
86
87
88
89
90
-- Provide utf.char (code point -> UTF-8 string) when nothing has defined it
-- yet. A built-in (string.utfcharacter or utf8.char) is preferred; a pure Lua
-- encoder is the last resort.
if not utf.char then

    local builtin = string.utfcharacter or (utf8 and utf8.char)

    if builtin then

        utf.char = builtin

    else

        local char = string.char

        if bit32 then

            local rshift = bit32.rshift

            -- Encoder using bit32 shifts. Values of 0x200000 and above give
            -- an empty string.
            function utf.char(n)
                if n < 0x80 then
                    return char(n)
                end
                if n < 0x800 then
                    local b1 = 0xC0 + rshift(n,6)
                    local b2 = 0x80 + (n % 0x40)
                    return char(b1,b2)
                end
                if n < 0x10000 then
                    local b1 = 0xE0 + rshift(n,12)
                    local b2 = 0x80 + (rshift(n,6) % 0x40)
                    local b3 = 0x80 + (n % 0x40)
                    return char(b1,b2,b3)
                end
                if n < 0x200000 then
                    local b1 = 0xF0 + rshift(n,18)
                    local b2 = 0x80 + (rshift(n,12) % 0x40)
                    local b3 = 0x80 + (rshift(n,6) % 0x40)
                    local b4 = 0x80 + (n % 0x40)
                    return char(b1,b2,b3,b4)
                end
                return ""
            end

        else

            local floor = math.floor

            -- Same encoder, with shifts expressed as floored divisions for
            -- interpreters without bit32.
            function utf.char(n)
                if n < 0x80 then
                    return char(n)
                end
                if n < 0x800 then
                    local b1 = 0xC0 + floor(n/0x40)
                    local b2 = 0x80 + (n % 0x40)
                    return char(b1,b2)
                end
                if n < 0x10000 then
                    local b1 = 0xE0 + floor(n/0x1000)
                    local b2 = 0x80 + (floor(n/0x40) % 0x40)
                    local b3 = 0x80 + (n % 0x40)
                    return char(b1,b2,b3)
                end
                if n < 0x200000 then
                    local b1 = 0xF0 + floor(n/0x40000)
                    local b2 = 0x80 + (floor(n/0x1000) % 0x40)
                    local b3 = 0x80 + (floor(n/0x40) % 0x40)
                    local b4 = 0x80 + (n % 0x40)
                    return char(b1,b2,b3,b4)
                end
                return ""
            end

        end

    end

end
177
-- Provide utf.byte (UTF-8 character -> code point) when it is missing: use a
-- built-in when there is one, otherwise match the p_utf8byte pattern.
if not utf.byte then

    local builtin = string.utfvalue or (utf8 and utf8.codepoint)

    utf.byte = builtin or function(c)
        return lpegmatch(p_utf8byte,c)
    end

end

-- Local shortcuts used by the rest of the file.
local utfchar, utfbyte = utf.char, utf.byte
193
194
195
196
-- Returns whatever p_utftype captures for `data`, or "unknown" when `data`
-- is missing or the pattern yields nothing.
function utf.filetype(data)
    if data then
        return lpegmatch(p_utftype,data) or "unknown"
    end
    return "unknown"
end
200
-- Substitution pattern that keeps utf8one matches as they are and rewrites
-- every two, three or four byte character as a hexadecimal "&#...;" entity.
local toentities

do

    local multibyte = patterns.utf8two + patterns.utf8three + patterns.utf8four

    local function toentity(s)
        local b = utfbyte(s)
        if b < 127 then
            return s
        end
        return format("&#%X;",b)
    end

    toentities = Cs((patterns.utf8one + multibyte / toentity)^0)

end

patterns.toentities = toentities

-- Applies the toentities pattern to `str`.
function utf.toentities(str)
    return lpegmatch(toentities,str)
end
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
-- Conversion of a BOM-prefixed UTF-16 string to UTF-8.
--
-- Fixes compared to the previous version:
--   * a 16 bit unit is 0x100*high + low (it used 0xFF*high + low);
--   * the surrogate test used R(utfchar(0xD8),utfchar(0xFF)); utfchar yields
--     multi-byte strings there, so the ranges were empty and pairs were never
--     recognised. The lead byte of a high surrogate is 0xD8-0xDB and that of
--     a low surrogate is 0xDC-0xDF;
--   * in little endian data the significant byte is the second one of each
--     unit, so that variant needs its own pair pattern.

local one = P(1)
local two = C(1) * C(1)

local highlead = R("\216\219") -- 0xD8-0xDB
local lowlead  = R("\220\223") -- 0xDC-0xDF

local four    = C(highlead) * C(1) * C(lowlead) * C(1) -- big endian pair
local four_le = C(1) * C(highlead) * C(1) * C(lowlead) -- little endian pair

-- Combines a high and a low surrogate value into one UTF-8 character.
local function frompair(high,low)
    return utfchar((high-0xD800)*0x400 + (low-0xDC00) + 0x10000)
end

local pattern = P("\254\255") * Cs( (
        four / function(a,b,c,d)
            return frompair(0x100*byte(a) + byte(b), 0x100*byte(c) + byte(d))
        end
      + two / function(a,b)
            return utfchar(byte(a)*256 + byte(b))
        end
      + one
    )^1 )
  + P("\255\254") * Cs( (
        four_le / function(b,a,d,c)
            return frompair(0x100*byte(a) + byte(b), 0x100*byte(c) + byte(d))
        end
      + two / function(b,a)
            return utfchar(byte(a)*256 + byte(b))
        end
      + one
    )^1 )

-- Converts `s` to UTF-8 when it starts with the byte pair FE FF (big endian)
-- or FF FE (little endian) followed by at least one more byte; any other
-- string is returned unchanged. A trailing odd byte is copied as-is.
function string.toutf(s)
    return lpegmatch(pattern,s) or s
end
274
-- Substitution pattern that copies well-formed UTF-8 characters and replaces
-- every other byte by the replacement character.
local validatedutf = Cs ( (
    (patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four)
  + P(1) / "�"
)^0 )

patterns.validatedutf = validatedutf

-- Returns false for non-strings, otherwise the result of applying the
-- validatedutf pattern to `str`.
-- NOTE(review): the pattern ends in a P(1) fallback under ^0, so for a string
-- the result looks like it is always the (possibly repaired) string and
-- never false - confirm whether callers expect a real predicate.
function utf.is_valid(str)
    if type(str) ~= "string" then
        return false
    end
    return lpegmatch(validatedutf,str) or false
end
290
-- Provide utf.len (number of UTF-8 characters) when it is missing: use a
-- built-in (string.utflength or utf8.len) when available, otherwise count
-- with an lpeg pattern.
if not utf.len then

    utf.len = string.utflength or (utf8 and utf8.len)

    if not utf.len then

        -- n is the running character count and f the position where the
        -- current run of equally sized characters started. Both are shared
        -- by the pattern callbacks below, so utf.len is not reentrant.
        local n, f = 0, 1

        -- An optional leading patterns.utfbom is skipped. The reference
        -- position has to move along with it; it used to stay at 1, so the
        -- skipped bytes were added to the length of the first run.
        local skipbom = Cmt (
            patterns.utfbom,
            function(_,t)
                f = t
                return true
            end
        )^-1

        -- Each alternative matches a run of characters of one size; Cc
        -- passes that size so the run length in bytes can be divided by it.
        local utfcharcounter = skipbom * Cmt (
            Cc(1) * patterns.utf8one  ^1
          + Cc(2) * patterns.utf8two  ^1
          + Cc(3) * patterns.utf8three^1
          + Cc(4) * patterns.utf8four ^1,
            function(_,t,d)
                n = n + (t - f)/d
                f = t
                return true
            end
        )^0

        -- Returns the number of characters matched from the start of `str`;
        -- nil counts as the empty string. Counting stops at the first byte
        -- that none of the alternatives accepts.
        function utf.len(str)
            n, f = 0, 1
            lpegmatch(utfcharcounter,str or "")
            return n
        end

    end

end

utf.length = utf.len
380
-- Provide utf.sub, a variant of string.sub whose positions count UTF-8
-- characters (as matched by p_utf8character) instead of bytes.
if not utf.sub then

    local utflength = utf.length

    -- State shared between utf.sub and the match-time callbacks: the byte
    -- positions of the slice, the number of characters seen so far and the
    -- character counts that are looked for.
    local frompos, topos, seen, wantfirst, wantlast = 0, 0, 0, 0, 0

    -- Only looks for the end of the slice.
    local function slide_zero(s,p)
        seen = seen + 1
        if seen < wantlast then
            return p
        end
        topos = p - 1
    end

    -- Looks for both the start and the end of the slice.
    local function slide_one(s,p)
        seen = seen + 1
        if seen == wantfirst then
            frompos = p
        end
        if seen < wantlast then
            return p
        end
        topos = p - 1
    end

    -- Only looks for the start of the slice and stops matching there.
    local function slide_two(s,p)
        seen = seen + 1
        if seen ~= wantfirst then
            return true
        end
        frompos = p
    end

    local pattern_zero  = Cmt(p_utf8character,slide_zero)^0
    local pattern_one   = Cmt(p_utf8character,slide_one )^0
    local pattern_two   = Cmt(p_utf8character,slide_two )^0

    local pattern_first = C(p_utf8character)

    -- str   : the string to slice
    -- start : first character; nil returns str itself, 0 is treated as 1,
    --         negative values are taken relative to the character length
    -- stop  : last character; nil means up to the end of the string,
    --         negative values are taken relative to the character length
    function utf.sub(str,start,stop)
        if not start then
            return str
        end
        if start == 0 then
            start = 1
        end
        if not stop then
            -- Open ended slice: skip characters until the start is reached.
            if start < 0 then
                start = utflength(str) + start
            else
                start = start - 1
            end
            frompos, seen, wantfirst = 0, 0, start
            lpegmatch(pattern_two,str)
            if seen < wantfirst then
                return ""
            end
            return sub(str,frompos)
        end
        if start < 0 or stop < 0 then
            -- Turn negative positions into positive ones.
            local total = utf.length(str)
            if start < 0 then
                start = total + start
                if start <= 0 then
                    start = 1
                else
                    start = start + 1
                end
            end
            if stop < 0 then
                stop = total + stop
                if stop == 0 then
                    stop = 1
                else
                    stop = stop + 1
                end
            end
        end
        if start == 1 and stop == 1 then
            return lpegmatch(pattern_first,str) or ""
        end
        if start > stop then
            return ""
        end
        if start > 1 then
            frompos, topos, seen, wantfirst, wantlast = 0, 0, 0, start - 1, stop
            lpegmatch(pattern_one,str)
            if seen >= wantfirst and topos == 0 then
                -- the end was never reached, so take the rest of the string
                topos = #str
            end
            return sub(str,frompos,topos)
        end
        frompos, topos, seen, wantlast = 1, 0, 0, stop
        lpegmatch(pattern_zero,str)
        if topos == 0 then
            topos = #str
        end
        return sub(str,frompos,topos)
    end

end
560
561
562
563
564
565
566
567
568
569
570
571
572
573
-- Builds a remapper from `mapping`.
--
-- mapping : a table (its keys are turned into a pattern by tabletopattern)
--           or a function (applied to every p_utf8character match)
-- option  : "pattern" returns just the lpeg pattern; "dynamic" (tables only)
--           rebuilds the pattern after a new key is added to the table; any
--           other value returns a function plus the pattern
-- action  : what a table match is mapped with; defaults to `mapping` itself
--
-- The returned functions give "" for a nil or empty string. Any other kind
-- of `mapping` yields a function that returns its argument (or "").
function utf.remapper(mapping,option,action)
    local variant = type(mapping)

    -- Wraps a ready-made pattern in a nil/empty safe function.
    local function applier(pattern)
        return function(str)
            if not str or str == "" then
                return ""
            end
            return lpegmatch(pattern,str)
        end
    end

    if variant == "table" then
        action = action or mapping
        if option == "dynamic" then
            local pattern = false
            -- every new key invalidates the cached pattern
            table.setmetatablenewindex(mapping,function(t,k,v) rawset(t,k,v) pattern = false end)
            return function(str)
                if not str or str == "" then
                    return ""
                end
                if not pattern then
                    pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0)
                end
                return lpegmatch(pattern,str)
            end
        end
        local pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0)
        if option == "pattern" then
            return pattern
        end
        return applier(pattern), pattern
    end

    if variant == "function" then
        local pattern = Cs((p_utf8character/mapping + p_utf8character)^0)
        if option == "pattern" then
            return pattern
        end
        return applier(pattern), pattern
    end

    return function(str)
        return str or ""
    end
end
624
625
626
627
-- Returns a function that applies the pattern built by
-- lpeg.replacer(t,false,false,true) to its string argument.
function utf.replacer(t)
    local pattern = replacer(t,false,false,true)
    local function apply(str)
        return lpegmatch(pattern,str)
    end
    return apply
end
634
-- Like utf.replacer, but the returned function first runs the finder built
-- from `t`. The string is returned untouched when the finder yields nothing
-- or a position beyond the end of the string; only otherwise is the
-- replacer applied.
function utf.subtituter(t)
    local locate  = finder(t)
    local replace = replacer(t,false,false,true)
    return function(str)
        local hit = lpegmatch(locate,str)
        if hit and hit <= #str then
            return lpegmatch(replace,str)
        end
        return str
    end
end
650
651
652
653
-- Splitters. All but the raw one skip an optional leading p_utfbom.
--   utflinesplitter     : table of lines, split at p_newline
--   utfcharsplitter_ows : table of characters, whitespace included
--   utfcharsplitter_iws : table of characters, runs of whitespace dropped
--   utfcharsplitter_raw : table of characters, no BOM handling
local utflinesplitter     = p_utfbom^-1 * lpeg.tsplitat(p_newline)
local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8character)^0)
local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8character))^0)
local utfcharsplitter_raw = Ct(C(p_utf8character)^0)

patterns.utflinesplitter = utflinesplitter

-- Splits `str` (nil counts as "") into a table of lines.
function utf.splitlines(str)
    return lpegmatch(utflinesplitter,str or "")
end

-- Splits `str` (nil counts as "") into a table of characters; whitespace is
-- left out when `ignorewhitespace` is set.
function utf.split(str,ignorewhitespace)
    if ignorewhitespace then
        return lpegmatch(utfcharsplitter_iws,str or "")
    else
        return lpegmatch(utfcharsplitter_ows,str or "")
    end
end

-- Splits `str` into a table of characters without any BOM handling. A nil
-- argument is now treated as the empty string, as in the two functions
-- above, instead of raising an error inside lpeg.match.
function utf.totable(str)
    return lpegmatch(utfcharsplitter_raw,str or "")
end
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
-- Reads up to four bytes from the open file `f`. When p_utfoffset reports an
-- offset below 4 for them, the file is repositioned to that offset (counted
-- from the start of the file). Returns what p_utftype captures for the bytes
-- that were read.
function utf.magic(f)
    local header = f:read(4) or ""
    local offset = lpegmatch(p_utfoffset,header)
    if offset < 4 then
        f:seek('set',offset)
    end
    return lpegmatch(p_utftype,header)
end
729
-- Forward declarations; the converters are assigned further down.
local utf16_to_utf8_be, utf16_to_utf8_le
local utf32_to_utf8_be, utf32_to_utf8_le

-- Optional byte order marks, one pattern per encoding and byte order.
local utf_16_be_getbom = patterns.utfbom_16_be^-1
local utf_16_le_getbom = patterns.utfbom_16_le^-1
local utf_32_be_getbom = patterns.utfbom_32_be^-1
local utf_32_le_getbom = patterns.utfbom_32_le^-1

-- Line splitters: skip the optional BOM, then split at the newline pattern
-- that belongs to the same encoding and byte order.
local tsplitat = lpeg.tsplitat

local utf_16_be_linesplitter = utf_16_be_getbom * tsplitat(patterns.utf_16_be_nl)
local utf_16_le_linesplitter = utf_16_le_getbom * tsplitat(patterns.utf_16_le_nl)
local utf_32_be_linesplitter = utf_32_be_getbom * tsplitat(patterns.utf_32_be_nl)
local utf_32_le_linesplitter = utf_32_le_getbom * tsplitat(patterns.utf_32_le_nl)
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
-- Pending high surrogate (0 when there is none); shared by both UTF-16
-- patterns and reset at the start of every conversion.
local more = 0

-- Turns one 16 bit unit into UTF-8. A unit in the range 0xD800-0xDBFF is
-- remembered and yields ""; the unit that follows it is combined with it
-- into a single character.
local function fromunit(now)
    if more > 0 then
        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
        more = 0
        return utfchar(now)
    end
    if now >= 0xD800 and now <= 0xDBFF then
        more = now
        return ""
    end
    return utfchar(now)
end

local p_utf16_to_utf8_be = C(1) * C(1) / function(left,right)
    return fromunit(256*byte(left) + byte(right))
end

local p_utf16_to_utf8_le = C(1) * C(1) / function(right,left)
    return fromunit(256*byte(left) + byte(right))
end

local p_utf32_to_utf8_be = C(1) * C(1) * C(1) * C(1) / function(a,b,c,d)
    return utfchar(256*256*256*byte(a) + 256*256*byte(b) + 256*byte(c) + byte(d))
end

local p_utf32_to_utf8_le = C(1) * C(1) * C(1) * C(1) / function(a,b,c,d)
    return utfchar(256*256*256*byte(d) + 256*256*byte(c) + 256*byte(b) + byte(a))
end

-- Complete converters: clear the surrogate state, skip an optional BOM and
-- substitute every unit.
local reset = P(true) / function() more = 0 end

p_utf16_to_utf8_be = reset * utf_16_be_getbom * Cs(p_utf16_to_utf8_be^0)
p_utf16_to_utf8_le = reset * utf_16_le_getbom * Cs(p_utf16_to_utf8_le^0)
p_utf32_to_utf8_be = reset * utf_32_be_getbom * Cs(p_utf32_to_utf8_be^0)
p_utf32_to_utf8_le = reset * utf_32_le_getbom * Cs(p_utf32_to_utf8_le^0)

patterns.utf16_to_utf8_be = p_utf16_to_utf8_be
patterns.utf16_to_utf8_le = p_utf16_to_utf8_le
patterns.utf32_to_utf8_be = p_utf32_to_utf8_be
patterns.utf32_to_utf8_le = p_utf32_to_utf8_le
970
-- Builds a string converter: nil and "" are returned as they are, any other
-- string is run through `pattern`.
local function stringconverter(pattern)
    return function(s)
        if s and s ~= "" then
            return lpegmatch(pattern,s)
        end
        return s
    end
end

-- Builds a table converter: a string is first split into lines with
-- `splitter`, then every non-empty entry of the table is converted in place
-- with `pattern`. Returns the table, or nil when nothing was passed.
local function tableconverter(splitter,pattern)
    return function(t)
        if not t then
            return nil
        end
        if type(t) == "string" then
            t = lpegmatch(splitter,t)
        end
        for i=1,#t do
            local line = t[i]
            if line ~= "" then
                t[i] = lpegmatch(pattern,line)
            end
        end
        return t
    end
end

utf16_to_utf8_be = stringconverter(p_utf16_to_utf8_be)
utf16_to_utf8_le = stringconverter(p_utf16_to_utf8_le)
utf32_to_utf8_be = stringconverter(p_utf32_to_utf8_be)
utf32_to_utf8_le = stringconverter(p_utf32_to_utf8_le)

local utf16_to_utf8_be_t = tableconverter(utf_16_be_linesplitter,p_utf16_to_utf8_be)
local utf16_to_utf8_le_t = tableconverter(utf_16_le_linesplitter,p_utf16_to_utf8_le)
local utf32_to_utf8_be_t = tableconverter(utf_32_be_linesplitter,p_utf32_to_utf8_be)
local utf32_to_utf8_le_t = tableconverter(utf_32_le_linesplitter,p_utf32_to_utf8_le)

utf.utf16_to_utf8_le_t = utf16_to_utf8_le_t
utf.utf16_to_utf8_be_t = utf16_to_utf8_be_t
utf.utf32_to_utf8_le_t = utf32_to_utf8_le_t
utf.utf32_to_utf8_be_t = utf32_to_utf8_be_t

utf.utf16_to_utf8_le   = utf16_to_utf8_le
utf.utf16_to_utf8_be   = utf16_to_utf8_be
utf.utf32_to_utf8_le   = utf32_to_utf8_le
utf.utf32_to_utf8_be   = utf32_to_utf8_be
1072
-- Splits a string into a table of lines; anything else is returned as is.
function utf.utf8_to_utf8_t(t)
    if type(t) == "string" then
        return lpegmatch(utflinesplitter,t) or t
    end
    return t
end

-- Converts UTF-16 (a string or a table of lines) into a table of UTF-8
-- lines. A true `endian` selects the big endian converter, otherwise the
-- little endian one is used. A nil or false `t` is returned unchanged.
function utf.utf16_to_utf8_t(t,endian)
    if not t then
        return t
    end
    if endian then
        return utf16_to_utf8_be_t(t)
    end
    return utf16_to_utf8_le_t(t)
end

-- Same as utf.utf16_to_utf8_t, for UTF-32 input.
function utf.utf32_to_utf8_t(t,endian)
    if not t then
        return t
    end
    if endian then
        return utf32_to_utf8_be_t(t)
    end
    return utf32_to_utf8_le_t(t)
end
1084
-- UTF-8 to UTF-16 encoders; only defined when the bit32 library is present.
if bit32 then

    local rshift = bit32.rshift

    -- Encodes code point `b` as little endian UTF-16; values from 0x10000
    -- upwards become a surrogate pair.
    local function little(b)
        if b >= 0x10000 then
            b = b - 0x10000
            local high = rshift(b,10) + 0xD800
            local low  = b%1024 + 0xDC00
            return char(high%256,rshift(high,8),low%256,rshift(low,8))
        end
        return char(b%256,rshift(b,8))
    end

    -- Same as `little`, with the bytes of every unit swapped.
    local function big(b)
        if b >= 0x10000 then
            b = b - 0x10000
            local high = rshift(b,10) + 0xD800
            local low  = b%1024 + 0xDC00
            return char(rshift(high,8),high%256,rshift(low,8),low%256)
        end
        return char(rshift(b,8),b%256)
    end

    -- Bytes that p_utf8byte does not accept are dropped.
    local l_remap = Cs((p_utf8byte/little + P(1)/"")^0)
    local b_remap = Cs((p_utf8byte/big    + P(1)/"")^0)

    -- Big endian result, prefixed with the bytes 254 255 unless `nobom`.
    local function utf8_to_utf16_be(str,nobom)
        local converted = lpegmatch(b_remap,str)
        if nobom then
            return converted
        end
        return char(254,255) .. converted
    end

    -- Little endian result, prefixed with the bytes 255 254 unless `nobom`.
    local function utf8_to_utf16_le(str,nobom)
        local converted = lpegmatch(l_remap,str)
        if nobom then
            return converted
        end
        return char(255,254) .. converted
    end

    utf.utf8_to_utf16_be = utf8_to_utf16_be
    utf.utf8_to_utf16_le = utf8_to_utf16_le

    -- Picks the byte order from `littleendian`; big endian is the default.
    function utf.utf8_to_utf16(str,littleendian,nobom)
        local convert = littleendian and utf8_to_utf16_le or utf8_to_utf16_be
        return convert(str,nobom)
    end

end
1142
-- Formats the first code point without, and all following ones with, the
-- separator that is passed as extra match argument.
local function firstcode(unicode)
    return format("0x%04X",unicode)
end

local function nextcode(unicode,separator)
    return format("%s0x%04X",separator,unicode)
end

local pattern = Cs (
    (p_utf8byte / firstcode) *
    (p_utf8byte * Carg(1) / nextcode)^0
)

-- Returns the characters of `str` as "0x...." codes (at least four hex
-- digits each) joined by `separator`, which defaults to a single space.
function utf.tocodes(str,separator)
    return lpegmatch(pattern,str,1,separator or " ")
end
1151
-- Formats a code point as "U+" followed by at least five hex digits. `s` is
-- either the code point itself or a string that utfbyte turns into one.
function utf.ustring(s)
    if type(s) ~= "number" then
        s = utfbyte(s)
    end
    return format("U+%05X",s)
end

-- Same as utf.ustring, with a "0x" prefix.
function utf.xstring(s)
    if type(s) ~= "number" then
        s = utfbyte(s)
    end
    return format("0x%05X",s)
end
1159
-- Converts `str` to UTF-8 based on what p_utfstricttype reports for it.
-- Returns nil for a nil or empty string. For "utf-8" the first three bytes
-- are dropped (presumably the BOM that made the type detectable - confirm
-- against p_utfstricttype). The two UTF-16 types are converted, and anything
-- else is returned as it is.
function utf.toeight(str)
    if not str or str == "" then
        return nil
    end
    local utftype = lpegmatch(p_utfstricttype,str)
    if utftype == "utf-8" then
        return sub(str,4)
    end
    if utftype == "utf-16-be" then
        return utf16_to_utf8_be(str)
    end
    if utftype == "utf-16-le" then
        return utf16_to_utf8_le(str)
    end
    return str
end
1175
1176
1177
do

    -- Every character that is not part of a hit is removed, every hit is
    -- replaced by one space, so the length of the result is the hit count.
    local p_nany = p_utf8character / ""
    local cache  = { }

    -- Counts the non-overlapping occurrences of `what` in `str`. `what` is a
    -- string or anything else lpeg.P accepts. Patterns built for strings are
    -- cached by that string.
    function utf.count(str,what)
        if type(what) == "string" then
            local p = cache[what]
            if not p then
                p = Cs((P(what)/" " + p_nany)^0)
                -- The pattern used to be stored under itself (cache[p]), so
                -- the lookup above never succeeded and the table kept growing.
                cache[what] = p
            end
            return #lpegmatch(p,str)
        else
            return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
        end
    end

end
1197
-- Provide string.utfvalues, an iterator over the code points of a string,
-- when no built-in is present.
if not string.utfvalues then

    local find = string.find

    -- Iterator for the empty string: ends right away.
    local dummy = function()
    end

    -- Returns an iterator function that yields one code point (via utfbyte)
    -- per call and nothing once the string is exhausted.
    function string.utfvalues(str)
        local n = #str
        if n == 0 then
            return dummy
        elseif n == 1 then
            -- The single value must be delivered exactly once; returning it
            -- on every call made a generic for loop run forever.
            local done = false
            return function()
                if not done then
                    done = true
                    return utfbyte(str)
                end
            end
        else
            local p = 1
            return function()
                -- one byte plus the continuation bytes following it
                local b, e = find(str,".[\128-\191]*",p)
                if b then
                    p = e + 1
                    return utfbyte(sub(str,b,e))
                end
            end
        end
    end

end

utf.values = string.utfvalues
1274
-- Maps a byte value onto a sequence length by comparing it with the
-- thresholds 0x80, 0xE0, 0xF0, 0xF8, 0xFC and 0xFE; values from 0xFE upwards
-- give 0. Presumably `u` is the first byte of a UTF-8 sequence - confirm
-- against callers.
function utf.chrlen(u)
    if u < 0x80 then
        return 1
    elseif u < 0xE0 then
        return 2
    elseif u < 0xF0 then
        return 3
    elseif u < 0xF8 then
        return 4
    elseif u < 0xFC then
        return 5
    elseif u < 0xFE then
        return 6
    end
    return 0
end
1284
1285
1286
1287
1288
-- Only defined when the bit32 library is present.
if bit32 then

    local extract = bit32.extract
    local char    = string.char

    -- Returns `n` as a four byte string, least significant byte first;
    -- the bytes that are not needed are zero.
    function utf.toutf32string(n)
        if n <= 0xFF then
            return char(n) .. "\000\000\000"
        end
        local b0 = extract(n, 0,8)
        local b1 = extract(n, 8,8)
        if n <= 0xFFFF then
            return char(b0,b1,0,0)
        end
        local b2 = extract(n,16,8)
        if n <= 0xFFFFFF then
            return char(b0,b1,b2,0)
        end
        return char(b0,b1,b2,extract(n,24,8))
    end

end
1320
1321
1322
local len = utf.len
-- This used to read the global `rep`, which is not the string library
-- function, so every call that needed padding failed.
local rep = string.rep

-- Pads `s` to a width of |n| characters, measured with utf.len.
--
-- s : the string to pad
-- n : wanted width; a positive value pads on the left, a negative value
--     pads on the right, nil or 0 leaves the string alone
-- c : optional padding string, a space by default (the body already used
--     `c`, but it was never declared as a parameter)
--
-- Strings that are already wide enough are returned unchanged.
function string.utfpadd(s,n,c)
    if n and n ~= 0 then
        local l = len(s)
        if n > 0 then
            local d = n - l
            if d > 0 then
                return rep(c or " ",d) .. s
            end
        else
            local d = - n - l
            if d > 0 then
                return s .. rep(c or " ",d)
            end
        end
    end
    return s
end
1343
1344
1345
-- UTF-8 aware counterparts of lpeg.P, lpeg.S and lpeg.R.
do

    local utfcharacters = utf.characters or string.utfcharacters
    local utfchar       = utf.char       or string.utfcharacter

    -- A literal UTF-8 string is just a byte string, so P can be used as is.
    lpeg.UP = P

    if utfcharacters then

        -- Matches any one of the characters of `str`.
        function lpeg.US(str)
            local p = P(false)
            for uc in utfcharacters(str) do
                p = p + P(uc)
            end
            return p
        end

    else

        -- Same, with the characters collected by an lpeg match.
        function lpeg.US(str)
            local p = P(false)
            local f = function(uc)
                p = p + P(uc)
            end
            lpegmatch((p_utf8char/f)^0,str)
            return p
        end

    end

    -- Yields the code points of the first two characters of a string, or
    -- false when there are fewer than two.
    local range = p_utf8byte * p_utf8byte + Cc(false)

    -- Builds a pattern for the characters from `first` up to `last`.
    --
    -- str  : a string whose first two characters are the bounds, or the
    --        number of the first code point
    -- more : the last code point when `str` is a number (defaults to `str`)
    --
    -- A string from which no two characters can be taken is matched
    -- literally.
    function lpeg.UR(str,more)
        local first, last
        if type(str) == "number" then
            first = str
            last  = more or first
        else
            first, last = lpegmatch(range,str)
            if not last then
                return P(str)
            end
        end
        if not utfchar then
            utfchar = utf.char
        end
        if first == last then
            -- A one character range. This used to return P(str), which for a
            -- number matches that many arbitrary bytes and for a string like
            -- "aa" matches both characters in a row.
            return P(utfchar and utfchar(first) or str)
        end
        if utfchar and (last - first < 8) then
            -- small ranges are spelled out as an ordered choice
            local p = P(false)
            for i=first,last do
                p = p + P(utfchar(i))
            end
            return p
        else
            local f = function(b)
                return b >= first and b <= last
            end
            -- NOTE(review): this is a function capture, so the pattern
            -- appears to match any character and to produce true or false as
            -- its capture instead of failing outside the range - confirm
            -- that callers expect this.
            return p_utf8byte / f
        end
    end

end
1413 |