-- Register the metadata of this file in the global modules table, creating
-- that table when it does not exist yet.
modules = modules or { }

modules['l-unicode'] = {
    version   = 1.001,
    optimize  = true,
    comment   = "companion to luat-lib.mkxl",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
9
10
11
12
13
14
15
16
17
18
19
20
21
-- Reuse an already existing global utf table or start with an empty one; the
-- helpers defined in this file are stored in it.
utf     = utf or { }
-- Clear the global named unicode.
unicode = nil
24
25local type = type
26local char, byte, format, sub, gmatch, rep = string.char, string.byte, string.format, string.sub, string.gmatch, string.rep
27local concat = table.concat
28local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp
29
30local lpegmatch = lpeg.match
31local patterns = lpeg.patterns
32local tabletopattern = lpeg.utfchartabletopattern
33
34local finder = lpeg.finder
35local replacer = lpeg.replacer
36
37local p_utftype = patterns.utftype
38local p_utfstricttype = patterns.utfstricttype
39local p_utfoffset = patterns.utfoffset
40local p_utf8character = patterns.utf8character
41local p_utf8char = patterns.utf8char
42local p_utf8byte = patterns.utf8byte
43local p_utfbom = patterns.utfbom
44local p_newline = patterns.newline
45local p_whitespace = patterns.whitespace
46
47local utfchar = string.utfcharacter
48local utfbyte = string.utfvalue
49local utflength = string.utflength
50local utfcharacters = string.utfcharacters
51local utfbytepairs = string.bytepairs
52
53
54
55
56
57
58
59
-- Make the utf helpers that were taken from the string table available in the
-- utf table as well. Both len and length refer to the same function.
utf.char       = utfchar
utf.byte       = utfbyte
utf.len        = utflength
utf.length     = utflength
utf.characters = utfcharacters
utf.bytepairs  = utfbytepairs
66
-- Report what the utftype pattern makes of the given data. The string
-- "unknown" is returned when there is no data or when the match gives no
-- (or a false) result.
function utf.filetype(data)
    if not data then
        return "unknown"
    end
    return lpegmatch(p_utftype,data) or "unknown"
end
70
do

    -- Replacement for a character matched by one of the multi-byte patterns:
    -- a code below 127 is kept as it is, anything else is written as an entity
    -- with uppercase hexadecimal digits.
    -- NOTE(review): the entity has no "x" after the "#" although the digits are
    -- hexadecimal; confirm that the consumers expect this form.
    local function entity(s)
        local b = utfbyte(s)
        if b < 127 then
            return s
        end
        return format("&#%X;",b)
    end

    local multibyte = patterns.utf8two + patterns.utf8three + patterns.utf8four

    -- Characters matched by utf8one pass unchanged, the other ones go through
    -- the entity function.
    local toentities = Cs((patterns.utf8one + multibyte / entity)^0)

    patterns.toentities = toentities

    -- Apply the substitution pattern to str and return the match result.
    function utf.toentities(str)
        return lpegmatch(toentities,str)
    end

end
91
do

    -- Conversion of utf-16 data that starts with a byte order mark into utf-8.
    --
    -- A surrogate pair is a high surrogate (0xD800-0xDBFF) followed by a low
    -- surrogate (0xDC00-0xDFFF), so the most significant byte of the first unit
    -- lies in 0xD8-0xDB and that of the second unit in 0xDC-0xDF. The byte
    -- ranges are spelled as raw bytes because lpeg.R expects two-character
    -- strings; building them with utfchar gives multi-byte strings and ranges
    -- that never match.

    local one  = P(1)
    local two  = C(1) * C(1)
    local high = C(R("\216\219"))
    local low  = C(R("\220\223"))

    -- In big endian data the most significant byte of a unit comes first, in
    -- little endian data it comes second.
    local four_be = high * C(1) * low * C(1)
    local four_le = C(1) * high * C(1) * low

    -- Combine a surrogate pair; a and c are the most significant bytes.
    local function pair(a,b,c,d)
        local ab = 256 * byte(a) + byte(b)
        local cd = 256 * byte(c) + byte(d)
        return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
    end

    -- Convert one two-byte unit; a is the most significant byte.
    local function single(a,b)
        return utfchar(byte(a)*256 + byte(b))
    end

    -- The trailing "one" keeps a dangling byte as it is. At least one byte has
    -- to follow the byte order mark, otherwise the match fails.
    local pattern =
        P("\254\255") * Cs( (
            four_be / pair
          + two     / single
          + one
        )^1 )
      + P("\255\254") * Cs( (
            four_le / function(b,a,d,c) return pair(a,b,c,d) end
          + two     / function(b,a) return single(a,b) end
          + one
        )^1 )

    -- Return the utf-8 version of s when it starts with a utf-16 byte order
    -- mark that is followed by data, otherwise return s unchanged.
    function string.toutf(s)
        return lpegmatch(pattern,s) or s
    end

end
127
do

    local valid   = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four
    local invalid = P(1) / "�"

    -- Anything matched by one of the four utf8 patterns is kept and every other
    -- byte is replaced by the replacement character.
    local validatedutf = Cs((valid + invalid)^0)

    patterns.validatedutf = validatedutf

    -- For a string the result of the substitution is returned (so not a
    -- boolean), for any other value false is returned.
    function utf.is_valid(str)
        if type(str) ~= "string" then
            return false
        end
        return lpegmatch(validatedutf,str) or false
    end

end
147
-- A character (instead of byte) based variant of string.sub that is only
-- installed when there is no utf.sub yet.

if not utf.sub then

    -- State shared by utf.sub and the match-time callbacks below: b and e are
    -- the byte positions that get passed to string.sub, n counts the characters
    -- seen so far, and first and last are the counts at which b and e are set.

    local b, e, n, first, last = 0, 0, 0, 0, 0

    -- The callbacks get the subject and the position after the character that
    -- was just matched. Returning a position or true keeps the repetition going
    -- while returning nothing makes the match-time capture fail, which ends it.

    -- Only look for the end: once last characters are seen, e becomes the last
    -- byte of that character and the scan stops.
    local function slide_zero(s,p)
        n = n + 1
        if n >= last then
            e = p - 1
        else
            return p
        end
    end

    -- Look for begin and end: b becomes the position right after character
    -- number first, the end is handled as in slide_zero.
    local function slide_one(s,p)
        n = n + 1
        if n == first then
            b = p
        end
        if n >= last then
            e = p - 1
        else
            return p
        end
    end

    -- Only look for the begin: the scan stops as soon as b is set.
    local function slide_two(s,p)
        n = n + 1
        if n == first then
            b = p
        else
            return true
        end
    end

    local pattern_zero  = Cmt(p_utf8character,slide_zero)^0
    local pattern_one   = Cmt(p_utf8character,slide_one )^0
    local pattern_two   = Cmt(p_utf8character,slide_two )^0

    -- Captures just the first character.
    local pattern_first = C(p_utf8character)

    -- Return the characters start up to and including stop of str. Negative
    -- values count from the end and a start of zero is treated as one. Without
    -- start the string itself is returned and without stop everything from
    -- start on.
    function utf.sub(str,start,stop)
        if not start then
            return str
        end
        if start == 0 then
            start = 1
        end
        if not stop then
            -- Here start is turned into the number of characters to skip.
            if start < 0 then
                local l = utflength(str)
                start = l + start
            else
                start = start - 1
            end
            b, n, first = 0, 0, start
            lpegmatch(pattern_two,str)
            -- When fewer than first characters were seen there is nothing left
            -- to return.
            if n >= first then
                return sub(str,b)
            else
                return ""
            end
        end
        if start < 0 or stop < 0 then
            -- Turn negative values into positive character indices.
            local l = utf.length(str)
            if start < 0 then
                start = l + start
                if start <= 0 then
                    start = 1
                else
                    start = start + 1
                end
            end
            if stop < 0 then
                stop = l + stop
                if stop == 0 then
                    stop = 1
                else
                    stop = stop + 1
                end
            end
        end
        if start == 1 and stop == 1 then
            -- Only the first character is asked for.
            return lpegmatch(pattern_first,str) or ""
        elseif start > stop then
            return ""
        elseif start > 1 then
            -- The begin is the position after character start - 1.
            b, e, n, first, last = 0, 0, 0, start - 1, stop
            lpegmatch(pattern_one,str)
            -- No end was recorded, so the string is shorter than stop
            -- characters and we take everything up to its last byte.
            if n >= first and e == 0 then
                e = #str
            end
            return sub(str,b,e)
        else
            -- We start at the first byte and only need the end.
            b, e, n, last = 1, 0, 0, stop
            lpegmatch(pattern_zero,str)
            if e == 0 then
                e = #str
            end
            return sub(str,b,e)
        end
    end

end
280
-- Create a function that remaps the characters of a string.
--
-- mapping : a table (its keys are turned into a pattern and the matches are
--           passed through action, which defaults to the table itself) or a
--           function (applied to each character)
-- option  : "pattern" returns the lpeg pattern instead of a function,
--           "dynamic" (tables only) rebuilds the pattern after a new key has
--           been assigned to the mapping, anything else gives a function
--           followed by the pattern that it uses
-- action  : optional replacement for a table mapping
--
-- Any other kind of mapping results in a function that returns its argument,
-- or an empty string when there is none.
function utf.remapper(mapping,option,action)

    -- Wrap a pattern in a function that maps nil, false and "" onto "".
    local function applier(pattern)
        return function(str)
            if not str or str == "" then
                return ""
            end
            return lpegmatch(pattern,str)
        end
    end

    local variant = type(mapping)

    if variant == "function" then
        local pattern = Cs((p_utf8character/mapping + p_utf8character)^0)
        if option == "pattern" then
            return pattern
        end
        return applier(pattern), pattern
    end

    if variant ~= "table" then
        return function(str)
            return str or ""
        end
    end

    action = action or mapping

    if option == "dynamic" then
        -- The pattern is built on first use and dropped again as soon as a new
        -- key is assigned to the mapping.
        local pattern = false
        table.setmetatablenewindex(mapping,function(t,k,v) rawset(t,k,v) pattern = false end)
        return function(str)
            if not str or str == "" then
                return ""
            end
            if not pattern then
                pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0)
            end
            return lpegmatch(pattern,str)
        end
    end

    local pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0)
    if option == "pattern" then
        return pattern
    end
    return applier(pattern), pattern

end
331
332
333
334
-- Return a function that runs the replacer pattern made from t over a string;
-- there is no check beforehand, the pattern is always applied.
function utf.replacer(t)
    local pattern = replacer(t,false,false,true)
    local function apply(str)
        return lpegmatch(pattern,str)
    end
    return apply
end
341
-- Return a function that first runs the finder pattern made from t and only
-- applies the replacer pattern when that reports a position within the string;
-- in all other cases the string itself is returned.
function utf.subtituter(t)
    local find    = finder  (t)
    local replace = replacer(t,false,false,true)
    return function(str)
        local position = lpegmatch(find,str)
        if position and position <= #str then
            return lpegmatch(replace,str)
        end
        return str
    end
end
357
358
359
360
-- Splitters: the line splitter and the ows and iws character splitters skip an
-- optional byte order mark, the raw one does not. The iws variant matches runs
-- of whitespace without capturing them.
local utfcharsplitter_raw = Ct(C(p_utf8character)^0)
local utfcharsplitter_ows = p_utfbom^-1 * utfcharsplitter_raw
local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8character))^0)
local utflinesplitter     = p_utfbom^-1 * lpeg.tsplitat(p_newline)

patterns.utflinesplitter = utflinesplitter
367
-- Apply the line splitter to str; a missing string is treated as an empty one.
function utf.splitlines(str)
    if not str then
        str = ""
    end
    return lpegmatch(utflinesplitter,str)
end
371
-- Split str into a table of characters. With ignorewhitespace set the splitter
-- that does not capture whitespace is used. A missing string is treated as an
-- empty one.
function utf.split(str,ignorewhitespace)
    local splitter = ignorewhitespace and utfcharsplitter_iws or utfcharsplitter_ows
    return lpegmatch(splitter,str or "")
end
379
-- Split str into a table of characters using the splitter that does not skip
-- a byte order mark.
function utf.totable(str)
    local characters = lpegmatch(utfcharsplitter_raw,str)
    return characters
end
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
-- Read up to four bytes from the file handle f and return what the utftype
-- pattern makes of them. When the offset reported by the utfoffset pattern is
-- less than four, the file position is set to that offset.
function utf.magic(f)
    local head   = f:read(4) or ""
    local offset = lpegmatch(p_utfoffset,head)
    if offset < 4 then
        f:seek('set',offset)
    end
    return lpegmatch(p_utftype,head)
end
408
-- Patterns that match an optional byte order mark, one for each combination of
-- utf-16/utf-32 and big/little endian.
local utf_16_be_getbom = patterns.utfbom_16_be^-1
local utf_16_le_getbom = patterns.utfbom_16_le^-1
local utf_32_be_getbom = patterns.utfbom_32_be^-1
local utf_32_le_getbom = patterns.utfbom_32_le^-1

-- Line splitters for the same four encodings: the optional byte order mark is
-- followed by a split at the newline pattern of that encoding.
local utf_16_be_linesplitter = utf_16_be_getbom * lpeg.tsplitat(patterns.utf_16_be_nl)
local utf_16_le_linesplitter = utf_16_le_getbom * lpeg.tsplitat(patterns.utf_16_le_nl)
local utf_32_be_linesplitter = utf_32_be_getbom * lpeg.tsplitat(patterns.utf_32_be_nl)
local utf_32_le_linesplitter = utf_32_le_getbom * lpeg.tsplitat(patterns.utf_32_le_nl)
418
-- A high surrogate that still waits for its partner; zero means that there is
-- none pending.
local more = 0

-- Forget a pending surrogate; this runs at the start of each conversion.
local function reset()
    more = 0
end

-- Turn one utf-16 unit into utf-8. A high surrogate (0xD800-0xDBFF) is
-- remembered and gives an empty string; the next unit is then combined with
-- it.
local function utf16_unit(now)
    if more > 0 then
        local high = more
        more = 0
        return utfchar((high-0xD800)*0x400 + (now-0xDC00) + 0x10000)
    end
    if now >= 0xD800 and now <= 0xDBFF then
        more = now
        return ""
    end
    return utfchar(now)
end

-- Turn four bytes, most significant one first, into utf-8.
local function utf32_unit(a,b,c,d)
    return utfchar(((byte(a)*256 + byte(b))*256 + byte(c))*256 + byte(d))
end

local p_utf16_to_utf8_be = C(1) * C(1) / function(left,right)
    return utf16_unit(256*byte(left) + byte(right))
end

local p_utf16_to_utf8_le = C(1) * C(1) / function(right,left)
    return utf16_unit(256*byte(left) + byte(right))
end

local p_utf32_to_utf8_be = C(1) * C(1) * C(1) * C(1) / function(a,b,c,d)
    return utf32_unit(a,b,c,d)
end

local p_utf32_to_utf8_le = C(1) * C(1) * C(1) * C(1) / function(a,b,c,d)
    return utf32_unit(d,c,b,a)
end

-- The complete converters: reset the state, skip an optional byte order mark
-- and substitute unit by unit.
p_utf16_to_utf8_be = P(true) / reset * utf_16_be_getbom * Cs(p_utf16_to_utf8_be^0)
p_utf16_to_utf8_le = P(true) / reset * utf_16_le_getbom * Cs(p_utf16_to_utf8_le^0)
p_utf32_to_utf8_be = P(true) / reset * utf_32_be_getbom * Cs(p_utf32_to_utf8_be^0)
p_utf32_to_utf8_le = P(true) / reset * utf_32_le_getbom * Cs(p_utf32_to_utf8_le^0)

patterns.utf16_to_utf8_be = p_utf16_to_utf8_be
patterns.utf16_to_utf8_le = p_utf16_to_utf8_le
patterns.utf32_to_utf8_be = p_utf32_to_utf8_be
patterns.utf32_to_utf8_le = p_utf32_to_utf8_le
466
-- The string converters return nil, false and "" as they are and apply the
-- conversion pattern to anything else.
local function stringconverter(pattern)
    return function(s)
        if s and s ~= "" then
            return lpegmatch(pattern,s)
        else
            return s
        end
    end
end

-- The table converters return nil when there is no input. A string is first
-- split into lines; after that every non-empty entry of the table is converted
-- in place and the table is returned.
local function tableconverter(splitter,pattern)
    return function(t)
        if not t then
            return nil
        end
        if type(t) == "string" then
            t = lpegmatch(splitter,t)
        end
        for i=1,#t do
            local s = t[i]
            if s ~= "" then
                t[i] = lpegmatch(pattern,s)
            end
        end
        return t
    end
end

local utf16_to_utf8_be   = stringconverter(p_utf16_to_utf8_be)
local utf16_to_utf8_be_t = tableconverter (utf_16_be_linesplitter,p_utf16_to_utf8_be)

local utf16_to_utf8_le   = stringconverter(p_utf16_to_utf8_le)
local utf16_to_utf8_le_t = tableconverter (utf_16_le_linesplitter,p_utf16_to_utf8_le)

local utf32_to_utf8_be   = stringconverter(p_utf32_to_utf8_be)
local utf32_to_utf8_be_t = tableconverter (utf_32_be_linesplitter,p_utf32_to_utf8_be)

local utf32_to_utf8_le   = stringconverter(p_utf32_to_utf8_le)
local utf32_to_utf8_le_t = tableconverter (utf_32_le_linesplitter,p_utf32_to_utf8_le)

utf.utf16_to_utf8_le_t = utf16_to_utf8_le_t
utf.utf16_to_utf8_be_t = utf16_to_utf8_be_t
utf.utf32_to_utf8_le_t = utf32_to_utf8_le_t
utf.utf32_to_utf8_be_t = utf32_to_utf8_be_t

utf.utf16_to_utf8_le   = utf16_to_utf8_le
utf.utf16_to_utf8_be   = utf16_to_utf8_be
utf.utf32_to_utf8_le   = utf32_to_utf8_le
utf.utf32_to_utf8_be   = utf32_to_utf8_be
568
-- Split a string into lines; anything that is not a string is returned as it
-- is.
function utf.utf8_to_utf8_t(t)
    if type(t) == "string" then
        return lpegmatch(utflinesplitter,t) or t
    end
    return t
end
572
-- Convert utf-16 lines to utf-8. With endian set the big endian converter is
-- tried first; when that is not the case or when it gives nothing, the little
-- endian one is used, and when that gives nothing either t itself is returned.
function utf.utf16_to_utf8_t(t,endian)
    local result = endian and utf16_to_utf8_be_t(t)
    if not result then
        result = utf16_to_utf8_le_t(t)
    end
    return result or t
end
576
-- Convert utf-32 lines to utf-8. With endian set the big endian converter is
-- tried first; when that is not the case or when it gives nothing, the little
-- endian one is used, and when that gives nothing either t itself is returned.
function utf.utf32_to_utf8_t(t,endian)
    local result = endian and utf32_to_utf8_be_t(t)
    if not result then
        result = utf32_to_utf8_le_t(t)
    end
    return result or t
end
580
do

    -- Split a code point into utf-16 units: a value below 0x10000 is one unit,
    -- anything else becomes a high and a low surrogate.
    local function units(b)
        if b < 0x10000 then
            return b
        end
        b = b - 0x10000
        return (b>>10) + 0xD800, b%1024 + 0xDC00
    end

    -- Least significant byte of each unit first.
    local function little(b)
        local u1, u2 = units(b)
        if u2 then
            return char(u1%256,(u1>>8),u2%256,(u2>>8))
        end
        return char(u1%256,(u1>>8))
    end

    -- Most significant byte of each unit first.
    local function big(b)
        local u1, u2 = units(b)
        if u2 then
            return char((u1>>8),u1%256,(u2>>8),u2%256)
        end
        return char((u1>>8),u1%256)
    end

    -- A byte that is not matched by the utf8byte pattern is dropped.
    local drop    = P(1)/""
    local l_remap = Cs((p_utf8byte/little + drop)^0)
    local b_remap = Cs((p_utf8byte/big    + drop)^0)

    -- Convert to big endian utf-16; the byte order mark 0xFE 0xFF is put in
    -- front unless nobom is set.
    local function utf8_to_utf16_be(str,nobom)
        local result = lpegmatch(b_remap,str)
        if nobom then
            return result
        end
        return char(254,255) .. result
    end

    -- Convert to little endian utf-16; the byte order mark 0xFF 0xFE is put in
    -- front unless nobom is set.
    local function utf8_to_utf16_le(str,nobom)
        local result = lpegmatch(l_remap,str)
        if nobom then
            return result
        end
        return char(255,254) .. result
    end

    utf.utf8_to_utf16_be = utf8_to_utf16_be
    utf.utf8_to_utf16_le = utf8_to_utf16_le

    -- Big endian is the default, littleendian selects the other byte order.
    function utf.utf8_to_utf16(str,littleendian,nobom)
        local convert = littleendian and utf8_to_utf16_le or utf8_to_utf16_be
        return convert(str,nobom)
    end

end
636
-- The first code is written as 0x followed by at least four uppercase
-- hexadecimal digits; every following one is preceded by the separator that is
-- passed as extra argument to the match.
local pattern = Cs (
    (p_utf8byte / function(code)
        return format("0x%04X",code)
    end) *
    (p_utf8byte * Carg(1) / function(code,sep)
        return format("%s0x%04X",sep,code)
    end)^0
)

-- Return the codes of the characters in str; the separator defaults to a
-- space.
function utf.tocodes(str,separator)
    if not separator then
        separator = " "
    end
    return lpegmatch(pattern,str,1,separator)
end
645
-- Format a code as U+ followed by at least five uppercase hexadecimal digits.
-- A number is used as it is, anything else goes through utfbyte first.
function utf.ustring(s)
    local code = s
    if type(s) ~= "number" then
        code = utfbyte(s)
    end
    return format("U+%05X",code)
end
649
-- Format a code as 0x followed by at least five uppercase hexadecimal digits.
-- A number is used as it is, anything else goes through utfbyte first.
function utf.xstring(s)
    local code = s
    if type(s) ~= "number" then
        code = utfbyte(s)
    end
    return format("0x%05X",code)
end
653
-- Convert str to utf-8 based on what the strict utftype pattern reports. Input
-- flagged as utf-8 loses its first three bytes, input flagged as utf-16 goes
-- through the matching converter and anything else is returned as it is. Nil
-- is returned when there is no string or when it is empty.
function utf.toeight(str)
    if not str or str == "" then
        return nil
    end
    local utftype = lpegmatch(p_utfstricttype,str)
    if utftype == "utf-16-be" then
        return utf16_to_utf8_be(str)
    end
    if utftype == "utf-16-le" then
        return utf16_to_utf8_le(str)
    end
    if utftype == "utf-8" then
        return sub(str,4)
    end
    return str
end
669
do

    -- Any character that is not what we look for disappears.
    local p_nany = p_utf8character / ""

    -- Counting patterns, indexed by the string that they look for.
    local cache  = { }

    -- Count how often what occurs in str: every occurrence is replaced by one
    -- space and everything else is removed, so the length of the result is the
    -- count. Patterns for strings are cached; any other value that lpeg.P
    -- accepts gets a new pattern on each call.
    function utf.count(str,what)
        if type(what) == "string" then
            local p = cache[what]
            if not p then
                p = Cs((P(what)/" " + p_nany)^0)
                -- The key has to be the string, otherwise the lookup above
                -- never succeeds and the cache grows with each call.
                cache[what] = p
            end
            return #lpegmatch(p,str)
        else
            return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
        end
    end

end
689
-- Make string.utfvalues available in the utf table too.
utf.values = string.utfvalues
691
-- Map a number onto a length by comparing it with a series of thresholds:
-- below 0x80 gives 1, below 0xE0 gives 2, below 0xF0 gives 3, below 0xF8 gives
-- 4, below 0xFC gives 5, below 0xFE gives 6 and anything else gives 0.
-- NOTE(review): the thresholds look like those of the first byte of a utf-8
-- sequence; confirm what callers pass.
function utf.chrlen(u)
    if u < 0x80 then
        return 1
    elseif u < 0xE0 then
        return 2
    elseif u < 0xF0 then
        return 3
    elseif u < 0xF8 then
        return 4
    elseif u < 0xFC then
        return 5
    elseif u < 0xFE then
        return 6
    else
        return 0
    end
end
701
702
703
704
705
706
707
708
709
710
711
-- Pad the string s up to n characters, where the length of s is measured in
-- utf characters.
--
-- s : the string to pad
-- n : the wanted width; a positive value puts the padding in front of s and a
--     negative value puts it after s; nil and zero leave s as it is
-- c : optional padding string that is repeated once for each missing
--     character (default: a space); this used to be an accidental lookup of a
--     global with that name
--
-- A string that already has the wanted width is returned unchanged.
function string.utfpadd(s,n,c)
    if n and n ~= 0 then
        local l = utflength(s)
        if n > 0 then
            local d = n - l
            if d > 0 then
                return rep(c or " ",d) .. s
            end
        else
            local d = - n - l
            if d > 0 then
                return s .. rep(c or " ",d)
            end
        end
    end
    return s
end
729
730
731
do

    -- Utf aware companions of lpeg.P, lpeg.S and lpeg.R.

    -- A string is matched as a whole anyway, so P can be used as it is.
    lpeg.UP = P

    -- Return a pattern that matches any of the utf characters in str.
    function lpeg.US(str)
        local p = P(false)
        for uc in utfcharacters(str) do
            p = p + P(uc)
        end
        return p
    end

    -- Gives the codes of the first two characters of a string or false when
    -- there are fewer than two.
    local range = p_utf8byte * p_utf8byte + Cc(false)

    -- Return a pattern for a range of utf characters.
    --
    -- str  : a string of which the first two characters are the boundaries, or
    --        a number that is the first code of the range
    -- more : the last code of the range when str is a number (default: str)
    --
    -- A string with fewer than two characters is matched literally.
    function lpeg.UR(str,more)
        local first, last
        if type(str) == "number" then
            first = str
            last  = more or first
        else
            first, last = lpegmatch(range,str)
            if not last then
                return P(str)
            end
        end
        if not utfchar then
            utfchar = utf.char
        end
        if first == last then
            if utfchar and type(str) == "number" then
                -- P(number) matches that many characters, so the code has to
                -- be turned into a string to match just that one character.
                return P(utfchar(first))
            end
            -- NOTE(review): for a string with two equal characters this matches
            -- both of them and not a single one; confirm that this is wanted.
            return P(str)
        end
        if utfchar and (last - first < 8) then
            -- A small range becomes a choice between the individual characters.
            local p = P(false)
            for i=first,last do
                p = p + P(utfchar(i))
            end
            return p
        else
            local f = function(b)
                return b >= first and b <= last
            end
            -- NOTE(review): the outcome of f becomes the capture value; a
            -- function capture does not make the match itself fail, so callers
            -- have to look at the captured boolean.
            return p_utf8byte / f
        end
    end

end
781 |