SourceBrowser

l-unicode.lua /size: 40 Kb last modification: 2025-02-21 11:03
1if not modules then modules = { } end modules ['l-unicode'] = {
2    version   = 1.001,
3    optimize  = true,
4    comment   = "companion to luat-lib.mkiv",
5    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
6    copyright = "PRAGMA ADE / ConTeXt Development Team",
7    license   = "see context related readme files"
8}
9
10-- floor(b/256)  => rshift(b, 8)
11-- floor(b/1024) => rshift(b,10)
12
13-- in lua 5.3:
14
15-- utf8.char(···)         : concatenated
16-- utf8.charpatt          : "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
17-- utf8.codes(s)          : for p, c in utf8.codes(s) do body end
18-- utf8.codepoint(s [, i [, j]])
19-- utf8.len(s [, i])
20-- utf8.offset(s, n [, i])
21
22-- todo: utf.sub replacement (used in syst-aux)
23-- we put these in the utf namespace:
24
25-- used     : byte char len lower sub upper
26-- not used : dump find format gmatch gfind gsub match rep reverse
27
28-- utf  = utf or (unicode and unicode.utf8) or { }
29
30-- not supported:
31--
32-- dump, find, format, gfind, gmatch, gsub, lower, match, rep, reverse, upper
33
34utf     = utf or { }
35unicode = nil
36
if not string.utfcharacters then

    -- Fallback iterator over the UTF-8 characters of a string, used when the
    -- host offers no built-in string.utfcharacters. The gmatch idiom comes from
    -- the Lua 5.2 book and is roughly two times slower than the built-in.
    --
    -- The pattern takes one lead byte plus every continuation byte (0x80-0xBF)
    -- that follows it; nothing is validated.

    local gmatch = string.gmatch

    local function utfcharacters(str)
        return gmatch(str,".[\128-\191]*")
    end

    -- This is the name that the guard above tests and that utf.characters is
    -- taken from, so this is the one that has to be provided here.

    string.utfcharacters = utfcharacters

    -- The fallback used to be installed under this name only; that assignment
    -- is kept so that code depending on it keeps working.

    string.characters = utfcharacters

end
50
51utf.characters = string.utfcharacters
52
53-- string.utfvalues
54-- string.utfcharacters
55-- string.characters
56-- string.characterpairs
57-- string.bytes
58-- string.bytepairs
59-- string.utflength
60-- string.utfvalues
61-- string.utfcharacters
62
63local type = type
64local char, byte, format, sub, gmatch = string.char, string.byte, string.format, string.sub, string.gmatch
65local concat = table.concat
66local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp
67
68local lpegmatch       = lpeg.match
69local patterns        = lpeg.patterns
70local tabletopattern  = lpeg.utfchartabletopattern
71
72local bytepairs       = string.bytepairs
73
74local finder          = lpeg.finder
75local replacer        = lpeg.replacer
76
77local p_utftype       = patterns.utftype
78local p_utfstricttype = patterns.utfstricttype
79local p_utfoffset     = patterns.utfoffset
80local p_utf8character = patterns.utf8character
81local p_utf8char      = patterns.utf8char
82local p_utf8byte      = patterns.utf8byte
83local p_utfbom        = patterns.utfbom
84local p_newline       = patterns.newline
85local p_whitespace    = patterns.whitespace
86
87-- if not unicode then
88--     unicode = { utf = utf } -- for a while
89-- end
90
if not utf.char then

    -- A native encoder wins when the host has one.

    utf.char = string.utfcharacter or (utf8 and utf8.char)

end

if not utf.char then

    -- Pure Lua encoder for a single code point (no multiples). Values of
    -- 0x200000 and up give an empty string. The two variants only differ in
    -- how the shifts are done: bit32.rshift when present, else floor division.

    local tochar = string.char

    if bit32 then

        local rshift = bit32.rshift

        utf.char = function(n)
            if n < 0x80 then
                -- 0aaaaaaa
                return tochar(n)
            end
            if n < 0x800 then
                -- 110bbbaa 10aaaaaa
                local b1 = 0xC0 + rshift(n,6)
                local b2 = 0x80 + (n % 0x40)
                return tochar(b1,b2)
            end
            if n < 0x10000 then
                -- 1110bbbb 10bbbbaa 10aaaaaa
                local b1 = 0xE0 + rshift(n,12)
                local b2 = 0x80 + (rshift(n,6) % 0x40)
                local b3 = 0x80 + (n % 0x40)
                return tochar(b1,b2,b3)
            end
            if n < 0x200000 then
                -- 11110ccc 10ccbbbb 10bbbbaa 10aaaaaa
                local b1 = 0xF0 +  rshift(n,18)
                local b2 = 0x80 + (rshift(n,12) % 0x40)
                local b3 = 0x80 + (rshift(n,6) % 0x40)
                local b4 = 0x80 + (n % 0x40)
                return tochar(b1,b2,b3,b4)
            end
            return ""
        end

    else

        local floor = math.floor

        utf.char = function(n)
            if n < 0x80 then
                return tochar(n)
            end
            if n < 0x800 then
                local b1 = 0xC0 + floor(n/0x40)
                local b2 = 0x80 + (n % 0x40)
                return tochar(b1,b2)
            end
            if n < 0x10000 then
                local b1 = 0xE0 + floor(n/0x1000)
                local b2 = 0x80 + (floor(n/0x40) % 0x40)
                local b3 = 0x80 + (n % 0x40)
                return tochar(b1,b2,b3)
            end
            if n < 0x200000 then
                local b1 = 0xF0 +  floor(n/0x40000)
                local b2 = 0x80 + (floor(n/0x1000) % 0x40)
                local b3 = 0x80 + (floor(n/0x40) % 0x40)
                local b4 = 0x80 + (n % 0x40)
                return tochar(b1,b2,b3,b4)
            end
            return ""
        end

    end

end
177
if not utf.byte then

    -- A native decoder wins when the host has one.

    utf.byte = string.utfvalue or (utf8 and utf8.codepoint)

end

if not utf.byte then

    -- Last resort: let the lpeg pattern work out the value.

    utf.byte = function(c)
        return lpegmatch(p_utf8byte,c)
    end

end
191
192local utfchar, utfbyte = utf.char, utf.byte
193
194-- As we want to get rid of the (unmaintained) utf library we implement our own
195-- variants (in due time an independent module):
196
-- Gives whatever patterns.utftype reports for the data and "unknown" when
-- there is no data or when the pattern comes back empty handed.

function utf.filetype(data)
    if data then
        local kind = lpegmatch(p_utftype,data)
        if kind then
            return kind
        end
    end
    return "unknown"
end
200
-- Rewrites every multi-byte character as a numeric character reference and
-- passes single byte characters on untouched. The value is written in
-- hexadecimal so the reference needs the "x" marker: without it "&#E9;" is
-- not a valid reference and something like "&#100;" reads as decimal 100.

local toentities = Cs (
    (
        patterns.utf8one
            + (
                patterns.utf8two
              + patterns.utf8three
              + patterns.utf8four
            ) / function(s)
                    local b = utfbyte(s)
                    if b < 127 then
                        return s
                    else
                        return format("&#x%X;",b)
                    end
                end
    )^0
)

patterns.toentities = toentities

-- Applies the pattern above to str and returns the converted string.

function utf.toentities(str)
    return lpegmatch(toentities,str)
end
217
218-- local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin)
219--
220-- setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } )
221--
222-- collectgarbage("collect")
223-- local u = collectgarbage("count")*1024
224-- local t = os.clock()
225-- for i=1,1000 do
226--     for i=1,600 do
227--         local a = utfchr[i]
228--     end
229-- end
230-- print(os.clock()-t,collectgarbage("count")*1024-u)
231
232-- collectgarbage("collect")
233-- local t = os.clock()
234-- for i=1,1000 do
235--     for i=1,600 do
236--         local a = utfchar(i)
237--     end
238-- end
239-- print(os.clock()-t,collectgarbage("count")*1024-u)
240
241-- local byte = string.byte
242-- local utfchar = utf.char
243
-- Conversion of utf-16 data that starts with a byte order mark into utf-8.
--
-- A surrogate pair is a code unit in 0xD800-0xDBFF followed by one in
-- 0xDC00-0xDFFF, so it can be recognized by the most significant byte of each
-- unit: 0xD8-0xDB and 0xDC-0xDF. lpeg.R wants one two-character string per
-- range, which is why the ranges are spelled out as raw bytes here (feeding it
-- utf encoded characters gives empty ranges that never match).

local one  = P(1)
local two  = C(1) * C(1)
local four -- big endian surrogate pair
local pattern

do

    local highlead = R("\216\219") -- 0xD8-0xDB
    local lowlead  = R("\220\223") -- 0xDC-0xDF

    -- most and least significant byte (as strings) to a 16 bit code unit
    local function unit(msb,lsb)
        return 0x100 * byte(msb) + byte(lsb)
    end

    -- high and low surrogate to the character they encode
    local function pair(high,low)
        return utfchar((high-0xD800)*0x400 + (low-0xDC00) + 0x10000)
    end

    -- in big endian data the most significant byte comes first, in little
    -- endian data it comes second

                four    = C(highlead) * C(1) * C(lowlead) * C(1)
    local       four_le = C(1) * C(highlead) * C(1) * C(lowlead)

    pattern = P("\254\255") * Cs( (
                    four    / function(a,b,c,d)
                                  return pair(unit(a,b),unit(c,d))
                              end
                  + two     / function(a,b)
                                  return utfchar(unit(a,b))
                              end
                  + one -- a dangling byte is passed as is
                )^1 )
            + P("\255\254") * Cs( (
                    four_le / function(b,a,d,c)
                                  return pair(unit(a,b),unit(c,d))
                              end
                  + two     / function(b,a)
                                  return utfchar(unit(a,b))
                              end
                  + one -- a dangling byte is passed as is
                )^1 )

end

-- Returns s converted to utf-8 when it starts with a utf-16 byte order mark
-- (which is dropped) and s itself otherwise.

function string.toutf(s) -- in string namespace
    return lpegmatch(pattern,s) or s -- todo: utf32
end
274
-- Copies every well formed utf-8 sequence and substitutes "�" for each byte
-- that does not start one.

local validatedutf

do

    local wellformed = patterns.utf8one
                     + patterns.utf8two
                     + patterns.utf8three
                     + patterns.utf8four

    local repaired   = P(1) / "�"

    validatedutf = Cs((wellformed + repaired)^0)

end

patterns.validatedutf = validatedutf

-- Gives false for anything that is not a string; a string is run through the
-- pattern above and the outcome of that match is returned.

function utf.is_valid(str)
    if type(str) ~= "string" then
        return false
    end
    return lpegmatch(validatedutf,str) or false
end
290
291if not utf.len then
292
293    utf.len = string.utflength or (utf8 and utf8.len)
294
295    if not utf.len then
296
297        -- -- alternative 1: 0.77
298        --
299        -- local utfcharcounter = utfbom^-1 * Cs((p_utf8character/'!')^0)
300        --
301        -- function utf.len(str)
302        --     return #lpegmatch(utfcharcounter,str or "")
303        -- end
304        --
305        -- -- alternative 2: 1.70
306        --
307        -- local n = 0
308        --
309        -- local utfcharcounter = utfbom^-1 * (p_utf8character/function() n = n + 1 end)^0 -- slow
310        --
311        -- function utf.length(str)
312        --     n = 0
313        --     lpegmatch(utfcharcounter,str or "")
314        --     return n
315        -- end
316        --
317        -- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047)
318
319        -- local n = 0
320        --
321        -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( Cp() * (
322        -- --     patterns.utf8one  ^1 * Cc(1)
323        -- --   + patterns.utf8two  ^1 * Cc(2)
324        -- --   + patterns.utf8three^1 * Cc(3)
325        -- --   + patterns.utf8four ^1 * Cc(4) ) * Cp() / function(f,d,t) n = n + (t - f)/d end
326        -- --  )^0 ) -- just as many captures as below
327        --
328        -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( (
329        -- --     (Cmt(patterns.utf8one  ^1,function(_,_,s) n = n + #s   return true end))
330        -- --   + (Cmt(patterns.utf8two  ^1,function(_,_,s) n = n + #s/2 return true end))
331        -- --   + (Cmt(patterns.utf8three^1,function(_,_,s) n = n + #s/3 return true end))
332        -- --   + (Cmt(patterns.utf8four ^1,function(_,_,s) n = n + #s/4 return true end))
333        -- -- )^0 ) -- not interesting as it creates strings but sometimes faster
334        --
335        -- -- The best so far:
336        --
337        -- local utfcharcounter = utfbom^-1 * P ( (
338        --     Cp() * (patterns.utf8one  )^1 * Cp() / function(f,t) n = n +  t - f    end
339        --   + Cp() * (patterns.utf8two  )^1 * Cp() / function(f,t) n = n + (t - f)/2 end
340        --   + Cp() * (patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end
341        --   + Cp() * (patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end
342        -- )^0 )
343
344        -- function utf.len(str)
345        --     n = 0
346        --     lpegmatch(utfcharcounter,str or "")
347        --     return n
348        -- end
349
350        local n, f = 0, 1
351
352        local utfcharcounter = patterns.utfbom^-1 * Cmt (
353            Cc(1) * patterns.utf8one  ^1
354          + Cc(2) * patterns.utf8two  ^1
355          + Cc(3) * patterns.utf8three^1
356          + Cc(4) * patterns.utf8four ^1,
357            function(_,t,d) -- due to Cc no string captures, so faster
358                n = n + (t - f)/d
359                f = t
360                return true
361            end
362        )^0
363
364        function utf.len(str)
365            n, f = 0, 1
366            lpegmatch(utfcharcounter,str or "")
367            return n
368        end
369
370        -- -- these are quite a bit slower:
371
372        -- utfcharcounter = utfbom^-1 * (Cmt(P(1) * R("\128\191")^0, function() n = n + 1 return true end))^0 -- 50+ times slower
373        -- utfcharcounter = utfbom^-1 * (Cmt(P(1), function() n = n + 1 return true end) * R("\128\191")^0)^0 -- 50- times slower
374
375    end
376
377end
378
379utf.length = utf.len
380
381if not utf.sub then
382
383    -- inefficient as lpeg just copies ^n
384
385    -- local function sub(str,start,stop)
386    --     local pattern = p_utf8character^-(start-1) * C(p_utf8character^-(stop-start+1))
387    --     inspect(pattern)
388    --     return lpegmatch(pattern,str) or ""
389    -- end
390
391    -- local b, e, n, first, last = 0, 0, 0, 0, 0
392    --
393    -- local function slide(s,p)
394    --     n = n + 1
395    --     if n == first then
396    --         b = p
397    --         if not last then
398    --             return nil
399    --         end
400    --     end
401    --     if n == last then
402    --         e = p
403    --         return nil
404    --     else
405    --         return p
406    --     end
407    -- end
408    --
409    -- local pattern = Cmt(p_utf8character,slide)^0
410    --
411    -- function utf.sub(str,start,stop) -- todo: from the end
412    --     if not start then
413    --         return str
414    --     end
415    --     b, e, n, first, last = 0, 0, 0, start, stop
416    --     lpegmatch(pattern,str)
417    --     if not stop then
418    --         return sub(str,b)
419    --     else
420    --         return sub(str,b,e-1)
421    --     end
422    -- end
423
424    -- print(utf.sub("Hans Hagen is my name"))
425    -- print(utf.sub("Hans Hagen is my name",5))
426    -- print(utf.sub("Hans Hagen is my name",5,10))
427
428    local utflength = utf.length
429
430    -- also negative indices, upto 10 times slower than a c variant
431
432    local b, e, n, first, last = 0, 0, 0, 0, 0
433
434    local function slide_zero(s,p)
435        n = n + 1
436        if n >= last then
437            e = p - 1
438        else
439            return p
440        end
441    end
442
443    local function slide_one(s,p)
444        n = n + 1
445        if n == first then
446            b = p
447        end
448        if n >= last then
449            e = p - 1
450        else
451            return p
452        end
453    end
454
455    local function slide_two(s,p)
456        n = n + 1
457        if n == first then
458            b = p
459        else
460            return true
461        end
462    end
463
464    local pattern_zero  = Cmt(p_utf8character,slide_zero)^0
465    local pattern_one   = Cmt(p_utf8character,slide_one )^0
466    local pattern_two   = Cmt(p_utf8character,slide_two )^0
467
468    local pattern_first = C(p_utf8character)
469
470    function utf.sub(str,start,stop)
471        if not start then
472            return str
473        end
474        if start == 0 then
475            start = 1
476        end
477        if not stop then
478            if start < 0 then
479                local l = utflength(str) -- we can inline this function if needed
480                start = l + start
481            else
482                start = start - 1
483            end
484            b, n, first = 0, 0, start
485            lpegmatch(pattern_two,str)
486            if n >= first then
487                return sub(str,b)
488            else
489                return ""
490            end
491        end
492        if start < 0 or stop < 0 then
493            local l = utf.length(str)
494            if start < 0 then
495                start = l + start
496                if start <= 0 then
497                    start = 1
498                else
499                    start = start + 1
500                end
501            end
502            if stop < 0 then
503                stop = l + stop
504                if stop == 0 then
505                    stop = 1
506                else
507                    stop = stop + 1
508                end
509            end
510        end
511        if start == 1 and stop == 1 then
512            return lpegmatch(pattern_first,str) or ""
513        elseif start > stop then
514            return ""
515        elseif start > 1 then
516            b, e, n, first, last = 0, 0, 0, start - 1, stop
517            lpegmatch(pattern_one,str)
518            if n >= first and e == 0 then
519                e = #str
520            end
521            return sub(str,b,e)
522        else
523            b, e, n, last = 1, 0, 0, stop
524            lpegmatch(pattern_zero,str)
525            if e == 0 then
526                e = #str
527            end
528            return sub(str,b,e)
529        end
530    end
531
532    -- local n = 100000
533    -- local str = string.rep("123456àáâãäå",100)
534    --
535    -- for i=-15,15,1 do
536    --     for j=-15,15,1 do
537    --         if utf.xsub(str,i,j) ~= utf.sub(str,i,j) then
538    --             print("error",i,j,"l>"..utf.xsub(str,i,j),"s>"..utf.sub(str,i,j))
539    --         end
540    --     end
541    --     if utf.xsub(str,i) ~= utf.sub(str,i) then
542    --         print("error",i,"l>"..utf.xsub(str,i),"s>"..utf.sub(str,i))
543    --     end
544    -- end
545
546    -- print(" 1, 7",utf.xsub(str, 1, 7),utf.sub(str, 1, 7))
547    -- print(" 0, 7",utf.xsub(str, 0, 7),utf.sub(str, 0, 7))
548    -- print(" 0, 9",utf.xsub(str, 0, 9),utf.sub(str, 0, 9))
549    -- print(" 4   ",utf.xsub(str, 4   ),utf.sub(str, 4   ))
550    -- print(" 0   ",utf.xsub(str, 0   ),utf.sub(str, 0   ))
551    -- print(" 0, 0",utf.xsub(str, 0, 0),utf.sub(str, 0, 0))
552    -- print(" 4, 4",utf.xsub(str, 4, 4),utf.sub(str, 4, 4))
553    -- print(" 4, 0",utf.xsub(str, 4, 0),utf.sub(str, 4, 0))
554    -- print("-3, 0",utf.xsub(str,-3, 0),utf.sub(str,-3, 0))
555    -- print(" 0,-3",utf.xsub(str, 0,-3),utf.sub(str, 0,-3))
556    -- print(" 5,-3",utf.xsub(str,-5,-3),utf.sub(str,-5,-3))
557    -- print("-3   ",utf.xsub(str,-3   ),utf.sub(str,-3   ))
558
559end
560
561-- a replacement for simple gsubs:
562
563-- function utf.remapper(mapping)
564--     local pattern = Cs((p_utf8character/mapping)^0)
565--     return function(str)
566--         if not str or str == "" then
567--             return ""
568--         else
569--             return lpegmatch(pattern,str)
570--         end
571--     end, pattern
572-- end
573
-- Makes a converter that replaces characters according to mapping.
--
--   mapping : a table (its keys are turned into a pattern) or a function
--             that is applied to every character
--   option  : "dynamic" (table only) rebuilds the pattern after a new key
--             is added, "pattern" returns the pattern instead of a function,
--             anything else gives the static variant
--   action  : what the matches of a table are fed into, by default the
--             mapping itself
--
-- The static variants return the converter and the pattern; a converter
-- turns nil and "" into "".

function utf.remapper(mapping,option,action) -- static also returns a pattern

    -- wraps a finished pattern into a converter
    local function converter(pattern)
        return function(str)
            if not str or str == "" then
                return ""
            else
                return lpegmatch(pattern,str)
            end
        end
    end

    local variant = type(mapping)

    if variant == "function" then
        local pattern = Cs((p_utf8character/mapping + p_utf8character)^0)
        if option == "pattern" then
            return pattern
        end
        return converter(pattern), pattern
    end

    if variant ~= "table" then
        -- is actually an error
        return function(str)
            return str or ""
        end
    end

    action = action or mapping

    if option == "dynamic" then
        local pattern = false
        -- a new key makes the cached pattern outdated
        table.setmetatablenewindex(mapping,function(t,k,v) rawset(t,k,v) pattern = false end)
        return function(str)
            if not str or str == "" then
                return ""
            end
            if not pattern then
                pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0)
            end
            return lpegmatch(pattern,str)
        end
    end

    local pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0)
    if option == "pattern" then
        return pattern
    end
    -- this also covers option == "static"
    return converter(pattern), pattern
end
624
625-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }
626-- print(remap("abcd 1234 abcd"))
627
-- Returns a function that runs a string through the lpeg replacer made for
-- t; there is no check beforehand so the result is always built.

function utf.replacer(t) -- no precheck, always string builder
    local builder = replacer(t,false,false,true)
    local function replace(str)
        return lpegmatch(builder,str)
    end
    return replace
end
634
-- Returns a function that only runs the replacer made for t when the finder
-- made for t reports a hit inside the string; otherwise the string is handed
-- back untouched. (The name is spelled this way in the public interface.)

function utf.subtituter(t) -- with precheck and no building if no match
    local locate  = finder  (t)
    local rebuild = replacer(t,false,false,true)
    return function(str)
        local hit = lpegmatch(locate,str)
        if hit and hit <= #str then
         -- return sub(str,1,hit-2) .. lpegmatch(rebuild,str,hit-1) -- slower
            return lpegmatch(rebuild,str)
        end
        return str
    end
end
650
651-- inspect(utf.split("a b c d"))
652-- inspect(utf.split("a b c d",true))
653
-- Splitters. All but the raw one skip an optional bom first.
--
--   utflinesplitter     : splits at newlines
--   utfcharsplitter_ows : a table with every character, whitespace included
--   utfcharsplitter_iws : a table with every character, whitespace skipped
--   utfcharsplitter_raw : a table with every character, bom not skipped

local utflinesplitter     = p_utfbom^-1 * lpeg.tsplitat(p_newline)
local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8character)^0)
local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8character))^0)
local utfcharsplitter_raw = Ct(C(p_utf8character)^0)

patterns.utflinesplitter  = utflinesplitter

-- Splits str (nil counts as "") into lines.

function utf.splitlines(str)
    return lpegmatch(utflinesplitter,str or "")
end

-- Splits str (nil counts as "") into a table of characters; whitespace is
-- left out when ignorewhitespace is set.

function utf.split(str,ignorewhitespace) -- new
    local splitter = ignorewhitespace and utfcharsplitter_iws or utfcharsplitter_ows
    return lpegmatch(splitter,str or "")
end

-- Splits str into a table of characters without skipping a bom. Like its
-- siblings above it now takes nil as "" instead of raising an error.

function utf.totable(str) -- keeps bom
    return lpegmatch(utfcharsplitter_raw,str or "")
end
676
677-- 0  EF BB BF      UTF-8
678-- 1  FF FE         UTF-16-little-endian
679-- 2  FE FF         UTF-16-big-endian
680-- 3  FF FE 00 00   UTF-32-little-endian
681-- 4  00 00 FE FF   UTF-32-big-endian
682--
683-- \000 fails in <= 5.0 but is valid in >=5.1 where %z is deprecated
684
685-- utf.name = {
686--     [0] = 'utf-8',
687--     [1] = 'utf-16-le',
688--     [2] = 'utf-16-be',
689--     [3] = 'utf-32-le',
690--     [4] = 'utf-32-be'
691-- }
692--
693-- function utf.magic(f)
694--     local str = f:read(4)
695--     if not str then
696--         f:seek('set')
697--         return 0
698--  -- elseif find(str,"^%z%z\254\255") then            -- depricated
699--  -- elseif find(str,"^\000\000\254\255") then        -- not permitted and bugged
700--     elseif find(str,"\000\000\254\255",1,true) then  -- seems to work okay (TH)
701--         return 4
702--  -- elseif find(str,"^\255\254%z%z") then            -- depricated
703--  -- elseif find(str,"^\255\254\000\000") then        -- not permitted and bugged
704--     elseif find(str,"\255\254\000\000",1,true) then  -- seems to work okay (TH)
705--         return 3
706--     elseif find(str,"^\254\255") then
707--         f:seek('set',2)
708--         return 2
709--     elseif find(str,"^\255\254") then
710--         f:seek('set',2)
711--         return 1
712--     elseif find(str,"^\239\187\191") then
713--         f:seek('set',3)
714--         return 0
715--     else
716--         f:seek('set')
717--         return 0
718--     end
719-- end
720
-- Reads up to four bytes from the file handle f, puts the file position at
-- the offset that patterns.utfoffset reports for them (unless that offset is
-- 4, which is where a full read already ends) and returns what
-- patterns.utftype makes of those bytes.

function utf.magic(f) -- not used
    local head   = f:read(4) or ""
    local offset = lpegmatch(p_utfoffset,head)
    if offset < 4 then
        f:seek('set',offset)
    end
    return lpegmatch(p_utftype,head)
end
729
730local utf16_to_utf8_be, utf16_to_utf8_le
731local utf32_to_utf8_be, utf32_to_utf8_le
732
733local utf_16_be_getbom = patterns.utfbom_16_be^-1
734local utf_16_le_getbom = patterns.utfbom_16_le^-1
735local utf_32_be_getbom = patterns.utfbom_32_be^-1
736local utf_32_le_getbom = patterns.utfbom_32_le^-1
737
738local utf_16_be_linesplitter = utf_16_be_getbom * lpeg.tsplitat(patterns.utf_16_be_nl)
739local utf_16_le_linesplitter = utf_16_le_getbom * lpeg.tsplitat(patterns.utf_16_le_nl)
740local utf_32_be_linesplitter = utf_32_be_getbom * lpeg.tsplitat(patterns.utf_32_be_nl)
741local utf_32_le_linesplitter = utf_32_le_getbom * lpeg.tsplitat(patterns.utf_32_le_nl)
742
743-- we have four possibilities: bytepairs (using tables), gmatch (using tables), gsub and
744-- lpeg. Bytepairs are the fastest, but as soon as we need to remove boms and so on the
745-- gain is less due to more testing. Also, we seldom have to convert utf16 so we don't care
746-- too much about a few milliseconds more runtime. The lpeg variant is up to 20% slower but
747-- still pretty fast.
748--
749-- for historic reasons we keep the bytepairs variants around .. beware they don't grab the
750-- bom like the lpegs do so they're not drop-ins in the functions that follow
751--
752-- utf16_to_utf8_be = function(s)
753--     if not s then
754--         return nil
755--     elseif s == "" then
756--         return ""
757--     end
758--     local result, r, more = { }, 0, 0
759--     for left, right in bytepairs(s) do
760--         if right then
761--             local now = 256*left + right
762--             if more > 0 then
763--                 now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
764--                 more = 0
765--                 r = r + 1
766--                 result[r] = utfchar(now)
767--             elseif now >= 0xD800 and now <= 0xDBFF then
768--                 more = now
769--             else
770--                 r = r + 1
771--                 result[r] = utfchar(now)
772--             end
773--         end
774--     end
775--     return concat(result)
776-- end
777--
778-- local utf16_to_utf8_be_t = function(t)
779--     if not t then
780--         return nil
781--     elseif type(t) == "string" then
782--         t = lpegmatch(utf_16_be_linesplitter,t)
783--     end
784--     local result = { } -- we reuse result
785--     for i=1,#t do
786--         local s = t[i]
787--         if s ~= "" then
788--             local r, more = 0, 0
789--             for left, right in bytepairs(s) do
790--                 if right then
791--                     local now = 256*left + right
792--                     if more > 0 then
793--                         now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
794--                         more = 0
795--                         r = r + 1
796--                         result[r] = utfchar(now)
797--                     elseif now >= 0xD800 and now <= 0xDBFF then
798--                         more = now
799--                     else
800--                         r = r + 1
801--                         result[r] = utfchar(now)
802--                     end
803--                 end
804--             end
805--             t[i] = concat(result,"",1,r) -- we reused tmp, hence t
806--         end
807--     end
808--     return t
809-- end
810--
811-- utf16_to_utf8_le = function(s)
812--     if not s then
813--         return nil
814--     elseif s == "" then
815--         return ""
816--     end
817--     local result, r, more = { }, 0, 0
818--     for left, right in bytepairs(s) do
819--         if right then
820--             local now = 256*right + left
821--             if more > 0 then
822--                 now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
823--                 more = 0
824--                 r = r + 1
825--                 result[r] = utfchar(now)
826--             elseif now >= 0xD800 and now <= 0xDBFF then
827--                 more = now
828--             else
829--                 r = r + 1
830--                 result[r] = utfchar(now)
831--             end
832--         end
833--     end
834--     return concat(result)
835-- end
836--
837-- local utf16_to_utf8_le_t = function(t)
838--     if not t then
839--         return nil
840--     elseif type(t) == "string" then
841--         t = lpegmatch(utf_16_le_linesplitter,t)
842--     end
843--     local result = { } -- we reuse result
844--     for i=1,#t do
845--         local s = t[i]
846--         if s ~= "" then
847--             local r, more = 0, 0
848--             for left, right in bytepairs(s) do
849--                 if right then
850--                     local now = 256*right + left
851--                     if more > 0 then
852--                         now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
853--                         more = 0
854--                         r = r + 1
855--                         result[r] = utfchar(now)
856--                     elseif now >= 0xD800 and now <= 0xDBFF then
857--                         more = now
858--                     else
859--                         r = r + 1
860--                         result[r] = utfchar(now)
861--                     end
862--                 end
863--             end
864--             t[i] = concat(result,"",1,r) -- we reused tmp, hence t
865--         end
866--     end
867--     return t
868-- end
869--
870-- local utf32_to_utf8_be_t = function(t)
871--     if not t then
872--         return nil
873--     elseif type(t) == "string" then
874--         t = lpegmatch(utflinesplitter,t)
875--     end
876--     local result = { } -- we reuse result
877--     for i=1,#t do
878--         local r, more = 0, -1
879--         for a,b in bytepairs(t[i]) do
880--             if a and b then
881--                 if more < 0 then
882--                     more = 256*256*256*a + 256*256*b
883--                 else
884--                     r = r + 1
885--                     result[t] = utfchar(more + 256*a + b)
886--                     more = -1
887--                 end
888--             else
889--                 break
890--             end
891--         end
892--         t[i] = concat(result,"",1,r)
893--     end
894--     return t
895-- end
896--
897-- local utf32_to_utf8_le_t = function(t)
898--     if not t then
899--         return nil
900--     elseif type(t) == "string" then
901--         t = lpegmatch(utflinesplitter,t)
902--     end
903--     local result = { } -- we reuse result
904--     for i=1,#t do
905--         local r, more = 0, -1
906--         for a,b in bytepairs(t[i]) do
907--             if a and b then
908--                 if more < 0 then
909--                     more = 256*b + a
910--                 else
911--                     r = r + 1
912--                     result[t] = utfchar(more + 256*256*256*b + 256*256*a)
913--                     more = -1
914--                 end
915--             else
916--                 break
917--             end
918--         end
919--         t[i] = concat(result,"",1,r)
920--     end
921--     return t
922-- end
923
924local more = 0
925
926local p_utf16_to_utf8_be = C(1) * C(1) /function(left,right)
927    local now = 256*byte(left) + byte(right)
928    if more > 0 then
929        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
930        more = 0
931        return utfchar(now)
932    elseif now >= 0xD800 and now <= 0xDBFF then
933        more = now
934        return "" -- else the c's end up in the stream
935    else
936        return utfchar(now)
937    end
938end
939
-- Little endian UTF-16: two bytes make one code unit, least significant byte
-- first. Surrogate handling goes through the shared 'more' variable: a high
-- surrogate is stored and yields an empty string, the next unit completes it.

local p_utf16_to_utf8_le = (C(1) * C(1)) / function(lo,hi)
    local unit = byte(hi)*256 + byte(lo)
    if more > 0 then
        local code = 0x10000 + (more - 0xD800)*0x400 + (unit - 0xDC00)
        more = 0
        return utfchar(code)
    end
    if unit >= 0xD800 and unit <= 0xDBFF then
        more = unit
        return "" -- else the c's end up in the stream
    end
    return utfchar(unit)
end
-- Big endian UTF-32: four bytes make one code point, most significant byte first.

local p_utf32_to_utf8_be = (C(1) * C(1) * C(1) * C(1)) / function(b1,b2,b3,b4)
    local code = ((byte(b1)*256 + byte(b2))*256 + byte(b3))*256 + byte(b4)
    return utfchar(code)
end
956
-- Little endian UTF-32: four bytes make one code point, least significant byte first.

local p_utf32_to_utf8_le = (C(1) * C(1) * C(1) * C(1)) / function(b1,b2,b3,b4)
    local code = ((byte(b4)*256 + byte(b3))*256 + byte(b2))*256 + byte(b1)
    return utfchar(code)
end
960
-- Turn the per-unit decoders into complete converters. Each converter begins
-- with an empty match whose function capture clears the surrogate state, then
-- applies the matching *_getbom pattern and finally gathers all converted units
-- in one substitution capture.

do

    local start = P(true) / function() more = 0 end

    p_utf16_to_utf8_be = start * utf_16_be_getbom * Cs(p_utf16_to_utf8_be^0)
    p_utf16_to_utf8_le = start * utf_16_le_getbom * Cs(p_utf16_to_utf8_le^0)
    p_utf32_to_utf8_be = start * utf_32_be_getbom * Cs(p_utf32_to_utf8_be^0)
    p_utf32_to_utf8_le = start * utf_32_le_getbom * Cs(p_utf32_to_utf8_le^0)

end

-- The same converters are made available in the patterns table.

patterns.utf16_to_utf8_be = p_utf16_to_utf8_be
patterns.utf32_to_utf8_be = p_utf32_to_utf8_be
patterns.utf16_to_utf8_le = p_utf16_to_utf8_le
patterns.utf32_to_utf8_le = p_utf32_to_utf8_le
970
-- Converts a big endian UTF-16 string to UTF-8. A nil, false or empty argument
-- is handed back untouched.

utf16_to_utf8_be = function(s)
    if not s or s == "" then
        return s
    end
    return lpegmatch(p_utf16_to_utf8_be,s)
end
978
-- Converts big endian UTF-16 lines to UTF-8. A string is first split with
-- utf_16_be_linesplitter; the resulting (or given) table is converted entry by
-- entry in place, where empty entries are left alone. Returns that table, or
-- nil when there is no input.

local function utf16_to_utf8_be_t(t)
    if not t then
        return nil
    end
    local lines = t
    if type(lines) == "string" then
        lines = lpegmatch(utf_16_be_linesplitter,lines)
    end
    for index=1,#lines do
        local line = lines[index]
        if line ~= "" then
            lines[index] = lpegmatch(p_utf16_to_utf8_be,line)
        end
    end
    return lines
end
993
-- Converts a little endian UTF-16 string to UTF-8. A nil, false or empty
-- argument is handed back untouched.

utf16_to_utf8_le = function(s)
    if not s or s == "" then
        return s
    end
    return lpegmatch(p_utf16_to_utf8_le,s)
end
1001
-- Converts little endian UTF-16 lines to UTF-8. A string is first split with
-- utf_16_le_linesplitter; the resulting (or given) table is converted entry by
-- entry in place, where empty entries are left alone. Returns that table, or
-- nil when there is no input.

local function utf16_to_utf8_le_t(t)
    if not t then
        return nil
    end
    local lines = t
    if type(lines) == "string" then
        lines = lpegmatch(utf_16_le_linesplitter,lines)
    end
    for index=1,#lines do
        local line = lines[index]
        if line ~= "" then
            lines[index] = lpegmatch(p_utf16_to_utf8_le,line)
        end
    end
    return lines
end
1016
-- Converts a big endian UTF-32 string to UTF-8. A nil, false or empty argument
-- is handed back untouched.

utf32_to_utf8_be = function(s)
    if not s or s == "" then
        return s
    end
    return lpegmatch(p_utf32_to_utf8_be,s)
end
1024
-- Converts big endian UTF-32 lines to UTF-8. A string is first split with
-- utf_32_be_linesplitter; the resulting (or given) table is converted entry by
-- entry in place, where empty entries are left alone. Returns that table, or
-- nil when there is no input.

local function utf32_to_utf8_be_t(t)
    if not t then
        return nil
    end
    local lines = t
    if type(lines) == "string" then
        lines = lpegmatch(utf_32_be_linesplitter,lines)
    end
    for index=1,#lines do
        local line = lines[index]
        if line ~= "" then
            lines[index] = lpegmatch(p_utf32_to_utf8_be,line)
        end
    end
    return lines
end
1039
-- Converts a little endian UTF-32 string to UTF-8. A nil, false or empty
-- argument is handed back untouched.

utf32_to_utf8_le = function(s)
    if not s or s == "" then
        return s
    end
    return lpegmatch(p_utf32_to_utf8_le,s)
end
1047
-- Converts little endian UTF-32 lines to UTF-8. A string is first split with
-- utf_32_le_linesplitter; the resulting (or given) table is converted entry by
-- entry in place, where empty entries are left alone. Returns that table, or
-- nil when there is no input.

local function utf32_to_utf8_le_t(t)
    if not t then
        return nil
    end
    local lines = t
    if type(lines) == "string" then
        lines = lpegmatch(utf_32_le_linesplitter,lines)
    end
    for index=1,#lines do
        local line = lines[index]
        if line ~= "" then
            lines[index] = lpegmatch(p_utf32_to_utf8_le,line)
        end
    end
    return lines
end
1062
-- Publish the converters in the utf namespace: first the string variants, then
-- the ones that work on (a table of) lines.

utf.utf16_to_utf8_be   = utf16_to_utf8_be
utf.utf16_to_utf8_le   = utf16_to_utf8_le
utf.utf32_to_utf8_be   = utf32_to_utf8_be
utf.utf32_to_utf8_le   = utf32_to_utf8_le

utf.utf16_to_utf8_be_t = utf16_to_utf8_be_t
utf.utf16_to_utf8_le_t = utf16_to_utf8_le_t
utf.utf32_to_utf8_be_t = utf32_to_utf8_be_t
utf.utf32_to_utf8_le_t = utf32_to_utf8_le_t
1072
-- Splits a UTF-8 string with utflinesplitter and returns the result. Anything
-- that is not a string (or a string for which the split gives nothing) is
-- returned as it came in.

function utf.utf8_to_utf8_t(t)
    if type(t) == "string" then
        local lines = lpegmatch(utflinesplitter,t)
        if lines then
            return lines
        end
    end
    return t
end
1076
-- Converts UTF-16 lines to UTF-8. A true 'endian' selects the big endian
-- converter; when that is not requested (or gives nothing) the little endian
-- converter is used. If neither produces a result the argument itself is
-- returned.

function utf.utf16_to_utf8_t(t,endian)
    local result
    if endian then
        result = utf16_to_utf8_be_t(t)
    end
    if not result then
        result = utf16_to_utf8_le_t(t)
    end
    return result or t
end
1080
-- Converts UTF-32 lines to UTF-8. A true 'endian' selects the big endian
-- converter; when that is not requested (or gives nothing) the little endian
-- converter is used. If neither produces a result the argument itself is
-- returned.

function utf.utf32_to_utf8_t(t,endian)
    local result
    if endian then
        result = utf32_to_utf8_be_t(t)
    end
    if not result then
        result = utf32_to_utf8_le_t(t)
    end
    return result or t
end
1084
if bit32 then

    -- UTF-8 to UTF-16 conversion; only set up when the bit32 library is there.

    local rshift = bit32.rshift

    -- Splits a code point beyond 0xFFFF into its two UTF-16 surrogate units.

    local function surrogates(code)
        local rest = code - 0x10000
        return rshift(rest,10) + 0xD800, rest%1024 + 0xDC00
    end

    -- One code point as UTF-16 bytes, low byte of every unit first.

    local function little(b)
        if b < 0x10000 then
            return char(b%256,rshift(b,8))
        end
        local high, low = surrogates(b)
        return char(high%256,rshift(high,8),low%256,rshift(low,8))
    end

    -- One code point as UTF-16 bytes, high byte of every unit first.

    local function big(b)
        if b < 0x10000 then
            return char(rshift(b,8),b%256)
        end
        local high, low = surrogates(b)
        return char(rshift(high,8),high%256,rshift(low,8),low%256)
    end

    -- Whatever p_utf8byte does not accept is dropped from the result.

    local skip    = P(1) / ""
    local l_remap = Cs(((p_utf8byte / little) + skip)^0)
    local b_remap = Cs(((p_utf8byte / big   ) + skip)^0)

    -- Big endian result, preceded by the bytes 254 255 unless nobom is set.

    local function utf8_to_utf16_be(str,nobom)
        local converted = lpegmatch(b_remap,str)
        if nobom then
            return converted
        end
        return char(254,255) .. converted
    end

    -- Little endian result, preceded by the bytes 255 254 unless nobom is set.

    local function utf8_to_utf16_le(str,nobom)
        local converted = lpegmatch(l_remap,str)
        if nobom then
            return converted
        end
        return char(255,254) .. converted
    end

    utf.utf8_to_utf16_be = utf8_to_utf16_be
    utf.utf8_to_utf16_le = utf8_to_utf16_le

    -- Big endian is the default; a true littleendian selects the other variant.

    function utf.utf8_to_utf16(str,littleendian,nobom)
        local convert = littleendian and utf8_to_utf16_le or utf8_to_utf16_be
        return convert(str,nobom)
    end

end
1142
-- The first character becomes "0x" plus at least four uppercase hex digits;
-- every following one gets the same treatment but is preceded by the separator
-- that is passed as extra argument to the match.

local pattern = Cs (
    (p_utf8byte / function(code)
        return format("0x%04X",code)
    end) *
    ((p_utf8byte * Carg(1)) / function(code,sep)
        return format("%s0x%04X",sep,code)
    end)^0
)

-- Returns the characters of str as hexadecimal codes; the separator defaults
-- to a space.

function utf.tocodes(str,separator)
    if not separator then
        separator = " "
    end
    return lpegmatch(pattern,str,1,separator)
end
1151
-- Formats a code point as "U+" followed by at least five uppercase hex digits.
-- A number is used as is, anything else goes through utfbyte first.

function utf.ustring(s)
    local code = s
    if type(s) ~= "number" then
        code = utfbyte(s)
    end
    return format("U+%05X",code)
end
1155
-- Formats a code point as "0x" followed by at least five uppercase hex digits.
-- A number is used as is, anything else goes through utfbyte first.

function utf.xstring(s)
    local code = s
    if type(s) ~= "number" then
        code = utfbyte(s)
    end
    return format("0x%05X",code)
end
1159
-- Returns str as UTF-8, based on the type that p_utfstricttype reports for it.
-- Returns nil for a nil, false or empty argument. Input that is not recognized
-- is returned unchanged.
--
-- The UTF-32 cases use the UTF-32 converters of this file in the same way as
-- the UTF-16 cases; before, such input fell through and was returned
-- unconverted. NOTE(review): these branches only trigger when p_utfstricttype
-- (defined elsewhere) reports "utf-32-be" / "utf-32-le" -- confirm there.

function utf.toeight(str)
    if not str or str == "" then
        return nil
    end
    local utftype = lpegmatch(p_utfstricttype,str)
    if utftype == "utf-8" then
        return sub(str,4)               -- remove the bom
    elseif utftype == "utf-16-be" then
        return utf16_to_utf8_be(str)    -- bom gets removed
    elseif utftype == "utf-16-le" then
        return utf16_to_utf8_le(str)    -- bom gets removed
    elseif utftype == "utf-32-be" then
        return utf32_to_utf8_be(str)    -- handled like the utf-16 cases
    elseif utftype == "utf-32-le" then
        return utf32_to_utf8_le(str)    -- handled like the utf-16 cases
    else
        return str
    end
end
1175
1176--
1177
do

    local p_nany = p_utf8character / ""
    local cache  = { }

    -- Counts how often 'what' occurs in str: every match is replaced by one
    -- space, every other character by nothing, and the length of what remains
    -- is the count.
    --
    -- For a string 'what' the compiled pattern is cached under that string. The
    -- entry used to be stored under the pattern itself, so the lookup never hit
    -- and the cache grew with every call.

    function utf.count(str,what)
        if type(what) == "string" then
            local p = cache[what]
            if not p then
                p = Cs((P(what)/" " + p_nany)^0)
                cache[what] = p
            end
            return #lpegmatch(p,str)
        else -- 4 times slower but still faster than / function
            return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
        end
    end

end
1197
if not string.utfvalues then

    -- So, a logical next step is to check for the values variant. It is over five
    -- times slower than the built-in string.utfvalues. I optimized it a bit for
    -- n=0,1.

    ----- wrap, yield, gmatch = coroutine.wrap, coroutine.yield, string.gmatch
    local find =  string.find

    local dummy = function()
        -- we share this one
    end

    -- function string.utfvalues(str)
    --     local n = #str
    --     if n == 0 then
    --         return wrap(dummy)
    --     elseif n == 1 then
    --         return wrap(function() yield(utfbyte(str)) end)
    --     else
    --         return wrap(function() for s in gmatch(str,".[\128-\191]*") do
    --             yield(utfbyte(s))
    --         end end)
    --     end
    -- end
    --
    -- faster:

    -- Returns an iterator that delivers the code points of str one by one, to be
    -- used as: for c in string.utfvalues(str) do ... end
    --
    -- The iterator for a one byte string has to return nil on its second call,
    -- otherwise a for loop over such a string never ends.

    function string.utfvalues(str)
        local n = #str
        if n == 0 then
            return dummy
        elseif n == 1 then
            local done = false
            return function()
                if not done then
                    done = true
                    return utfbyte(str)
                end
            end
        else
            local p = 1
         -- local n = #str
            return function()
             -- if p <= n then -- slower than the last find
                    local b, e = find(str,".[\128-\191]*",p)
                    if b then
                        p = e + 1
                        return utfbyte(sub(str,b,e))
                    end
             -- end
            end
        end
    end

    -- slower:
    --
    -- local pattern = C(p_utf8character) * Cp()
    -- ----- pattern = p_utf8character/utfbyte * Cp()
    -- ----- pattern = p_utf8byte * Cp()
    --
    -- function string.utfvalues(str) -- one of the cases where a find is faster than an lpeg
    --     local n = #str
    --     if n == 0 then
    --         return dummy
    --     elseif n == 1 then
    --         return function() return utfbyte(str) end
    --     else
    --         local p = 1
    --         return function()
    --             local s, e = lpegmatch(pattern,str,p)
    --             if e then
    --                 p = e
    --                 return utfbyte(s)
    --              -- return s
    --             end
    --         end
    --     end
    -- end

end
1272
utf.values = string.utfvalues

-- Maps a number onto a length of 1 up to 6 by comparing it with the thresholds
-- 0x80, 0xE0, 0xF0, 0xF8, 0xFC and 0xFE; values from 0xFE on give 0.
-- NOTE(review): the thresholds suggest that u is the first byte of a UTF-8
-- sequence and not a code point -- confirm against the callers.

function utf.chrlen(u) -- u is number
    if u < 0x80 then
        return 1
    elseif u < 0xE0 then
        return 2
    elseif u < 0xF0 then
        return 3
    elseif u < 0xF8 then
        return 4
    elseif u < 0xFC then
        return 5
    elseif u < 0xFE then
        return 6
    end
    return 0
end
1284
1285-- hashing saves a little but not that much in practice
1286--
1287-- local utf32 = table.setmetatableindex(function(t,k) local v = toutf32(k) t[k] = v return v end)
1288
if bit32 then

    local extract = bit32.extract
    local char    = string.char

    -- Returns n as a string of four bytes, least significant byte first. The
    -- bytes that are known to be zero are appended as literal zeros.

    function utf.toutf32string(n)
        if n <= 0xFF then
            return char(n) .. "\000\000\000"
        end
        local b0 = char(extract(n,0,8))
        local b1 = char(extract(n,8,8))
        if n <= 0xFFFF then
            return b0 .. b1 .. "\000\000"
        end
        local b2 = char(extract(n,16,8))
        if n <= 0xFFFFFF then
            return b0 .. b1 .. b2 .. "\000"
        end
        return b0 .. b1 .. b2 .. char(extract(n,24,8))
    end

end
1320
1321-- goodie:
1322
local len = utf.len
local rep = string.rep -- was 'rep', a name that is not defined in the visible part of this file

-- Pads s to a width of |n| characters, where the width of s is measured with
-- utf.len. A positive n puts the padding in front of s, a negative n puts it
-- after s. The optional c is the string to pad with and defaults to a space
-- (it used to be read from an undeclared variable). When n is nil or zero, or
-- when s is already wide enough, s is returned unchanged.

function string.utfpadd(s,n,c)
    if n and n ~= 0 then
        local l = len(s)
        if n > 0 then
            local d = n - l
            if d > 0 then
                return rep(c or " ",d) .. s
            end
        else
            local d = - n - l
            if d > 0 then
                return s .. rep(c or " ",d)
            end
        end
    end
    return s
end
1343
1344-- goodies
1345
do

    local utfcharacters = utf.characters or string.utfcharacters
    local utfchar       = utf.char       or string.utfcharacter

    -- A pattern for a literal string: the same as lpeg.P.

    lpeg.UP = P

    -- lpeg.US(str): a pattern that matches any single character out of str.

    if utfcharacters then

        function lpeg.US(str)
            local p = P(false)
            for uc in utfcharacters(str) do
                p = p + P(uc)
            end
            return p
        end

    else

        -- NOTE(review): p_utf8char is not defined in the visible part of this
        -- file (elsewhere p_utf8character is used) -- verify that this fallback
        -- can actually run.

        function lpeg.US(str)
            local p = P(false)
            local f = function(uc)
                p = p + P(uc)
            end
            lpegmatch((p_utf8char/f)^0,str)
            return p
        end

    end

    local range = p_utf8byte * p_utf8byte + Cc(false) -- utf8byte is already a capture

    -- lpeg.UR(str[,more]): a pattern that matches one character in a range. The
    -- range is either given by the first two characters of a string or by one
    -- or two numbers (a single number is a range of one). A string that holds
    -- fewer than two characters is matched literally.
    --
    -- Two things differ from the earlier implementation:
    --
    -- * A range of one no longer returns P(str): for a number that pattern
    --   matches that many arbitrary characters, and for a string like "aa" it
    --   matches both characters in a row. Such a range now takes the normal
    --   route.
    -- * A large range is tested with a match time capture. The function
    --   capture that was used before cannot make a match fail, so every
    --   character was accepted (and a boolean ended up in the captures).

    function lpeg.UR(str,more)
        local first, last
        if type(str) == "number" then
            first = str
            last = more or first
        else
            first, last = lpegmatch(range,str)
            if not last then
                return P(str)
            end
        end
        if not utfchar then
            utfchar = utf.char -- maybe delayed
        end
        if utfchar and (last - first < 8) then -- a somewhat arbitrary criterium
            local p = P(false)
            for i=first,last do
                p = p + P(utfchar(i))
            end
            return p -- matches nothing when the range is empty
        else
            -- the function gets the subject, the position and the code point;
            -- returning true accepts the character, returning false rejects it
            return Cmt(p_utf8byte,function(_,_,b)
                return b >= first and b <= last
            end)
        end
    end

    -- print(lpeg.match(lpeg.Cs((C(lpeg.UR("αω"))/{ ["χ"] = "OEPS" })^0),"αωχαω"))

end
1413
Source Browser ?