l-unicode.lmt /size: 22 Kb    last modification: 2021-10-28 13:51
1if not modules then modules = { } end modules ['l-unicode'] = {
2    version   = 1.001,
3    optimize  = true,
4    comment   = "companion to luat-lib.mkxl",
5    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
6    copyright = "PRAGMA ADE / ConTeXt Development Team",
7    license   = "see context related readme files"
8}
9
10-- See l-unicode.lua for the more generic (also 5.2) versions of the
11-- functions below ... that file evolved over time.
12--
13-- In lua 5.3+ we have:
14--
15-- utf8.char(···)         : concatenated
16-- utf8.charpatt          : "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
17-- utf8.codes(s)          : for p, c in utf8.codes(s) do body end
18-- utf8.codepoint(s [, i [, j]])
19-- utf8.len(s [, i])
20-- utf8.offset(s, n [, i])
21
22utf     = utf or { }
23unicode = nil
24
25local type = type
26local char, byte, format, sub, gmatch, rep = string.char, string.byte, string.format, string.sub, string.gmatch, string.rep
27local concat = table.concat
28local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp
29
30local lpegmatch       = lpeg.match
31local patterns        = lpeg.patterns
32local tabletopattern  = lpeg.utfchartabletopattern
33
34local finder          = lpeg.finder
35local replacer        = lpeg.replacer
36
37local p_utftype       = patterns.utftype
38local p_utfstricttype = patterns.utfstricttype
39local p_utfoffset     = patterns.utfoffset
40local p_utf8character = patterns.utf8character
41local p_utf8char      = patterns.utf8char
42local p_utf8byte      = patterns.utf8byte
43local p_utfbom        = patterns.utfbom
44local p_newline       = patterns.newline
45local p_whitespace    = patterns.whitespace
46
47local utfchar         = string.utfcharacter
48local utfbyte         = string.utfvalue
49local utflength       = string.utflength
50local utfcharacters   = string.utfcharacters
51local utfbytepairs    = string.bytepairs
52
53-- string.utfvalues
54-- string.characters
55-- string.characterpairs
56-- string.bytes
57-- string.utflength
58-- string.utfvalues
59
60utf.char       = utfchar
61utf.byte       = utfbyte
62utf.len        = utflength
63utf.length     = utflength
64utf.characters = utfcharacters
65utf.bytepairs  = utfbytepairs
66
-- Runs the utftype pattern over the given data and returns what it yields;
-- the string "unknown" is returned when there is no data or no result.
function utf.filetype(data)
    if data then
        return lpegmatch(p_utftype,data) or "unknown"
    end
    return "unknown"
end
70
do

    -- Replacement for a multi-byte sequence: the code point is written as
    -- "&#" followed by its upper case hexadecimal value and a semicolon.
    -- NOTE(review): xml hexadecimal references are normally spelled "&#x...;",
    -- confirm with the consumers of this output before changing the format.
    local function entity(s)
        local b = utfbyte(s)
        if b >= 127 then
            return format("&#%X;",b)
        end
        return s
    end

    local multibyte  = patterns.utf8two + patterns.utf8three + patterns.utf8four
    local toentities = Cs((patterns.utf8one + multibyte / entity)^0)

    patterns.toentities = toentities

    -- Returns str with every two, three and four byte utf sequence replaced
    -- by an entity; single byte characters pass through untouched.
    function utf.toentities(str)
        return lpegmatch(toentities,str)
    end

end
91
do

    -- Converts utf-16 data that starts with a byte order mark to utf-8.
    --
    -- lpeg.R works on bytes, so the surrogate ranges have to be given as byte
    -- ranges: the high byte of a leading surrogate (0xD800-0xDBFF) lies in
    -- 0xD8-0xDB and the high byte of a trailing one (0xDC00-0xDFFF) lies in
    -- 0xDC-0xDF. In big endian data the high byte comes first, in little endian
    -- data it comes second, so each byte order needs its own pair pattern.

    local one     = P(1)
    local two     = C(1) * C(1)
    local be_four = C(R("\216\219")) * C(1) * C(R("\220\223")) * C(1)
    local le_four = C(1) * C(R("\216\219")) * C(1) * C(R("\220\223"))

    -- a,b and c,d are the (high,low) bytes of the leading and trailing surrogate
    local function frompair(a,b,c,d)
        local ab = 0x100 * byte(a) + byte(b)
        local cd = 0x100 * byte(c) + byte(d)
        return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
    end

    -- a,b are the (high,low) bytes of a single 16 bit unit
    local function fromunit(a,b)
        return utfchar(byte(a)*256 + byte(b))
    end

    local pattern =
        P("\254\255") * Cs( (
            be_four / frompair
          + two     / fromunit
          + one -- a trailing odd byte is kept as it is
        )^1 )
      + P("\255\254") * Cs( (
            le_four / function(b,a,d,c) return frompair(a,b,c,d) end
          + two     / function(b,a)     return fromunit(a,b)     end
          + one -- a trailing odd byte is kept as it is
        )^1 )

    -- Returns the utf-8 version of s when s starts with a utf-16 bom that is
    -- followed by at least one byte, and s itself in all other cases.
    function string.toutf(s) -- in string namespace
        return lpegmatch(pattern,s) or s -- todo: utf32
    end

end
127
do

    -- Keeps every well formed one to four byte utf sequence and drops any
    -- other byte.
    local wellformed   = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four
    local validatedutf = Cs((wellformed + P(1) / "")^0)

    patterns.validatedutf = validatedutf

    -- Despite its name this does not return a boolean for strings: a string
    -- argument gives back the string with the invalid bytes removed, any other
    -- argument gives false.
    function utf.is_valid(str)
        if type(str) ~= "string" then
            return false
        end
        return lpegmatch(validatedutf,str) or false
    end

end
147
148if not utf.sub then
149
150    -- also negative indices, upto 10 times slower than a c variant
151
152    local b, e, n, first, last = 0, 0, 0, 0, 0
153
154    local function slide_zero(s,p)
155        n = n + 1
156        if n >= last then
157            e = p - 1
158        else
159            return p
160        end
161    end
162
163    local function slide_one(s,p)
164        n = n + 1
165        if n == first then
166            b = p
167        end
168        if n >= last then
169            e = p - 1
170        else
171            return p
172        end
173    end
174
175    local function slide_two(s,p)
176        n = n + 1
177        if n == first then
178            b = p
179        else
180            return true
181        end
182    end
183
184    local pattern_zero  = Cmt(p_utf8character,slide_zero)^0
185    local pattern_one   = Cmt(p_utf8character,slide_one )^0
186    local pattern_two   = Cmt(p_utf8character,slide_two )^0
187
188    local pattern_first = C(p_utf8character)
189
190    function utf.sub(str,start,stop)
191        if not start then
192            return str
193        end
194        if start == 0 then
195            start = 1
196        end
197        if not stop then
198            if start < 0 then
199                local l = utflength(str) -- we can inline this function if needed
200                start = l + start
201            else
202                start = start - 1
203            end
204            b, n, first = 0, 0, start
205            lpegmatch(pattern_two,str)
206            if n >= first then
207                return sub(str,b)
208            else
209                return ""
210            end
211        end
212        if start < 0 or stop < 0 then
213            local l = utf.length(str)
214            if start < 0 then
215                start = l + start
216                if start <= 0 then
217                    start = 1
218                else
219                    start = start + 1
220                end
221            end
222            if stop < 0 then
223                stop = l + stop
224                if stop == 0 then
225                    stop = 1
226                else
227                    stop = stop + 1
228                end
229            end
230        end
231        if start == 1 and stop == 1 then
232            return lpegmatch(pattern_first,str) or ""
233        elseif start > stop then
234            return ""
235        elseif start > 1 then
236            b, e, n, first, last = 0, 0, 0, start - 1, stop
237            lpegmatch(pattern_one,str)
238            if n >= first and e == 0 then
239                e = #str
240            end
241            return sub(str,b,e)
242        else
243            b, e, n, last = 1, 0, 0, stop
244            lpegmatch(pattern_zero,str)
245            if e == 0 then
246                e = #str
247            end
248            return sub(str,b,e)
249        end
250    end
251
252    -- local n = 100000
253    -- local str = string.rep("123456àáâãäå",100)
254    --
255    -- for i=-15,15,1 do
256    --     for j=-15,15,1 do
257    --         if utf.xsub(str,i,j) ~= utf.sub(str,i,j) then
258    --             print("error",i,j,"l>"..utf.xsub(str,i,j),"s>"..utf.sub(str,i,j))
259    --         end
260    --     end
261    --     if utf.xsub(str,i) ~= utf.sub(str,i) then
262    --         print("error",i,"l>"..utf.xsub(str,i),"s>"..utf.sub(str,i))
263    --     end
264    -- end
265
266    -- print(" 1, 7",utf.xsub(str, 1, 7),utf.sub(str, 1, 7))
267    -- print(" 0, 7",utf.xsub(str, 0, 7),utf.sub(str, 0, 7))
268    -- print(" 0, 9",utf.xsub(str, 0, 9),utf.sub(str, 0, 9))
269    -- print(" 4   ",utf.xsub(str, 4   ),utf.sub(str, 4   ))
270    -- print(" 0   ",utf.xsub(str, 0   ),utf.sub(str, 0   ))
271    -- print(" 0, 0",utf.xsub(str, 0, 0),utf.sub(str, 0, 0))
272    -- print(" 4, 4",utf.xsub(str, 4, 4),utf.sub(str, 4, 4))
273    -- print(" 4, 0",utf.xsub(str, 4, 0),utf.sub(str, 4, 0))
274    -- print("-3, 0",utf.xsub(str,-3, 0),utf.sub(str,-3, 0))
275    -- print(" 0,-3",utf.xsub(str, 0,-3),utf.sub(str, 0,-3))
276    -- print(" 5,-3",utf.xsub(str,-5,-3),utf.sub(str,-5,-3))
277    -- print("-3   ",utf.xsub(str,-3   ),utf.sub(str,-3   ))
278
279end
280
-- Turns a compiled substitution pattern into a function that maps nil and
-- the empty string onto "" and applies the pattern to anything else.
local function remapwith(pattern)
    return function(str)
        if str and str ~= "" then
            return lpegmatch(pattern,str)
        end
        return ""
    end
end

-- Creates a remapper from a mapping table or a mapping function.
--
-- mapping : table (keys are matched, 'action' or the table itself supplies
--           the replacement) or function (called for every utf character)
-- option  : "dynamic" (tables only: the pattern is rebuilt after a new key is
--           assigned), "pattern" (return the pattern itself) or anything else
--           (static: return the remapping function plus the pattern)
-- action  : optional replacement for table mappings, defaults to mapping
--
-- Any other kind of mapping gives a function that returns its argument, or
-- "" when that argument is nil or false.
function utf.remapper(mapping,option,action) -- static also returns a pattern
    local kind = type(mapping)
    if kind == "function" then
        local pattern = Cs((p_utf8character/mapping + p_utf8character)^0)
        if option == "pattern" then
            return pattern
        end
        return remapwith(pattern), pattern
    end
    if kind ~= "table" then
        -- is actually an error
        return function(str)
            return str or ""
        end
    end
    action = action or mapping
    if option == "dynamic" then
        local pattern = false
        -- a new key invalidates the compiled pattern
        table.setmetatablenewindex(mapping,function(t,k,v) rawset(t,k,v) pattern = false end)
        return function(str)
            if not str or str == "" then
                return ""
            end
            if not pattern then
                pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0)
            end
            return lpegmatch(pattern,str)
        end
    end
    local pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0)
    if option == "pattern" then
        return pattern
    end
    -- option == "static" and everything else
    return remapwith(pattern), pattern
end
331
332-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }
333-- print(remap("abcd 1234 abcd"))
334
-- Builds a replacement pattern from t once and returns a function that
-- applies it to a string; there is no precheck, so the result is always
-- produced by the string builder.
function utf.replacer(t)
    local substitute = replacer(t,false,false,true)
    local function apply(str)
        return lpegmatch(substitute,str)
    end
    return apply
end
341
-- Like utf.replacer but with a precheck: the finder built from t is tried
-- first and the string is returned untouched when it reports nothing, or a
-- position beyond the end of the string, so nothing gets built without a match.
function utf.subtituter(t)
    local find       = finder  (t)
    local substitute = replacer(t,false,false,true)
    return function(str)
        local position = lpegmatch(find,str)
        if position and position <= #str then
         -- return sub(str,1,position-2) .. lpegmatch(substitute,str,position-1) -- slower
            return lpegmatch(substitute,str)
        end
        return str
    end
end
357
358-- inspect(utf.split("a b c d"))
359-- inspect(utf.split("a b c d",true))
360
361local utflinesplitter     = p_utfbom^-1 * lpeg.tsplitat(p_newline)
362local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8character)^0)
363local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8character))^0)
364local utfcharsplitter_raw = Ct(C(p_utf8character)^0)
365
366patterns.utflinesplitter  = utflinesplitter
367
-- Applies the line splitter (optional bom, then split at newlines) to str;
-- a missing string is treated as an empty one.
function utf.splitlines(str)
    local data = str or ""
    return lpegmatch(utflinesplitter,data)
end
371
-- Splits str into a table of utf characters after an optional bom. With
-- ignorewhitespace set, runs of whitespace are skipped instead of collected.
-- A missing string is treated as an empty one.
function utf.split(str,ignorewhitespace) -- new
    local splitter = ignorewhitespace and utfcharsplitter_iws or utfcharsplitter_ows
    return lpegmatch(splitter,str or "")
end
379
-- Splits str into a table of utf characters; unlike utf.split a leading bom
-- is not skipped. A missing string is treated as an empty one, in line with
-- utf.split and utf.splitlines, instead of raising an error in the matcher.
function utf.totable(str) -- keeps bom
    return lpegmatch(utfcharsplitter_raw,str or "")
end
383
384-- 0  EF BB BF      UTF-8
385-- 1  FF FE         UTF-16-little-endian
386-- 2  FE FF         UTF-16-big-endian
387-- 3  FF FE 00 00   UTF-32-little-endian
388-- 4  00 00 FE FF   UTF-32-big-endian
389--
390-- \000 fails in <= 5.0 but is valid in >=5.1 where %z is deprecated
391
392-- utf.name = {
393--     [0] = 'utf-8',
394--     [1] = 'utf-16-le',
395--     [2] = 'utf-16-be',
396--     [3] = 'utf-32-le',
397--     [4] = 'utf-32-be'
398-- }
399
-- Reads up to four bytes from the file handle f, moves the file position to
-- the offset reported by the utfoffset pattern when that is less than four,
-- and returns what the utftype pattern yields for those bytes.
function utf.magic(f) -- not used
    local head   = f:read(4) or ""
    local offset = lpegmatch(p_utfoffset,head)
    if offset < 4 then
        -- reposition so that reading continues at the reported offset
        f:seek('set',offset)
    end
    return lpegmatch(p_utftype,head)
end
408
409local utf_16_be_getbom = patterns.utfbom_16_be^-1
410local utf_16_le_getbom = patterns.utfbom_16_le^-1
411local utf_32_be_getbom = patterns.utfbom_32_be^-1
412local utf_32_le_getbom = patterns.utfbom_32_le^-1
413
414local utf_16_be_linesplitter = utf_16_be_getbom * lpeg.tsplitat(patterns.utf_16_be_nl)
415local utf_16_le_linesplitter = utf_16_le_getbom * lpeg.tsplitat(patterns.utf_16_le_nl)
416local utf_32_be_linesplitter = utf_32_be_getbom * lpeg.tsplitat(patterns.utf_32_be_nl)
417local utf_32_le_linesplitter = utf_32_le_getbom * lpeg.tsplitat(patterns.utf_32_le_nl)
418
-- Patterns that convert utf-16 and utf-32 data (either byte order) to utf-8.
-- Each one resets the surrogate state, skips an optional bom and then
-- substitutes every 16 or 32 bit unit.

local p_utf16_to_utf8_be, p_utf16_to_utf8_le, p_utf32_to_utf8_be, p_utf32_to_utf8_le

do

    -- a leading surrogate that waits for the unit that follows it, or zero
    local more = 0

    local function reset()
        more = 0
    end

    -- Converts one 16 bit unit. A leading surrogate is remembered and gives
    -- an empty string (else the c's end up in the stream); the next unit is
    -- then combined with it into one code point.
    local function unit16(now)
        local pending = more
        if pending > 0 then
            more = 0
            return utfchar((pending-0xD800)*0x400 + (now-0xDC00) + 0x10000)
        end
        if now >= 0xD800 and now <= 0xDBFF then
            more = now
            return ""
        end
        return utfchar(now)
    end

    -- Converts one 32 bit unit given as four bytes, most significant first.
    local function unit32(a,b,c,d)
        return utfchar(((byte(a)*256 + byte(b))*256 + byte(c))*256 + byte(d))
    end

    local pair = C(1) * C(1)
    local quad = C(1) * C(1) * C(1) * C(1)

    local be16 = pair / function(left,right) return unit16(256*byte(left) + byte(right)) end
    local le16 = pair / function(right,left) return unit16(256*byte(left) + byte(right)) end
    local be32 = quad / unit32
    local le32 = quad / function(a,b,c,d) return unit32(d,c,b,a) end

    p_utf16_to_utf8_be = P(true) / reset * utf_16_be_getbom * Cs(be16^0)
    p_utf16_to_utf8_le = P(true) / reset * utf_16_le_getbom * Cs(le16^0)
    p_utf32_to_utf8_be = P(true) / reset * utf_32_be_getbom * Cs(be32^0)
    p_utf32_to_utf8_le = P(true) / reset * utf_32_le_getbom * Cs(le32^0)

end
461
462patterns.utf16_to_utf8_be = p_utf16_to_utf8_be
463patterns.utf16_to_utf8_le = p_utf16_to_utf8_le
464patterns.utf32_to_utf8_be = p_utf32_to_utf8_be
465patterns.utf32_to_utf8_le = p_utf32_to_utf8_le
466
-- The string and line table converters only differ in the patterns that
-- they use, so they are generated by two small factories.

-- Gives a function that converts a string with the given pattern; nil,
-- false and the empty string are returned as they are.
local function stringconverter(pattern)
    return function(s)
        if s and s ~= "" then
            return lpegmatch(pattern,s)
        end
        return s
    end
end

-- Gives a function that converts a table of lines in place and returns it.
-- A string argument is first split into lines with the given splitter, a
-- missing argument gives nil and empty lines are left alone.
local function linesconverter(splitter,pattern)
    return function(t)
        if not t then
            return nil
        end
        if type(t) == "string" then
            t = lpegmatch(splitter,t)
        end
        for i=1,#t do
            local line = t[i]
            if line ~= "" then
                t[i] = lpegmatch(pattern,line)
            end
        end
        return t
    end
end

local utf16_to_utf8_be   = stringconverter(p_utf16_to_utf8_be)
local utf16_to_utf8_be_t = linesconverter (utf_16_be_linesplitter,p_utf16_to_utf8_be)

local utf16_to_utf8_le   = stringconverter(p_utf16_to_utf8_le)
local utf16_to_utf8_le_t = linesconverter (utf_16_le_linesplitter,p_utf16_to_utf8_le)

local utf32_to_utf8_be   = stringconverter(p_utf32_to_utf8_be)
local utf32_to_utf8_be_t = linesconverter (utf_32_be_linesplitter,p_utf32_to_utf8_be)

local utf32_to_utf8_le   = stringconverter(p_utf32_to_utf8_le)
local utf32_to_utf8_le_t = linesconverter (utf_32_le_linesplitter,p_utf32_to_utf8_le)
558
559utf.utf16_to_utf8_le_t = utf16_to_utf8_le_t
560utf.utf16_to_utf8_be_t = utf16_to_utf8_be_t
561utf.utf32_to_utf8_le_t = utf32_to_utf8_le_t
562utf.utf32_to_utf8_be_t = utf32_to_utf8_be_t
563
564utf.utf16_to_utf8_le   = utf16_to_utf8_le
565utf.utf16_to_utf8_be   = utf16_to_utf8_be
566utf.utf32_to_utf8_le   = utf32_to_utf8_le
567utf.utf32_to_utf8_be   = utf32_to_utf8_be
568
-- Splits a utf-8 string into a table of lines; anything that is not a
-- string is returned as it is.
function utf.utf8_to_utf8_t(t)
    if type(t) == "string" then
        return lpegmatch(utflinesplitter,t) or t
    end
    return t
end
572
-- Converts utf-16 lines (or a string that gets split into lines) to utf-8.
-- A true endian selects the big endian converter; when that is not selected
-- or yields nothing, the little endian converter is used, and when that
-- yields nothing either the argument itself is returned.
function utf.utf16_to_utf8_t(t,endian)
    local result = endian and utf16_to_utf8_be_t(t)
    if not result then
        result = utf16_to_utf8_le_t(t)
    end
    return result or t
end
576
-- Converts utf-32 lines (or a string that gets split into lines) to utf-8.
-- A true endian selects the big endian converter; when that is not selected
-- or yields nothing, the little endian converter is used, and when that
-- yields nothing either the argument itself is returned.
function utf.utf32_to_utf8_t(t,endian)
    local result = endian and utf32_to_utf8_be_t(t)
    if not result then
        result = utf32_to_utf8_le_t(t)
    end
    return result or t
end
580
do

    -- Splits a code point into 16 bit units: one unit below 0x10000 and a
    -- surrogate pair (leading, trailing) from there on.
    local function units(b)
        if b < 0x10000 then
            return b
        end
        b = b - 0x10000
        return (b>>10) + 0xD800, b%1024 + 0xDC00
    end

    -- low byte first
    local function little(b)
        local u1, u2 = units(b)
        if u2 then
            return char(u1%256,(u1>>8),u2%256,(u2>>8))
        end
        return char(u1%256,(u1>>8))
    end

    -- high byte first
    local function big(b)
        local u1, u2 = units(b)
        if u2 then
            return char((u1>>8),u1%256,(u2>>8),u2%256)
        end
        return char((u1>>8),u1%256)
    end

    -- every utf character is replaced by its units, any other byte is dropped
    local l_remap = Cs((p_utf8byte/little+P(1)/"")^0)
    local b_remap = Cs((p_utf8byte/big   +P(1)/"")^0)

    -- Converts utf-8 to big endian utf-16, prefixed by a bom unless nobom is set.
    local function utf8_to_utf16_be(str,nobom)
        local converted = lpegmatch(b_remap,str)
        if nobom then
            return converted
        end
        return char(254,255) .. converted
    end

    -- Converts utf-8 to little endian utf-16, prefixed by a bom unless nobom is set.
    local function utf8_to_utf16_le(str,nobom)
        local converted = lpegmatch(l_remap,str)
        if nobom then
            return converted
        end
        return char(255,254) .. converted
    end

    utf.utf8_to_utf16_be = utf8_to_utf16_be
    utf.utf8_to_utf16_le = utf8_to_utf16_le

    -- Converts utf-8 to utf-16: big endian unless littleendian is set.
    function utf.utf8_to_utf16(str,littleendian,nobom)
        local convert = littleendian and utf8_to_utf16_le or utf8_to_utf16_be
        return convert(str,nobom)
    end

end
636
do

    local function firstcode(unicode)
        return format("0x%04X",unicode)
    end

    local function nextcode(unicode,separator)
        return format("%s0x%04X",separator,unicode)
    end

    -- the first character gets no separator, the extra match argument
    -- supplies the separator for all the following ones
    local p_codes = Cs((p_utf8byte / firstcode) * ((p_utf8byte * Carg(1)) / nextcode)^0)

    -- Returns the characters of str as 0x.... numbers (at least four upper
    -- case hexadecimal digits) joined by the separator, a space by default.
    function utf.tocodes(str,separator)
        return lpegmatch(p_codes,str,1,separator or " ")
    end

end
645
-- Formats a code point as U+ followed by at least five upper case
-- hexadecimal digits; anything that is not a number is first passed to utfbyte.
function utf.ustring(s)
    if type(s) ~= "number" then
        s = utfbyte(s)
    end
    return format("U+%05X",s)
end
649
-- Formats a code point as 0x followed by at least five upper case
-- hexadecimal digits; anything that is not a number is first passed to utfbyte.
function utf.xstring(s)
    if type(s) ~= "number" then
        s = utfbyte(s)
    end
    return format("0x%05X",s)
end
653
-- Returns str as utf-8 without a byte order mark. The strict type pattern
-- decides what happens: utf-16 data is converted, a utf-8 bom is cut off and
-- anything else is returned as it is. Nil is returned when there is no
-- string or when it is empty.
function utf.toeight(str)
    if not str or str == "" then
        return nil
    end
    local kind = lpegmatch(p_utfstricttype,str)
    if kind == "utf-16-be" then
        return utf16_to_utf8_be(str) -- bom gets removed
    end
    if kind == "utf-16-le" then
        return utf16_to_utf8_le(str) -- bom gets removed
    end
    if kind == "utf-8" then
        return sub(str,4) -- remove the bom
    end
    return str
end
669
do

    local p_nany = p_utf8character / ""
    local cache  = { }

    -- Counts the occurrences of 'what' in str. Every match is replaced by a
    -- single space and every other utf character by nothing, so the length of
    -- the substituted string is the number of matches.
    --
    -- str  : the string to search
    -- what : a string, or anything else that lpeg.P accepts (such as a pattern)
    --
    -- The compiled pattern for a string is cached under that string, so it is
    -- only built once per distinct search string.
    function utf.count(str,what)
        if type(what) == "string" then
            local p = cache[what]
            if not p then
                p = Cs((P(what)/" " + p_nany)^0)
                cache[what] = p -- keyed by the string, otherwise the lookup above never hits
            end
            return #lpegmatch(p,str)
        else -- 4 times slower but still faster than / function
            return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
        end
    end

end
689
690utf.values = string.utfvalues
691
-- Maps a number onto a length by comparing it with the thresholds 0x80,
-- 0xE0, 0xF0, 0xF8, 0xFC and 0xFE: the result is 1 up to 6 for the first
-- threshold that u stays below and 0 when it stays below none.
-- NOTE(review): the thresholds look like those of a utf-8 lead byte rather
-- than of a code point - confirm against the callers.
function utf.chrlen(u) -- u is number
    if u < 0x80 then
        return 1
    elseif u < 0xE0 then
        return 2
    elseif u < 0xF0 then
        return 3
    elseif u < 0xF8 then
        return 4
    elseif u < 0xFC then
        return 5
    elseif u < 0xFE then
        return 6
    end
    return 0
end
701
702-- hashing saves a little but not that much in practice
703--
704-- local utf32 = table.setmetatableindex(function(t,k) local v = toutf32(k) t[k] = v return v end)
705
do

    local extract = bit32.extract
    local char    = string.char

    -- Returns the number n as four bytes, least significant byte first. The
    -- bytes above the size of n are written as zeros without extracting them.
    function utf.toutf32string(n)
        if n <= 0xFF then
            return char(n,0,0,0)
        end
        local b0 = extract(n,0,8)
        local b1 = extract(n,8,8)
        if n <= 0xFFFF then
            return char(b0,b1,0,0)
        end
        local b2 = extract(n,16,8)
        if n <= 0xFFFFFF then
            return char(b0,b1,b2,0)
        end
        return char(b0,b1,b2,extract(n,24,8))
    end

end
737
738-- goodie:
739
-- Pads s to a width of |n| utf characters.
--
-- s : the string to pad
-- n : the wanted width; a positive value pads at the left, a negative value
--     pads at the right, nil and zero leave the string alone
-- c : optional padding string, a space by default (this used to be read from
--     an undeclared global, so effectively it always was a space)
--
-- The string is returned unchanged when it already has the wanted width.
function string.utfpadd(s,n,c)
    if n and n ~= 0 then
        local l = utflength(s)
        if n > 0 then
            local d = n - l
            if d > 0 then
                return rep(c or " ",d) .. s
            end
        else
            local d = - n - l
            if d > 0 then
                return s .. rep(c or " ",d)
            end
        end
    end
    return s
end
757
758-- goodies
759
do

    -- utf aware variants of lpeg.P, lpeg.S and lpeg.R

    lpeg.UP = P

    -- Returns a pattern that matches any one of the utf characters in str.
    function lpeg.US(str)
        local p = P(false)
        for uc in utfcharacters(str) do
            p = p + P(uc)
        end
        return p
    end

    local range = p_utf8byte * p_utf8byte + Cc(false) -- utf8byte is already a capture

    -- Returns a pattern that matches one utf character in a range.
    --
    -- str  : a string whose first two utf characters are the bounds of the
    --        range, or a number that is the first code point
    -- more : the last code point when str is a number (defaults to str)
    --
    -- A string with fewer than two characters is matched literally. An
    -- inverted range gives a pattern that never matches.
    function lpeg.UR(str,more)
        local first, last
        if type(str) == "number" then
            first = str
            last = more or first
        else
            first, last = lpegmatch(range,str)
            if not last then
                return P(str)
            end
        end
        if not utfchar then
            utfchar = utf.char -- maybe delayed
        end
        if first == last then
            -- One code point. P(str) is only a fallback here: with a number
            -- it matches that many characters and with a string like "xx" it
            -- matches the doubled sequence.
            if utfchar then
                return P(utfchar(first))
            end
            return P(str)
        end
        if utfchar and (last - first < 8) then -- a somewhat arbitrary criterion
            local p = P(false)
            for i=first,last do
                p = p + P(utfchar(i))
            end
            return p
        else
            -- A function capture (p_utf8byte / f) cannot make a match fail, so
            -- it would accept every character. A match time capture can: the
            -- boolean that is returned decides if the character is accepted
            -- and no capture values are produced, just like in the branch
            -- above.
            return Cmt(p_utf8byte,function(_,_,b)
                return b >= first and b <= last
            end)
        end
    end

    -- print(lpeg.match(lpeg.Cs((C(lpeg.UR("αω"))/{ ["χ"] = "OEPS" })^0),"αωχαω"))

end
809