l-unicode.lmt /size: 21 Kb    last modification: 2023-12-21 09:44
1if not modules then modules = { } end modules ['l-unicode'] = {
2    version   = 1.001,
3    optimize  = true,
4    comment   = "companion to luat-lib.mkxl",
5    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
6    copyright = "PRAGMA ADE / ConTeXt Development Team",
7    license   = "see context related readme files"
8}
9
10-- See l-unicode.lua for the more generic (also 5.2) versions of the
11-- functions below ... that file evolved over time.
12--
13-- In lua 5.3+ we have:
14--
-- utf8.char(···)         : concatenated
16-- utf8.charpatt          : "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
17-- utf8.codes(s)          : for p, c in utf8.codes(s) do body end
18-- utf8.codepoint(s [, i [, j]])
19-- utf8.len(s [, i])
20-- utf8.offset(s, n [, i])
21
22utf     = utf or { }
23unicode = nil
24
25local type = type
26local char, byte, format, sub, gmatch, rep = string.char, string.byte, string.format, string.sub, string.gmatch, string.rep
27local concat = table.concat
28local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp
29
30local lpegmatch       = lpeg.match
31local patterns        = lpeg.patterns
32local tabletopattern  = lpeg.utfchartabletopattern
33
34local finder          = lpeg.finder
35local replacer        = lpeg.replacer
36
37local p_utftype       = patterns.utftype
38local p_utfstricttype = patterns.utfstricttype
39local p_utfoffset     = patterns.utfoffset
40local p_utf8character = patterns.utf8character
41local p_utf8char      = patterns.utf8char
42local p_utf8byte      = patterns.utf8byte
43local p_utfbom        = patterns.utfbom
44local p_newline       = patterns.newline
45local p_whitespace    = patterns.whitespace
46
47local utfchar         = string.utfcharacter
48local utfbyte         = string.utfvalue
49local utflength       = string.utflength
50local utfcharacters   = string.utfcharacters
51local utfbytepairs    = string.bytepairs
52
53-- string.utfvalues
54-- string.characters
55-- string.characterpairs
56-- string.bytes
57-- string.utflength
58-- string.utfvalues
59
60utf.char       = utfchar
61utf.byte       = utfbyte
62utf.len        = utflength
63utf.length     = utflength
64utf.characters = utfcharacters
65utf.bytepairs  = utfbytepairs
66
-- Returns whatever the utftype pattern yields for the given data, or the
-- string "unknown" when there is no data or the pattern yields nothing.
function utf.filetype(data)
    if data then
        local kind = lpegmatch(p_utftype,data)
        if kind then
            return kind
        end
    end
    return "unknown"
end
70
do

    -- Turns one multi-byte character into a hexadecimal numeric entity; a
    -- value below 127 is passed on as-is.

    local function entity(s)
        local b = utfbyte(s)
        if b < 127 then
            return s
        end
        return format("&#%X;",b)
    end

    local multibyte  = patterns.utf8two + patterns.utf8three + patterns.utf8four
    local toentities = Cs((patterns.utf8one + multibyte / entity)^0)

    patterns.toentities = toentities

    -- Applies the substitution pattern above to the given string.

    function utf.toentities(str)
        return lpegmatch(toentities,str)
    end

end
91
do

    -- Converts a string that starts with an utf-16 byte order mark to utf-8.
    --
    -- Fixes compared to the previous version:
    --
    -- * a 16 bit unit is 256 * high + low (the multiplier was 0xFF)
    -- * the lead byte of a surrogate pair is matched with a plain byte range
    --   (0xD8-0xDB for the high and 0xDC-0xDF for the low surrogate); the old
    --   range was built from two-byte utf-8 strings and could never match
    -- * in the little endian case the high byte is the second byte of a unit

    local one      = P(1)
    local two      = C(1) * C(1)
    local highlead = R("\216\219") -- 0xD8 - 0xDB
    local lowlead  = R("\220\223") -- 0xDC - 0xDF
    local four_be  = C(highlead) * C(1) * C(lowlead) * C(1)
    local four_le  = C(1) * C(highlead) * C(1) * C(lowlead)

    -- arguments are single byte strings, most significant byte first

    local function frompair(a,b,c,d)
        local ab = 256 * byte(a) + byte(b)
        local cd = 256 * byte(c) + byte(d)
        return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
    end

    local function fromunit(a,b)
        return utfchar(256 * byte(a) + byte(b))
    end

    -- The bom is matched outside the Cs so it does not end up in the result;
    -- a trailing odd byte is passed through unchanged by 'one'.

    local pattern =
        P("\254\255") * Cs( (
            four_be / frompair
          + two     / fromunit
          + one
        )^1 )
      + P("\255\254") * Cs( (
            four_le / function(b,a,d,c) return frompair(a,b,c,d) end
          + two     / function(b,a)     return fromunit(a,b)     end
          + one
        )^1 )

    -- Returns the converted string, or the original string when it does not
    -- start with an utf-16 bom (or has nothing after the bom).

    function string.toutf(s) -- in string namespace
        return lpegmatch(pattern,s) or s -- todo: utf32
    end

end
127
do

    -- Keeps every one to four byte sequence accepted by the utf8 patterns and
    -- replaces any other byte by nothing.

    local accepted     = patterns.utf8one + patterns.utf8two + patterns.utf8three + patterns.utf8four
    local validatedutf = Cs((accepted + P(1) / "")^0)

    patterns.validatedutf = validatedutf

    -- Returns false for a non string, otherwise the result of the cleanup
    -- pattern (so the cleaned up string, not a boolean).

    function utf.is_valid(str)
        if type(str) ~= "string" then
            return false
        end
        return lpegmatch(validatedutf,str) or false
    end

end
147
148if not utf.sub then
149
150    -- also negative indices, upto 10 times slower than a c variant
151
152    local b, e, n, first, last = 0, 0, 0, 0, 0
153
154    local function slide_zero(s,p)
155        n = n + 1
156        if n >= last then
157            e = p - 1
158        else
159            return p
160        end
161    end
162
163    local function slide_one(s,p)
164        n = n + 1
165        if n == first then
166            b = p
167        end
168        if n >= last then
169            e = p - 1
170        else
171            return p
172        end
173    end
174
175    local function slide_two(s,p)
176        n = n + 1
177        if n == first then
178            b = p
179        else
180            return true
181        end
182    end
183
184    local pattern_zero  = Cmt(p_utf8character,slide_zero)^0
185    local pattern_one   = Cmt(p_utf8character,slide_one )^0
186    local pattern_two   = Cmt(p_utf8character,slide_two )^0
187
188    local pattern_first = C(p_utf8character)
189
190    function utf.sub(str,start,stop)
191        if not start then
192            return str
193        end
194        if start == 0 then
195            start = 1
196        end
197        if not stop then
198            if start < 0 then
199                local l = utflength(str) -- we can inline this function if needed
200                start = l + start
201            else
202                start = start - 1
203            end
204            b, n, first = 0, 0, start
205            lpegmatch(pattern_two,str)
206            if n >= first then
207                return sub(str,b)
208            else
209                return ""
210            end
211        end
212        if start < 0 or stop < 0 then
213            local l = utf.length(str)
214            if start < 0 then
215                start = l + start
216                if start <= 0 then
217                    start = 1
218                else
219                    start = start + 1
220                end
221            end
222            if stop < 0 then
223                stop = l + stop
224                if stop == 0 then
225                    stop = 1
226                else
227                    stop = stop + 1
228                end
229            end
230        end
231        if start == 1 and stop == 1 then
232            return lpegmatch(pattern_first,str) or ""
233        elseif start > stop then
234            return ""
235        elseif start > 1 then
236            b, e, n, first, last = 0, 0, 0, start - 1, stop
237            lpegmatch(pattern_one,str)
238            if n >= first and e == 0 then
239                e = #str
240            end
241            return sub(str,b,e)
242        else
243            b, e, n, last = 1, 0, 0, stop
244            lpegmatch(pattern_zero,str)
245            if e == 0 then
246                e = #str
247            end
248            return sub(str,b,e)
249        end
250    end
251
252    -- local n = 100000
253    -- local str = string.rep("123456àáâãäå",100)
254    --
255    -- for i=-15,15,1 do
256    --     for j=-15,15,1 do
257    --         if utf.xsub(str,i,j) ~= utf.sub(str,i,j) then
258    --             print("error",i,j,"l>"..utf.xsub(str,i,j),"s>"..utf.sub(str,i,j))
259    --         end
260    --     end
261    --     if utf.xsub(str,i) ~= utf.sub(str,i) then
262    --         print("error",i,"l>"..utf.xsub(str,i),"s>"..utf.sub(str,i))
263    --     end
264    -- end
265
266    -- print(" 1, 7",utf.xsub(str, 1, 7),utf.sub(str, 1, 7))
267    -- print(" 0, 7",utf.xsub(str, 0, 7),utf.sub(str, 0, 7))
268    -- print(" 0, 9",utf.xsub(str, 0, 9),utf.sub(str, 0, 9))
269    -- print(" 4   ",utf.xsub(str, 4   ),utf.sub(str, 4   ))
270    -- print(" 0   ",utf.xsub(str, 0   ),utf.sub(str, 0   ))
271    -- print(" 0, 0",utf.xsub(str, 0, 0),utf.sub(str, 0, 0))
272    -- print(" 4, 4",utf.xsub(str, 4, 4),utf.sub(str, 4, 4))
273    -- print(" 4, 0",utf.xsub(str, 4, 0),utf.sub(str, 4, 0))
274    -- print("-3, 0",utf.xsub(str,-3, 0),utf.sub(str,-3, 0))
275    -- print(" 0,-3",utf.xsub(str, 0,-3),utf.sub(str, 0,-3))
276    -- print(" 5,-3",utf.xsub(str,-5,-3),utf.sub(str,-5,-3))
277    -- print("-3   ",utf.xsub(str,-3   ),utf.sub(str,-3   ))
278
279end
280
-- Creates a remapper from a mapping table or a mapping function.
--
-- option "dynamic" : (table only) returns a function; the pattern is built on
--                    first use and rebuilt after a new key has been added
-- option "pattern" : returns the pattern itself
-- otherwise        : returns a function plus the pattern that it uses
--
-- The returned functions give "" for a nil or empty string. With any other
-- kind of mapping a function is returned that passes the string on (or "").

function utf.remapper(mapping,option,action) -- static also returns a pattern
    local variant = type(mapping)
    if variant ~= "table" and variant ~= "function" then
        -- is actually an error
        return function(str)
            return str or ""
        end
    end
    local function build()
        if variant == "table" then
            return Cs((tabletopattern(mapping)/action + p_utf8character)^0)
        else
            return Cs((p_utf8character/mapping + p_utf8character)^0)
        end
    end
    if variant == "table" then
        action = action or mapping
        if option == "dynamic" then
            local compiled = false
            table.setmetatablenewindex(mapping,function(t,k,v)
                rawset(t,k,v)
                compiled = false -- forces a rebuild on next use
            end)
            return function(str)
                if not str or str == "" then
                    return ""
                end
                if not compiled then
                    compiled = build()
                end
                return lpegmatch(compiled,str)
            end
        end
    end
    local compiled = build()
    if option == "pattern" then
        return compiled
    end
    return function(str)
        if not str or str == "" then
            return ""
        end
        return lpegmatch(compiled,str)
    end, compiled
end
331
332-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }
333-- print(remap("abcd 1234 abcd"))
334
-- Returns a function that applies the replacer pattern made from t to a
-- string, without any check beforehand.

function utf.replacer(t) -- no precheck, always string builder
    local substitute = replacer(t,false,false,true)
    local function apply(str)
        return lpegmatch(substitute,str)
    end
    return apply
end
341
-- Returns a function that first runs a finder made from t and only applies
-- the replacer when the finder reports a position inside the string; in all
-- other cases the original string is returned.

function utf.subtituter(t) -- with precheck and no building if no match
    local locate     = finder  (t)
    local substitute = replacer(t,false,false,true)
    return function(str)
        local hit = lpegmatch(locate,str)
        if hit and hit <= #str then
         -- return sub(str,1,hit-2) .. lpegmatch(substitute,str,hit-1) -- slower
            return lpegmatch(substitute,str)
        end
        return str
    end
end
357
358-- inspect(utf.split("a b c d"))
359-- inspect(utf.split("a b c d",true))
360
-- The raw splitter collects all characters in a table; the other splitters
-- first skip an optional bom (ows keeps whitespace, iws drops runs of it).

local utfcharsplitter_raw = Ct(C(p_utf8character)^0)
local utfcharsplitter_ows = p_utfbom^-1 * utfcharsplitter_raw
local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8character))^0)
local utflinesplitter     = p_utfbom^-1 * lpeg.tsplitat(p_newline)

patterns.utflinesplitter  = utflinesplitter
367
-- Applies the line splitter (which skips an optional bom) to the string; a
-- missing string is treated as an empty one.

function utf.splitlines(str)
    if not str then
        str = ""
    end
    return lpegmatch(utflinesplitter,str)
end
371
-- Splits a string into a table of characters after skipping an optional bom;
-- with ignorewhitespace set, runs of whitespace are not collected.

function utf.split(str,ignorewhitespace) -- new
    local splitter = ignorewhitespace and utfcharsplitter_iws or utfcharsplitter_ows
    return lpegmatch(splitter,str or "")
end
379
-- Splits a string into a table of characters; there is no bom skipping here.

function utf.totable(str) -- keeps bom
    local characters = lpegmatch(utfcharsplitter_raw,str)
    return characters
end
383
384-- 0  EF BB BF      UTF-8
385-- 1  FF FE         UTF-16-little-endian
386-- 2  FE FF         UTF-16-big-endian
387-- 3  FF FE 00 00   UTF-32-little-endian
388-- 4  00 00 FE FF   UTF-32-big-endian
389--
-- \000 fails in <= 5.0 but is valid in >= 5.1 where %z is deprecated
391
392-- utf.name = {
393--     [0] = 'utf-8',
394--     [1] = 'utf-16-le',
395--     [2] = 'utf-16-be',
396--     [3] = 'utf-32-le',
397--     [4] = 'utf-32-be'
398-- }
399
-- Reads up to four bytes from the file handle, repositions the handle at the
-- offset reported by the utfoffset pattern when that offset is below four,
-- and returns what the utftype pattern yields for those bytes.

function utf.magic(f) -- not used
    local head   = f:read(4) or ""
    local offset = lpegmatch(p_utfoffset,head)
    if offset < 4 then
        f:seek('set',offset)
    end
    return lpegmatch(p_utftype,head)
end
408
-- optional byte order marks

local utf_16_be_getbom = patterns.utfbom_16_be^-1
local utf_16_le_getbom = patterns.utfbom_16_le^-1
local utf_32_be_getbom = patterns.utfbom_32_be^-1
local utf_32_le_getbom = patterns.utfbom_32_le^-1

-- line splitters that skip the optional bom

local utf_16_be_linesplitter = utf_16_be_getbom * lpeg.tsplitat(patterns.utf_16_be_nl)
local utf_16_le_linesplitter = utf_16_le_getbom * lpeg.tsplitat(patterns.utf_16_le_nl)
local utf_32_be_linesplitter = utf_32_be_getbom * lpeg.tsplitat(patterns.utf_32_be_nl)
local utf_32_le_linesplitter = utf_32_le_getbom * lpeg.tsplitat(patterns.utf_32_le_nl)

-- A pending high surrogate (0 means none) that is kept between two
-- successive 16 bit units.

local more = 0

-- Converts one 16 bit unit: a high surrogate is remembered and gives an empty
-- string, the unit after it is combined with it, anything else is converted
-- directly.

local function utf16_unit_to_utf8(now)
    if more > 0 then
        local combined = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
        more = 0
        return utfchar(combined)
    end
    if now >= 0xD800 and now <= 0xDBFF then
        more = now
        return "" -- else the c's end up in the stream
    end
    return utfchar(now)
end

local p_utf16_to_utf8_be = C(1) * C(1) / function(left,right)
    return utf16_unit_to_utf8(256*byte(left) + byte(right))
end

local p_utf16_to_utf8_le = C(1) * C(1) / function(right,left)
    return utf16_unit_to_utf8(256*byte(left) + byte(right))
end

local p_utf32_to_utf8_be = C(1) * C(1) * C(1) * C(1) / function(a,b,c,d)
    return utfchar(((byte(a)*256 + byte(b))*256 + byte(c))*256 + byte(d))
end

local p_utf32_to_utf8_le = C(1) * C(1) * C(1) * C(1) / function(a,b,c,d)
    return utfchar(((byte(d)*256 + byte(c))*256 + byte(b))*256 + byte(a))
end

-- Each complete pattern clears the pending surrogate, skips the optional bom
-- and then substitutes unit by unit.

local reset_more = P(true) / function() more = 0 end

p_utf16_to_utf8_be = reset_more * utf_16_be_getbom * Cs(p_utf16_to_utf8_be^0)
p_utf16_to_utf8_le = reset_more * utf_16_le_getbom * Cs(p_utf16_to_utf8_le^0)
p_utf32_to_utf8_be = reset_more * utf_32_be_getbom * Cs(p_utf32_to_utf8_be^0)
p_utf32_to_utf8_le = reset_more * utf_32_le_getbom * Cs(p_utf32_to_utf8_le^0)

patterns.utf16_to_utf8_be = p_utf16_to_utf8_be
patterns.utf16_to_utf8_le = p_utf16_to_utf8_le
patterns.utf32_to_utf8_be = p_utf32_to_utf8_be
patterns.utf32_to_utf8_le = p_utf32_to_utf8_le
466
-- Converts a big endian utf-16 string with the pattern defined above; a
-- missing or empty string is returned as-is.

local function utf16_to_utf8_be(s)
    if not s or s == "" then
        return s
    end
    return lpegmatch(p_utf16_to_utf8_be,s)
end
474
-- Converts big endian utf-16 lines. A string is first split into lines; a
-- given table is converted in place. Empty entries are left alone. Returns
-- the table, or nil when nothing was passed.

local function utf16_to_utf8_be_t(t)
    if not t then
        return nil
    end
    if type(t) == "string" then
        t = lpegmatch(utf_16_be_linesplitter,t)
    end
    for index=1,#t do
        local line = t[index]
        if line ~= "" then
            t[index] = lpegmatch(p_utf16_to_utf8_be,line)
        end
    end
    return t
end
489
-- Converts a little endian utf-16 string with the pattern defined above; a
-- missing or empty string is returned as-is.

local function utf16_to_utf8_le(s)
    if not s or s == "" then
        return s
    end
    return lpegmatch(p_utf16_to_utf8_le,s)
end
497
-- Converts little endian utf-16 lines. A string is first split into lines; a
-- given table is converted in place. Empty entries are left alone. Returns
-- the table, or nil when nothing was passed.

local function utf16_to_utf8_le_t(t)
    if not t then
        return nil
    end
    if type(t) == "string" then
        t = lpegmatch(utf_16_le_linesplitter,t)
    end
    for index=1,#t do
        local line = t[index]
        if line ~= "" then
            t[index] = lpegmatch(p_utf16_to_utf8_le,line)
        end
    end
    return t
end
512
-- Converts a big endian utf-32 string with the pattern defined above; a
-- missing or empty string is returned as-is.

local function utf32_to_utf8_be(s)
    if not s or s == "" then
        return s
    end
    return lpegmatch(p_utf32_to_utf8_be,s)
end
520
-- Converts big endian utf-32 lines. A string is first split into lines; a
-- given table is converted in place. Empty entries are left alone. Returns
-- the table, or nil when nothing was passed.

local function utf32_to_utf8_be_t(t)
    if not t then
        return nil
    end
    if type(t) == "string" then
        t = lpegmatch(utf_32_be_linesplitter,t)
    end
    for index=1,#t do
        local line = t[index]
        if line ~= "" then
            t[index] = lpegmatch(p_utf32_to_utf8_be,line)
        end
    end
    return t
end
535
-- Converts a little endian utf-32 string with the pattern defined above; a
-- missing or empty string is returned as-is.

local function utf32_to_utf8_le(s)
    if not s or s == "" then
        return s
    end
    return lpegmatch(p_utf32_to_utf8_le,s)
end
543
-- Converts little endian utf-32 lines. A string is first split into lines; a
-- given table is converted in place. Empty entries are left alone. Returns
-- the table, or nil when nothing was passed.

local function utf32_to_utf8_le_t(t)
    if not t then
        return nil
    end
    if type(t) == "string" then
        t = lpegmatch(utf_32_le_linesplitter,t)
    end
    for index=1,#t do
        local line = t[index]
        if line ~= "" then
            t[index] = lpegmatch(p_utf32_to_utf8_le,line)
        end
    end
    return t
end
558
-- string based converters

utf.utf16_to_utf8_be   = utf16_to_utf8_be
utf.utf16_to_utf8_le   = utf16_to_utf8_le
utf.utf32_to_utf8_be   = utf32_to_utf8_be
utf.utf32_to_utf8_le   = utf32_to_utf8_le

-- line (table) based converters

utf.utf16_to_utf8_be_t = utf16_to_utf8_be_t
utf.utf16_to_utf8_le_t = utf16_to_utf8_le_t
utf.utf32_to_utf8_be_t = utf32_to_utf8_be_t
utf.utf32_to_utf8_le_t = utf32_to_utf8_le_t
568
-- Splits a string into lines; anything that is not a string (or a string for
-- which the splitter yields nothing) is returned as-is.

function utf.utf8_to_utf8_t(t)
    if type(t) == "string" then
        local lines = lpegmatch(utflinesplitter,t)
        if lines then
            return lines
        end
    end
    return t
end
572
-- Uses the big endian line converter when endian is set; when that is not
-- the case (or when it yields nothing) the little endian one is used, and
-- when that yields nothing either the argument itself is returned.

function utf.utf16_to_utf8_t(t,endian)
    local result = endian and utf16_to_utf8_be_t(t)
    if not result then
        result = utf16_to_utf8_le_t(t)
    end
    return result or t
end
576
-- Uses the big endian line converter when endian is set; when that is not
-- the case (or when it yields nothing) the little endian one is used, and
-- when that yields nothing either the argument itself is returned.

function utf.utf32_to_utf8_t(t,endian)
    local result = endian and utf32_to_utf8_be_t(t)
    if not result then
        result = utf32_to_utf8_le_t(t)
    end
    return result or t
end
580
do

    -- Splits a value of 0x10000 or more into two 16 bit surrogate units.

    local function surrogates(b)
        local rest = b - 0x10000
        return (rest>>10) + 0xD800, rest%1024 + 0xDC00
    end

    -- Both functions turn one number into two or four bytes; 'little' puts
    -- the low byte of each 16 bit unit first, 'big' the high byte.

    local function little(b)
        if b < 0x10000 then
            return char(b%256,(b>>8))
        end
        local b1, b2 = surrogates(b)
        return char(b1%256,(b1>>8),b2%256,(b2>>8))
    end

    local function big(b)
        if b < 0x10000 then
            return char((b>>8),b%256)
        end
        local b1, b2 = surrogates(b)
        return char((b1>>8),b1%256,(b2>>8),b2%256)
    end

    -- anything that the utf8byte pattern does not accept is dropped

    local l_remap = Cs((p_utf8byte/little+P(1)/"")^0)
    local b_remap = Cs((p_utf8byte/big   +P(1)/"")^0)

    -- The result is prefixed with a byte order mark unless nobom is set.

    local function utf8_to_utf16_be(str,nobom)
        local converted = lpegmatch(b_remap,str)
        if nobom then
            return converted
        end
        return char(254,255) .. converted
    end

    local function utf8_to_utf16_le(str,nobom)
        local converted = lpegmatch(l_remap,str)
        if nobom then
            return converted
        end
        return char(255,254) .. converted
    end

    utf.utf8_to_utf16_be = utf8_to_utf16_be
    utf.utf8_to_utf16_le = utf8_to_utf16_le

    -- Big endian is the default here.

    function utf.utf8_to_utf16(str,littleendian,nobom)
        local convert = littleendian and utf8_to_utf16_le or utf8_to_utf16_be
        return convert(str,nobom)
    end

end
636
-- The first number is formatted as 0xNNNN, each following one gets the
-- separator (passed as extra match argument) in front of it.

local pattern

do

    local function leading(unicode)
        return format("0x%04X",unicode)
    end

    local function following(unicode,separator)
        return format("%s0x%04X",separator,unicode)
    end

    pattern = Cs (
        (p_utf8byte           / leading  ) *
        (p_utf8byte * Carg(1) / following)^0
    )

end

-- Returns the string as a list of hexadecimal numbers; the separator
-- defaults to a space.

function utf.tocodes(str,separator)
    if not separator then
        separator = " "
    end
    return lpegmatch(pattern,str,1,separator)
end
645
-- Formats a number, or the value that utfbyte gives for a non number, as
-- U+ followed by at least five uppercase hexadecimal digits.

function utf.ustring(s)
    local code = s
    if type(s) ~= "number" then
        code = utfbyte(s)
    end
    return format("U+%05X",code)
end
649
-- Formats a number, or the value that utfbyte gives for a non number, as
-- 0x followed by at least five uppercase hexadecimal digits.

function utf.xstring(s)
    local code = s
    if type(s) ~= "number" then
        code = utfbyte(s)
    end
    return format("0x%05X",code)
end
653
-- Returns nil for a missing or empty string. Otherwise the result depends on
-- what the strict type pattern reports: for utf-8 the first three bytes are
-- dropped, utf-16 is converted, and anything else is returned unchanged.

function utf.toeight(str)
    if not str or str == "" then
        return nil
    end
    local utftype = lpegmatch(p_utfstricttype,str)
    if utftype == "utf-8" then
        return sub(str,4)               -- remove the bom
    end
    if utftype == "utf-16-be" then
        return utf16_to_utf8_be(str)    -- bom gets removed
    end
    if utftype == "utf-16-le" then
        return utf16_to_utf8_le(str)    -- bom gets removed
    end
    return str
end
669
670do
671
672    local p_nany = p_utf8character / ""
673    local cache  = { }
674
675    function utf.count(str,what)
676        if type(what) == "string" then
677            local p = cache[what]
678            if not p then
679                p = Cs((P(what)/" " + p_nany)^0)
680                cache[p] = p
681            end
682            return #lpegmatch(p,str)
683        else -- 4 times slower but still faster than / function
684            return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
685        end
686    end
687
688end
689
690utf.values = string.utfvalues
691
-- Maps a number onto a length of 1 upto 6 using the thresholds below; a
-- value of 0xFE or more gives 0.
-- NOTE(review): the thresholds look like utf-8 lead byte values rather than
-- code points -- confirm with the callers.

function utf.chrlen(u) -- u is number
    if u < 0x80 then
        return 1
    elseif u < 0xE0 then
        return 2
    elseif u < 0xF0 then
        return 3
    elseif u < 0xF8 then
        return 4
    elseif u < 0xFC then
        return 5
    elseif u < 0xFE then
        return 6
    else
        return 0
    end
end
701
702-- hashing saves a little but not that much in practice
703--
704-- local utf32 = table.setmetatableindex(function(t,k) local v = toutf32(k) t[k] = v return v end)
705
706-- local function utf.toutf32string(n) -- unused, le or be ...
707--     return char(n&0xFF,(n>>8)&0xFF,(n>>16)&0xFF,(n>>24)&0xFF)
708-- end
709
710-- goodie:
711
-- Pads a string to a width of |n| characters (counted with utflength): a
-- positive n pads at the left, a negative n at the right. The string is
-- returned unchanged when n is nil or zero or when it is already wide enough.
--
-- The optional third argument c is the padding string (default one space).
-- Before, c was not a parameter, so an accidental global with that name was
-- consulted and there was no way to pass something else than a space.

function string.utfpadd(s,n,c)
    if n and n ~= 0 then
        local d = (n > 0 and n or -n) - utflength(s)
        if d > 0 then
            local padding = rep(c or " ",d)
            if n > 0 then
                return padding .. s
            else
                return s .. padding
            end
        end
    end
    return s
end
729
730-- goodies
731
732do
733
734    lpeg.UP = P
735
736    function lpeg.US(str)
737        local p = P(false)
738        for uc in utfcharacters(str) do
739            p = p + P(uc)
740        end
741        return p
742    end
743
744    local range = p_utf8byte * p_utf8byte + Cc(false) -- utf8byte is already a capture
745
746    function lpeg.UR(str,more)
747        local first, last
748        if type(str) == "number" then
749            first = str
750            last = more or first
751        else
752            first, last = lpegmatch(range,str)
753            if not last then
754                return P(str)
755            end
756        end
757        if first == last then
758            return P(str)
759        end
760        if not utfchar then
761            utfchar = utf.char -- maybe delayed
762        end
763        if utfchar and (last - first < 8) then -- a somewhat arbitrary criterium
764            local p = P(false)
765            for i=first,last do
766                p = p + P(utfchar(i))
767            end
768            return p -- nil when invalid range
769        else
770            local f = function(b)
771                return b >= first and b <= last
772            end
773            -- tricky, these nested captures
774            return p_utf8byte / f -- nil when invalid range
775        end
776    end
777
778    -- print(lpeg.match(lpeg.Cs((C(lpeg.UR("αω"))/{ ["χ"] = "OEPS" })^0),"αωχαω"))
779
780end
781