-- l-lpeg.lua / size: 37 Kb / last modification: 2021-10-28 13:50
-- Register the metadata of this module in the global 'modules' table; the
-- table is created first when no other module has done so yet.
if not modules then
    modules = { }
end

modules["l-lpeg"] = {
    version   = 1.001,
    comment   = "companion to luat-lib.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}
8
-- We can get too many captures (e.g. on large xml files), which makes me wonder
-- if P(foo)/"" can't be simplified to N(foo), i.e. some direct instruction to the
-- lpeg virtual machine to ignore it.
12
13-- lpeg 12 vs lpeg 10: slower compilation, similar parsing speed (i need to check
14-- if i can use new features like capture / 2 and .B (at first sight the xml
15-- parser is some 5% slower)
16
17-- lpeg.P("abc") is faster than lpeg.P("a") * lpeg.P("b") * lpeg.P("c")
18
19-- a new lpeg fails on a #(1-P(":")) test and really needs a + P(-1)
20
21-- move utf    -> l-unicode
22-- move string -> l-string or keep it here
23
24-- lpeg.B                                 : backward without consumption
25-- lpeg.F = getmetatable(lpeg.P(1)).__len : forward  without consumption
26
27
28lpeg = require("lpeg") -- does lpeg register itself global?
29
30local lpeg = lpeg
31
-- The latest lpeg doesn't have print any more, and even the new ones are not
-- available by default (only when debug mode is enabled), which is a pity as
-- it helps nailing down bottlenecks. Performance seems comparable: some 10%
-- slower pattern compilation, same parsing speed, although,
36--
37-- local p = lpeg.C(lpeg.P(1)^0 * lpeg.P(-1))
38-- local a = string.rep("123",100)
39-- lpeg.match(p,a)
40--
41-- seems slower and is also still suboptimal (i.e. a match that runs from begin
42-- to end, one of the cases where string matchers win).
43
-- Fallback for lpeg builds without a print function: print whatever
-- lpeg.pcode returns for the given arguments.
if not lpeg.print then
    lpeg.print = function(...)
        print(lpeg.pcode(...))
    end
end
45
46-- tracing (only used when we encounter a problem in integration of lpeg in luatex)
47
48-- some code will move to unicode and string
49
50-- local lpmatch = lpeg.match
51-- local lpprint = lpeg.print
52-- local lpp     = lpeg.P
53-- local lpr     = lpeg.R
54-- local lps     = lpeg.S
55-- local lpc     = lpeg.C
56-- local lpb     = lpeg.B
57-- local lpv     = lpeg.V
58-- local lpcf    = lpeg.Cf
59-- local lpcb    = lpeg.Cb
60-- local lpcg    = lpeg.Cg
61-- local lpct    = lpeg.Ct
62-- local lpcs    = lpeg.Cs
63-- local lpcc    = lpeg.Cc
64-- local lpcmt   = lpeg.Cmt
65-- local lpcarg  = lpeg.Carg
66
67-- function lpeg.match(l,...) print("LPEG MATCH") lpprint(l) return lpmatch(l,...) end
68
69-- function lpeg.P    (l) local p = lpp   (l) print("LPEG P =")    lpprint(l) return p end
70-- function lpeg.R    (l) local p = lpr   (l) print("LPEG R =")    lpprint(l) return p end
71-- function lpeg.S    (l) local p = lps   (l) print("LPEG S =")    lpprint(l) return p end
72-- function lpeg.C    (l) local p = lpc   (l) print("LPEG C =")    lpprint(l) return p end
73-- function lpeg.B    (l) local p = lpb   (l) print("LPEG B =")    lpprint(l) return p end
74-- function lpeg.V    (l) local p = lpv   (l) print("LPEG V =")    lpprint(l) return p end
75-- function lpeg.Cf   (l) local p = lpcf  (l) print("LPEG Cf =")   lpprint(l) return p end
76-- function lpeg.Cb   (l) local p = lpcb  (l) print("LPEG Cb =")   lpprint(l) return p end
77-- function lpeg.Cg   (l) local p = lpcg  (l) print("LPEG Cg =")   lpprint(l) return p end
78-- function lpeg.Ct   (l) local p = lpct  (l) print("LPEG Ct =")   lpprint(l) return p end
79-- function lpeg.Cs   (l) local p = lpcs  (l) print("LPEG Cs =")   lpprint(l) return p end
80-- function lpeg.Cc   (l) local p = lpcc  (l) print("LPEG Cc =")   lpprint(l) return p end
81-- function lpeg.Cmt  (l) local p = lpcmt (l) print("LPEG Cmt =")  lpprint(l) return p end
82-- function lpeg.Carg (l) local p = lpcarg(l) print("LPEG Carg =") lpprint(l) return p end
83
84local type, next, tostring = type, next, tostring
85local byte, char, gmatch, format = string.byte, string.char, string.gmatch, string.format
86----- mod, div = math.mod, math.div
87local floor = math.floor
88
89local P, R, S, V, Ct, C, Cs, Cc, Cp, Cmt = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.Ct, lpeg.C, lpeg.Cs, lpeg.Cc, lpeg.Cp, lpeg.Cmt
90local lpegtype, lpegmatch, lpegprint = lpeg.type, lpeg.match, lpeg.print
91
92-- let's start with an inspector:
93
-- When an inspector registry is available, register a handler that prints
-- lpeg patterns and reports (by returning true) that it handled the value.
if setinspector then
    local function inspectlpeg(v)
        if lpegtype(v) then
            lpegprint(v)
            return true
        end
    end
    setinspector("lpeg",inspectlpeg)
end
97
98-- Beware, we predefine a bunch of patterns here and one reason for doing so
99-- is that we get consistent behaviour in some of the visualizers.
100
-- The registry of predefined patterns is shared: an already existing
-- lpeg.patterns table is reused, otherwise a fresh one is installed.
local patterns = lpeg.patterns
if not patterns then
    patterns      = { }
    lpeg.patterns = patterns
end

local anything      = P(1)    -- exactly one character
local endofstring   = P(-1)   -- succeeds only when no character is left
local alwaysmatched = P(true) -- always succeeds and consumes nothing

patterns.anything      = anything
patterns.endofstring   = endofstring
patterns.alwaysmatched = alwaysmatched
patterns.beginofstring = alwaysmatched -- same pattern under a second name
112
-- Single character classes; most of them are exported through the patterns
-- table further on.
local sign       = S("+-")
local zero       = P("0")
local digit      = R("09")
local octdigit   = R("07")
local lowercase  = R("az")
local uppercase  = R("AZ")
local underscore = P("_")
-- NOTE(review): this accepts every ascii letter, not only a-f and A-F;
-- kept as is because callers may rely on it, to be confirmed.
local hexdigit   = digit + lowercase + uppercase

-- One or more of the above.
local digits     = digit^1
local octdigits  = octdigit^1
local hexdigits  = hexdigit^1

-- Line endings. A newline is "\r\n", a lone "\r" or a lone "\n"
-- (alternative: crlf + S("\r\n")).
local cr      = P("\r")
local lf      = P("\n")
local crlf    = P("\r\n")
local newline = cr * (lf + P(true)) + lf

-- A backslash followed by any character.
local escaped = P("\\") * anything

-- Punctuation.
local squote  = P("'")
local dquote  = P('"')
local space   = P(" ")
local period  = P(".")
local comma   = P(",")
133
-- Byte order marks. In every choice below the 32 bit marks come before the
-- 16 bit ones because FF FE is also the start of FF FE 00 00.
local utfbom_32_be  = P("\000\000\254\255") -- 00 00 FE FF
local utfbom_32_le  = P("\255\254\000\000") -- FF FE 00 00
local utfbom_16_be  = P("\254\255")         -- FE FF
local utfbom_16_le  = P("\255\254")         -- FF FE
local utfbom_8      = P("\239\187\191")     -- EF BB BF

-- Any byte order mark.
local utfbom        = utfbom_32_be + utfbom_32_le + utfbom_16_be + utfbom_16_le + utfbom_8

-- Encoding name for the mark found; fails when there is no mark.
local utfstricttype = utfbom_32_be * Cc("utf-32-be")
                    + utfbom_32_le * Cc("utf-32-le")
                    + utfbom_16_be * Cc("utf-16-be")
                    + utfbom_16_le * Cc("utf-16-le")
                    + utfbom_8     * Cc("utf-8")

-- Same, but without a mark we assume utf8.
local utftype       = utfstricttype + alwaysmatched * Cc("utf-8")

-- Length in bytes of the mark found, 0 when there is none.
local utfoffset     = (utfbom_32_be + utfbom_32_le) * Cc(4)
                    + (utfbom_16_be + utfbom_16_le) * Cc(2)
                    + utfbom_8 * Cc(3)
                    + Cc(0)
151
-- A utf8 continuation byte.
local utf8next = R("\128\191")

-- Export the byte order marks and the derived patterns.
patterns.utfbom_32_be  = utfbom_32_be
patterns.utfbom_32_le  = utfbom_32_le
patterns.utfbom_16_be  = utfbom_16_be
patterns.utfbom_16_le  = utfbom_16_le
patterns.utfbom_8      = utfbom_8
patterns.utfbom        = utfbom
patterns.utftype       = utftype
patterns.utfstricttype = utfstricttype
patterns.utfoffset     = utfoffset

-- Line endings (cr lf, cr, lf) as they show up in utf16 and utf32 input.
patterns.utf_16_be_nl  = P("\000\r\000\n") + P("\000\r") + P("\000\n")
patterns.utf_16_le_nl  = P("\r\000\n\000") + P("\r\000") + P("\n\000")
patterns.utf_32_be_nl  = P("\000\000\000\r\000\000\000\n") + P("\000\000\000\r") + P("\000\000\000\n")
patterns.utf_32_le_nl  = P("\r\000\000\000\n\000\000\000") + P("\r\000\000\000") + P("\n\000\000\000")

-- One utf8 character, by sequence length: the lead byte range selects the
-- number of continuation bytes.
local utf8char
do
    local one   = R("\000\127")
    local two   = R("\194\223") * utf8next
    local three = R("\224\239") * utf8next * utf8next
    local four  = R("\240\244") * utf8next * utf8next * utf8next

    patterns.utf8one   = one
    patterns.utf8two   = two
    patterns.utf8three = three
    patterns.utf8four  = four

    utf8char = one + two + three + four
end

-- Produces true when the whole subject consists of utf8char matches and
-- false otherwise.
local validutf8char = utf8char^0 * endofstring * Cc(true) + Cc(false)

-- Unchecked but fast: any byte followed by continuation bytes.
local utf8character = anything * utf8next^0

patterns.utf8          = utf8char
patterns.utf8char      = utf8char
patterns.utf8character = utf8character -- this one can be used in most cases
patterns.validutf8     = validutf8char
patterns.validutf8char = validutf8char
185
-- Line-end and blank character classes plus their complements.
local eol           = S("\n\r")
local spacer        = S(" \t\f\v") -- + char(0xc2, 0xa0) if we want utf (cf mail roberto)
local whitespace    = eol + spacer
local nonspacer     = anything - spacer
local nonwhitespace = anything - whitespace

patterns.eol           = eol
patterns.spacer        = spacer
patterns.whitespace    = whitespace
patterns.nonspacer     = nonspacer
patterns.nonwhitespace = nonwhitespace

-- Skip leading blanks and capture up to the last non-blank (after an
-- example by roberto); 'full' also treats line ends as blanks.
local stripper     = spacer^0     * C((spacer^0     * nonspacer^1)^0)
local fullstripper = whitespace^0 * C((whitespace^0 * nonwhitespace^1)^0)

-- Substitution captures: collapser removes leading spacers and puts a
-- single space in front of each following word, nospacer removes all
-- whitespace.
local collapser    = Cs((spacer^0/"") * nonspacer^0 * (((spacer^0/" ") * nonspacer^1)^0))
local nospacer     = Cs(((whitespace^1/"") + nonwhitespace^1)^0)

-- Four variants per blank class:
--   b_ : leading blanks removed, other runs become one space
--   m_ : every run becomes one space
--   e_ : a run at the end of the string is removed, other runs become one space
--   x_ : every run is removed
local b_collapser, m_collapser, e_collapser, x_collapser
local b_stripper,  m_stripper,  e_stripper,  x_stripper

do
    local function variants(blank,nonblank)
        local squeeze = nonblank^1 + blank^1/" "
        local b = Cs((blank^0/"") * squeeze^0)
        local m = Cs(squeeze^0)
        local e = Cs(((blank^1 * endofstring)/"" + squeeze)^0)
        local x = Cs((nonblank^1 + blank^1/"")^0)
        return b, m, e, x
    end

    b_collapser, m_collapser, e_collapser, x_collapser = variants(whitespace,nonwhitespace)
    b_stripper,  m_stripper,  e_stripper,  x_stripper  = variants(spacer,nonspacer)
end

patterns.stripper      = stripper
patterns.fullstripper  = fullstripper
patterns.collapser     = collapser
patterns.nospacer      = nospacer

patterns.b_collapser   = b_collapser
patterns.m_collapser   = m_collapser
patterns.e_collapser   = e_collapser
patterns.x_collapser   = x_collapser

patterns.b_stripper    = b_stripper
patterns.m_stripper    = m_stripper
patterns.e_stripper    = e_stripper
patterns.x_stripper    = x_stripper
229
-- Named building blocks. The helpers live in a do-block so that they do not
-- add to the number of file level locals.
do
    -- letters, blanks and punctuation
    local tab = P("\t")

    patterns.lowercase   = lowercase
    patterns.uppercase   = uppercase
    patterns.letter      = lowercase + uppercase
    patterns.space       = space
    patterns.tab         = tab
    patterns.spaceortab  = space + tab
    patterns.newline     = newline
    patterns.emptyline   = newline^1
    patterns.equal       = P("=")
    patterns.comma       = comma
    patterns.commaspacer = comma * spacer^0
    patterns.period      = period
    patterns.colon       = P(":")
    patterns.semicolon   = P(";")
    patterns.underscore  = underscore
    patterns.escaped     = escaped
    patterns.squote      = squote
    patterns.dquote      = dquote

    -- quoted strings: the 'no' variants match the content, the 'un'
    -- variants are meant for substitution captures and drop the quotes
    local nosquote     = (escaped + (1 - squote))^0
    local nodquote     = (escaped + (1 - dquote))^0
    local unsingle     = (squote/"") * nosquote * (squote/"") -- will change to C in the middle
    local undouble     = (dquote/"") * nodquote * (dquote/"") -- will change to C in the middle
    local singlequoted = squote * nosquote * squote
    local doublequoted = dquote * nodquote * dquote
    local quoted       = doublequoted + singlequoted

    patterns.nosquote     = nosquote
    patterns.nodquote     = nodquote
    patterns.unsingle     = unsingle
    patterns.undouble     = undouble
    patterns.unquoted     = undouble + unsingle -- more often undouble
    patterns.unspacer     = ((spacer^1)/"")^0
    patterns.singlequoted = singlequoted
    patterns.doublequoted = doublequoted
    patterns.quoted       = quoted

    -- digits
    patterns.digit     = digit
    patterns.digits    = digits
    patterns.octdigit  = octdigit
    patterns.octdigits = octdigits
    patterns.hexdigit  = hexdigit
    patterns.hexdigits = hexdigits
    patterns.sign      = sign

    -- decimal numbers; the c variants use a comma as separator and the cp
    -- variants accept a period as well as a comma
    local optsign    = sign^-1
    local integer    = optsign * digits
    local unsigned   = digit^0 * period * digits
    local cunsigned  = digit^0 * comma * digits
    local cpunsigned = digit^0 * (period + comma) * digits
    local float      = optsign * unsigned
    local cfloat     = optsign * cunsigned
    local cpfloat    = optsign * cpunsigned

    patterns.cardinal   = digits
    patterns.integer    = integer
    patterns.unsigned   = unsigned
    patterns.float      = float
    patterns.cunsigned  = cunsigned
    patterns.cpunsigned = cpunsigned
    patterns.cfloat     = cfloat
    patterns.cpfloat    = cpfloat
    patterns.number     = float + integer
    patterns.cnumber    = cfloat + integer
    patterns.cpnumber   = cpfloat + integer

    -- octal and hexadecimal numbers
    local oct = zero * octdigits -- hm is this ok

    patterns.oct         = oct
    patterns.octal       = oct
    patterns.HEX         = zero * P("X") * (digit + uppercase)^1
    patterns.hex         = zero * P("x") * (digit + lowercase)^1
    patterns.hexadecimal = zero * S("xX") * hexdigits

    -- floats: hexadecimal with an optional p exponent, decimal with a
    -- mandatory e exponent
    local hexmantissa = hexdigit^0 * period * hexdigits + hexdigits * period * hexdigit^0 + hexdigits
    local decmantissa = digit^0 * period * digits + digits * period * digit^0 + digits

    patterns.hexafloat = optsign * zero * S("xX") * hexmantissa * (S("pP") * optsign * hexdigits)^-1
    patterns.decafloat = optsign * decmantissa * S("eE") * optsign * digits

    -- an identifier that spans the whole (remaining) subject
    local namestart = uppercase + lowercase + underscore

    patterns.propername = namestart * (namestart + digit)^0 * endofstring

    patterns.somecontent = (anything - newline - space)^1 -- (utf8char - newline - space)^1
    patterns.beginline   = #(1 - newline)

    -- substitution capture: quoted parts are kept, whitespace runs become
    -- one space except at the begin and end where they are removed
    patterns.longtostring = Cs((whitespace^0/"") * ((quoted + nonwhitespace^1 + (whitespace^1/"") * (endofstring + Cc(" ")))^0))
end
297
298-- local function anywhere(pattern) -- slightly adapted from website
299--     return P { P(pattern) + 1 * V(1) }
300-- end
301
-- Returns a pattern that skips characters until 'pattern' matches and then
-- matches it (a faster variant of the grammar based version above).
local function anywhere(pattern)
    local target = P(pattern)
    return (1 - target)^0 * target
end

lpeg.anywhere = anywhere
307
-- Returns a function that tells (as a boolean) if 'p' occurs somewhere in
-- the given string.
function lpeg.instringchecker(p)
    local finder = anywhere(p)
    return function(str)
        if lpegmatch(finder,str) then
            return true
        end
        return false
    end
end
314
315-- function lpeg.splitter(pattern, action)
316--     return (((1-P(pattern))^1)/action+1)^0
317-- end
318
319-- function lpeg.tsplitter(pattern, action)
320--     return Ct((((1-P(pattern))^1)/action+1)^0)
321-- end
322
-- Returns a pattern that walks over the subject and handles each run of
-- characters that does not start a 'pattern' match: with an action the run
-- is passed on via a '/' capture, without one it is captured with Cs.
function lpeg.splitter(pattern, action)
    local chunk = (1 - P(pattern))^1
    if action then
        chunk = chunk / action
    else
        chunk = Cs(chunk)
    end
    return (chunk + 1)^0
end
330
-- Same as lpeg.splitter but the captures are collected in a table.
function lpeg.tsplitter(pattern, action)
    local chunk = (1 - P(pattern))^1
    if action then
        chunk = chunk / action
    else
        chunk = Cs(chunk)
    end
    return Ct((chunk + 1)^0)
end
338
-- problem: the separator can be an lpeg pattern and that does not hash too
-- well, but it's quite okay as the key is then not garbage collected
341
-- Caches for the single, multi and table splitters, keyed by the separator
-- as passed by the caller (a string or a pattern).
local splitters_s, splitters_m, splitters_t = { }, { }, { }

-- Returns a (cached) pattern that splits at 'separator'.
--
-- separator : string or lpeg pattern
-- single    : when true, only split at the first separator, so at most two
--             captures result; otherwise split at every separator
--
-- The splitter is stored under the separator that the caller passed, so a
-- next call with the same string finds it again. The cache is selected by
-- 'single' alone, so a cached multi splitter is never returned for a single
-- request.
local function splitat(separator,single)
    local store    = single and splitters_s or splitters_m
    local splitter = store[separator]
    if not splitter then
        local sep   = P(separator)
        local other = C((1 - sep)^0)
        if single then
            -- the "" alternative accepts a subject without separator
            splitter = other * (sep * C(anything^0) + "")
        else
            splitter = other * (sep * other)^0
        end
        store[separator] = splitter
    end
    return splitter
end
360
-- Returns a (cached) pattern that splits at every 'separator' and collects
-- the parts in a table.
local function tsplitat(separator)
    local found = splitters_t[separator]
    if found then
        return found
    end
    local made = Ct(splitat(separator))
    splitters_t[separator] = made
    return made
end

lpeg.splitat  = splitat
lpeg.tsplitat = tsplitat
372
-- Splits 'str' at every 'separator' (a comma when none is given) and
-- returns the parts as multiple values.
function string.splitup(str,separator)
    separator = separator or ","
    local splitter = splitters_m[separator] or splitat(separator)
    return lpegmatch(splitter,str)
end
379
380-- local p = splitat("->",false)  print(lpegmatch(p,"oeps->what->more"))  -- oeps what more
381-- local p = splitat("->",true)   print(lpegmatch(p,"oeps->what->more"))  -- oeps what->more
382-- local p = splitat("->",false)  print(lpegmatch(p,"oeps"))              -- oeps
383-- local p = splitat("->",true)   print(lpegmatch(p,"oeps"))              -- oeps
384
-- Table splitters by separator, shared by lpeg.split and string.split.
local cache = { }

-- Splits 'str' at every 'separator' and returns the parts in a table.
function lpeg.split(separator,str)
    local splitter = cache[separator]
    if splitter then
        return lpegmatch(splitter,str)
    end
    splitter = tsplitat(separator)
    cache[separator] = splitter
    return lpegmatch(splitter,str)
end
395
-- Splits 'str' at every 'separator' and returns the parts in a table;
-- without separator the result is a table with just 'str'.
function string.split(str,separator)
    if not separator then
        return { str }
    end
    local splitter = cache[separator]
    if not splitter then
        splitter = tsplitat(separator)
        cache[separator] = splitter
    end
    return lpegmatch(splitter,str)
end
408
-- Text lines: 'spacing' is a line end with the spacers in front of it (sort
-- of a strip), an empty line gives "" and a non empty one gives its content
-- up to that spacing.
local spacing  = spacer^0 * newline
local empty    = spacing * Cc("")
local nonempty = Cs((1 - spacing)^1) * spacing^-1
local content  = (empty + nonempty)^1

patterns.textline = content

-- Splits at each newline and collects the lines in a table.
local linesplitter = tsplitat(newline)

patterns.linesplitter = linesplitter

-- Returns a table with the lines of 'str'.
function string.splitlines(str)
    return lpegmatch(linesplitter,str)
end
423
424-- lpeg.splitters = cache -- no longer public
425
-- Checked splitters by separator, shared by lpeg.checkedsplit and
-- string.checkedsplit.
local cache = { }

-- Splits 'str' at runs of 'separator' and returns the non empty parts in a
-- table; leading separators are skipped. The match (and so the result) is
-- nil when there is no non empty part at all.
--
-- The splitter is cached under the separator as passed by the caller, so a
-- string separator is found again on a next call instead of being rebuilt
-- and stored under a fresh pattern object each time.
function lpeg.checkedsplit(separator,str)
    local c = cache[separator]
    if not c then
        local sep   = P(separator)
        local other = C((1 - sep)^1)
        c = Ct(sep^0 * other * (sep^1 * other)^0)
        cache[separator] = c
    end
    return lpegmatch(c,str)
end
438
-- Splits 'str' at runs of 'separator' and returns the non empty parts in a
-- table; leading separators are skipped. The match (and so the result) is
-- nil when there is no non empty part at all.
--
-- The splitter is cached under the separator as passed by the caller, so a
-- string separator is found again on a next call instead of being rebuilt
-- and stored under a fresh pattern object each time.
function string.checkedsplit(str,separator)
    local c = cache[separator]
    if not c then
        local sep   = P(separator)
        local other = C((1 - sep)^1)
        c = Ct(sep^0 * other * (sep^1 * other)^0)
        cache[separator] = c
    end
    return lpegmatch(c,str)
end
449
450-- from roberto's site:
451
-- Convert a two, three or four byte utf8 sequence into its code point. Each
-- byte is weighted by a power of 64 and the subtracted constant removes the
-- marker bits of the lead byte and the continuation bytes:
--
--    12416 = 0xC0 * 64    + 0x80
--   925824 = 0xE0 * 64^2  + 0x80 * 64   + 0x80
-- 63447168 = 0xF0 * 64^3  + 0x80 * 64^2 + 0x80 * 64 + 0x80
local function f2(s)
    local b1, b2 = byte(s,1,2)
    return b1 * 64 + b2 - 12416
end

local function f3(s)
    local b1, b2, b3 = byte(s,1,3)
    return b1 * 4096 + b2 * 64 + b3 - 925824
end

local function f4(s)
    local b1, b2, b3, b4 = byte(s,1,4)
    return b1 * 262144 + b2 * 4096 + b3 * 64 + b4 - 63447168
end
455
-- Matches one utf8 character and produces its code point; the converter is
-- chosen by the length of the sequence.
local utf8byte = patterns.utf8one   / byte
               + patterns.utf8two   / f2
               + patterns.utf8three / f3
               + patterns.utf8four  / f4

patterns.utf8byte = utf8byte
459
460--~ local str = " a b c d "
461
462--~ local s = lpeg.stripper(lpeg.R("az"))   print("["..lpegmatch(s,str).."]")
463--~ local s = lpeg.keeper(lpeg.R("az"))     print("["..lpegmatch(s,str).."]")
464--~ local s = lpeg.stripper("ab")           print("["..lpegmatch(s,str).."]")
465--~ local s = lpeg.keeper("ab")             print("["..lpegmatch(s,str).."]")
466
-- Strippers made from a string, by that string.
local cache = { }

-- Returns a substitution pattern that removes what 'str' describes: a
-- string is taken as a set of characters (and the result is cached),
-- anything else is used as a pattern.
function lpeg.stripper(str)
    if type(str) ~= "string" then
        return Cs(((str^1)/"" + 1)^0)
    end
    local s = cache[str]
    if not s then
        s = Cs(((S(str)^1)/"" + 1)^0)
        cache[str] = s
    end
    return s
end
481
-- Keepers made from a string, by that string.
local cache = { }

-- Returns a substitution pattern that removes everything but what 'str'
-- describes: a string is taken as a set of characters (and the result is
-- cached), anything else is used as a pattern.
function lpeg.keeper(str)
    if type(str) ~= "string" then
        return Cs((((1-str)^1)/"" + 1)^0)
    end
    local s = cache[str]
    if not s then
        s = Cs((((1-S(str))^1)/"" + 1)^0)
        cache[str] = s
    end
    return s
end
496
-- Returns a pattern that skips 'str' (a string or pattern, yet
-- undocumented) when the subject starts with it and captures the rest.
function lpeg.frontstripper(str)
    local prefix = P(str) + alwaysmatched
    return prefix * Cs(anything^0)
end
500
-- Returns a pattern that captures the subject up to a 'str' (a string or
-- pattern, yet undocumented) that sits at the very end of it.
function lpeg.endstripper(str)
    local tail = P(str) * endofstring
    return Cs((1 - tail)^0)
end
504
505-- Just for fun I looked at the used bytecode and
506-- p = (p and p + pp) or pp gets one more (testset).
507
508-- todo: cache when string
509
-- Returns a substitution pattern (or, with 'makefunction', a function that
-- applies it to a string).
--
-- one          : what to replace; a string or pattern (replaced by 'two' or
--                by "" when 'two' is not given), a hash of key/replacement
--                pairs, or an indexed list of { what, replacement } pairs
-- two          : the replacement when 'one' is not a table
-- makefunction : return a function instead of a pattern
-- isutf        : step over utf8 characters instead of single bytes
--
-- In principle we should sort the keys but we have a better one anyway.
function lpeg.replacer(one,two,makefunction,isutf)
    local step = isutf and utf8char or 1
    local choice
    if type(one) == "table" then
        local count = #one
        if count == 1 then
            local entry = one[1]
            choice = P(entry[1]) / entry[2]
        else
            choice = P(false)
            if count == 0 then
                -- a hash: keys are replaced by their values
                for key, value in next, one do
                    choice = choice + P(key) / value
                end
            else
                for i=1,count do
                    local entry = one[i]
                    choice = choice + P(entry[1]) / entry[2]
                end
            end
        end
    else
        choice = P(one) / (two or "")
    end
    local pattern = Cs((choice + step)^0)
    if not makefunction then
        return pattern
    end
    return function(str)
        return lpegmatch(pattern,str)
    end
end
544
545-- local pattern1 = P(1-P(pattern))^0 * P(pattern)   : test for not nil
546-- local pattern2 = (P(pattern) * Cc(true) + P(1))^0 : test for true (could be faster, but not much)
547
-- Returns a pattern (or, with 'makefunction', a function that applies it to
-- a string) that skips up to and then matches one of the targets.
--
-- lst          : a string or pattern, an indexed list of them, or a hash of
--                which only the keys are used (so a replacer table works)
-- makefunction : return a function instead of a pattern
-- isutf        : skip utf8 characters instead of single bytes
--
-- Beware: slower than find with 'patternless finds'.
function lpeg.finder(lst,makefunction,isutf)
    local target
    if type(lst) ~= "table" then
        target = P(lst)
    else
        target = P(false)
        local count = #lst
        if count == 0 then
            for key in next, lst do
                target = target + P(key)
            end
        else
            for i=1,count do
                target = target + P(lst[i])
            end
        end
    end
    local step    = isutf and (utf8char or 1) or 1
    local pattern = (step - target)^0 * target
    if not makefunction then
        return pattern
    end
    return function(str)
        return lpegmatch(pattern,str)
    end
end
577
578-- print(lpeg.match(lpeg.replacer("e","a"),"test test"))
579-- print(lpeg.match(lpeg.replacer{{"e","a"}},"test test"))
580-- print(lpeg.match(lpeg.replacer({ e = "a", t = "x" }),"test test"))
581
-- Caches for firstofsplit and secondofsplit, by separator.
local splitters_f, splitters_s = { }, { }

-- Returns a (cached) pattern that captures what comes before the first
-- 'separator'; it always gives a value.
function lpeg.firstofsplit(separator)
    local found = splitters_f[separator]
    if found then
        return found
    end
    local made = C((1 - P(separator))^0)
    splitters_f[separator] = made
    return made
end
593
-- Returns a (cached) pattern that captures what comes after the first
-- 'separator'; the match fails (nil) when there is no separator.
function lpeg.secondofsplit(separator)
    local found = splitters_s[separator]
    if found then
        return found
    end
    local sep  = P(separator)
    local made = (1 - sep)^0 * sep * C(anything^0)
    splitters_s[separator] = made
    return made
end
603
-- Caches for beforesuffix and afterprefix, by separator.
local splitters_s, splitters_p = { }, { }

-- Returns a (cached) pattern that captures what comes before a 'separator'
-- that ends the subject; nil if nothing, but empty is ok.
function lpeg.beforesuffix(separator)
    local found = splitters_s[separator]
    if found then
        return found
    end
    local sep  = P(separator)
    local made = C((1 - sep)^0) * sep * endofstring
    splitters_s[separator] = made
    return made
end
615
-- Returns a (cached) pattern that captures what comes after a 'separator'
-- that starts the subject; nil if nothing, but empty is ok.
function lpeg.afterprefix(separator)
    local found = splitters_p[separator]
    if found then
        return found
    end
    local made = P(separator) * C(anything^0)
    splitters_p[separator] = made
    return made
end
625
-- Returns a grammar that matches a 'left' ... 'right' pair with properly
-- nested pairs in between.
function lpeg.balancer(left,right)
    local open, close = P(left), P(right)
    local filler = 1 - open - close
    return P { open * (filler + V(1))^0 * close }
end
630
631-- print(1,lpegmatch(lpeg.firstofsplit(":"),"bc:de"))
632-- print(2,lpegmatch(lpeg.firstofsplit(":"),":de")) -- empty
633-- print(3,lpegmatch(lpeg.firstofsplit(":"),"bc"))
634-- print(4,lpegmatch(lpeg.secondofsplit(":"),"bc:de"))
635-- print(5,lpegmatch(lpeg.secondofsplit(":"),"bc:")) -- empty
636-- print(6,lpegmatch(lpeg.secondofsplit(":",""),"bc"))
637-- print(7,lpegmatch(lpeg.secondofsplit(":"),"bc"))
638-- print(9,lpegmatch(lpeg.secondofsplit(":","123"),"bc"))
639
640-- this was slower but lpeg has been sped up in the meantime, so we no longer
641-- use this (still seems somewhat faster on long strings)
642--
643-- local nany = utf8char/""
644--
645-- function lpeg.counter(pattern)
646--     pattern = Cs((P(pattern)/" " + nany)^0)
647--     return function(str)
648--         return #lpegmatch(pattern,str)
649--     end
650-- end
651
-- Returns a function that counts the matches of 'pattern' in a string. With
-- an action the count is passed to that action (and nothing is returned),
-- otherwise the count is returned. The count lives in an upvalue that is
-- reset at each call.
function lpeg.counter(pattern,action)
    local hits = 0
    local function bump()
        hits = hits + 1
    end
    local scanner = (P(pattern) / bump + anything)^0
    local function count(str)
        hits = 0
        lpegmatch(scanner,str)
        return hits
    end
    if action then
        return function(str)
            action(count(str))
        end
    end
    return count
end
663
664-- lpeg.print(lpeg.R("ab","cd","gh"))
665-- lpeg.print(lpeg.P("a","b","c"))
666-- lpeg.print(lpeg.S("a","b","c"))
667
668-- print(lpeg.counter(lpeg.P("á") + lpeg.P("à"))("äáàa"))
669-- print(lpeg.counter(lpeg.UP("áà"))("äáàa"))
670-- print(lpeg.counter(lpeg.US("àá"))("äáàa"))
671-- print(lpeg.counter(lpeg.UR("aá"))("äáàa"))
672-- print(lpeg.counter(lpeg.UR("àá"))("äáàa"))
673-- print(lpeg.counter(lpeg.UR(0x0000,0xFFFF)))
674
-- Tells if 'p' is an lpeg pattern; a nil or false 'p' is returned as is.
function lpeg.is_lpeg(p)
    if not p then
        return p
    end
    return lpegtype(p) == "pattern"
end
678
-- Returns an ordered choice of the given alternatives, passed as a table or
-- as separate arguments: lpeg.oneof("elseif","else","if","then"). The order
-- is kept as given (no sorting, so no longest match first): we assume a
-- proper order.
function lpeg.oneof(list,...)
    local alternatives = list
    if type(alternatives) ~= "table" then
        alternatives = { list, ... }
    end
    local choice = P(alternatives[1])
    for i=2,#alternatives do
        choice = choice + P(alternatives[i])
    end
    return choice
end
690
-- For the moment here, but it might move to utilities. Beware, we need to
-- have the longest keyword first, so 'aaa' comes before 'aa', which is why we
-- loop back from the end cq. prepend.
694
695local sort = table.sort
696
-- Returns a new table with the indexed entries 1..#old of 'old', so that the
-- copy can be sorted without touching the original. Each slot gets the
-- entry itself (old[i]); storing 'old' in every slot gives a list of tables
-- that cannot be sorted.
local function copyindexed(old)
    local new = { }
    for i=1,#old do
        new[i] = old[i]
    end
    return new
end
704
-- Returns the keys of 'tab' as a sorted list.
local function sortedkeys(tab)
    local keys = { }
    for key in next, tab do
        keys[#keys+1] = key
    end
    sort(keys)
    return keys
end
714
-- Builds an ordered choice from the entries of 'list' and puts it in front
-- of 'pp'.
--
-- list    : an indexed list of strings, or a hash that maps strings onto
--           replacements
-- pp      : optional pattern that becomes the last alternative
-- delayed : (hash only) resolve the replacement with a '/ list' table
--           capture instead of a capture per key
-- checked : (hash only) leave out the replacement capture for entries where
--           key and value are the same
--
-- Returns the resulting pattern, or 'pp' when the list is empty.
--
-- The keys are sorted and each one is put in front of what we have so far,
-- so the choice tries them in reverse sorted order: 'aaa' comes before 'aa'
-- and 'aa' before 'a'. The indexed branch walks the keys in the same
-- direction as the hash branches; walking them backwards while prepending
-- would put the shortest keyword first.
function lpeg.append(list,pp,delayed,checked)
    local p = pp
    -- puts 'new' in front of what we have so far
    local function prepend(new)
        if p then
            p = new + p
        else
            p = new
        end
    end
    if #list > 0 then
        local keys = copyindexed(list)
        sort(keys)
        for i=1,#keys do
            prepend(P(keys[i]))
        end
    elseif delayed then -- hm, it looks like the lpeg parser resolves anyway
        local keys = sortedkeys(list)
        if p then
            -- each key gets its own lookup so that 'pp' stays untouched
            for i=1,#keys do
                prepend(P(keys[i])/list)
            end
        else
            -- one lookup for the whole choice
            for i=1,#keys do
                prepend(P(keys[i]))
            end
            if p then
                p = p / list
            end
        end
    elseif checked then
        -- problem: substitution gives a capture
        local keys = sortedkeys(list)
        for i=1,#keys do
            local k = keys[i]
            local v = list[k]
            if k == v then
                prepend(P(k))
            else
                prepend(P(k)/v)
            end
        end
    else
        local keys = sortedkeys(list)
        for i=1,#keys do
            local k = keys[i]
            prepend(P(k)/list[k])
        end
    end
    return p
end
784
785-- inspect(lpeg.append({ a = "1", aa = "1", aaa = "1" } ,nil,true))
786-- inspect(lpeg.append({ ["degree celsius"] = "1", celsius = "1", degree = "1" } ,nil,true))
787
788-- function lpeg.exact_match(words,case_insensitive)
789--     local pattern = concat(words)
790--     if case_insensitive then
791--         local pattern = S(upper(characters)) + S(lower(characters))
792--         local list = { }
793--         for i=1,#words do
794--             list[lower(words[i])] = true
795--         end
796--         return Cmt(pattern^1, function(_,i,s)
797--             return list[lower(s)] and i
798--         end)
799--     else
800--         local pattern = S(concat(words))
801--         local list = { }
802--         for i=1,#words do
803--             list[words[i]] = true
804--         end
805--         return Cmt(pattern^1, function(_,i,s)
806--             return list[s] and i
807--         end)
808--     end
809-- end
810
811-- experiment:
812
-- The pattern that never matches and the one that always matches.
local p_false, p_true = P(false), P(true)
815
816-- local function collapse(t,x)
817--     if type(t) ~= "table" then
818--         return t, x
819--     else
820--         local n = next(t)
821--         if n == nil then
822--             return t, x
823--         elseif next(t,n) == nil then
824--             -- one entry
825--             local k = n
826--             local v = t[k]
827--             if type(v) == "table" then
828--                 return collapse(v,x..k)
829--             else
830--                 return v, x .. k
831--             end
832--         else
833--             local tt = { }
834--             for k, v in next, t do
835--                 local vv, kk = collapse(v,k)
836--                 tt[kk] = vv
837--             end
838--             return tt, x
839--         end
840--     end
841-- end
842
-- Case converters: the ones from the utf table when present, else the
-- string library ones.
local lower, upper = string.lower, string.upper

if utf then
    lower = utf.lower or lower
    upper = utf.upper or upper
end

-- Replaces the case converters; an argument that is not given leaves the
-- current converter in place.
function lpeg.setutfcasers(l,u)
    if l then
        lower = l
    end
    if u then
        upper = u
    end
end
850
-- Turns the tree 't' into a pattern: each key (in sorted order) is an
-- alternative, followed by the pattern for its subtree when the value is a
-- table. The "" key is skipped here; with 'rest' set, matching nothing more
-- is accepted as the last alternative.
local function make1(t,rest)
    local choice = p_false
    local keys   = sortedkeys(t)
    for i=1,#keys do
        local key   = keys[i]
        local value = key ~= "" and t[key]
        if value == true then
            choice = choice + P(key) * p_true
        elseif value then
            choice = choice + P(key) * make1(value,value[""])
        end
        -- a false value can't happen
    end
    if rest then
        choice = choice + p_true
    end
    return choice
end
872
-- Same as make1 but each key matches in its lowercase as well as in its
-- uppercase form (only ascii).
local function make2(t,rest)
    local choice = p_false
    local keys   = sortedkeys(t)
    for i=1,#keys do
        local key   = keys[i]
        local value = key ~= "" and t[key]
        if value == true then
            choice = choice + (P(lower(key)) + P(upper(key))) * p_true
        elseif value then
            choice = choice + (P(lower(key)) + P(upper(key))) * make2(value,value[""])
        end
        -- a false value can't happen
    end
    if rest then
        choice = choice + p_true
    end
    return choice
end
894
-- Builds one pattern that matches any of the given strings. The strings are
-- taken from the indexed part of list or, when that part is empty (#list == 0),
-- from the keys of list. They are first merged into a tree that is keyed by the
-- single characters delivered by gmatch(s,"."):
--
--   true  : a string ends at this character
--   table : longer strings continue here and [""] = true flags that a string
--           ends at this node too
--   false : temporary marker, only present while a string is being added
--
-- The tree is converted by make2 when insensitive is set and by make1 otherwise.
local function utfchartabletopattern(list,insensitive) -- goes to util-lpg
    local tree = { }
    -- Adds one string to the tree; shared by both kinds of list.
    local function add(s)
        local t = tree
        local p, pk -- the parent node and the key that leads from it to t
        for c in gmatch(s,".") do
            if t == true then
                -- a shorter string ended here: turn the leaf into a node and
                -- remember that ending with the "" marker
                t = { [c] = true, [""] = true }
                p[pk] = t
                p = t
                t = false
            elseif t == false then
                -- the previous character was new: give it a node of its own
                t = { [c] = false }
                p[pk] = t
                p = t
                t = false
            else
                local tc = t[c]
                if not tc then
                    tc = false
                    t[c] = false
                end
                p = t
                t = tc
            end
            pk = c
        end
        if t == false then
            p[pk] = true
        elseif t == true then
            -- okay, the string was already there
        else
            -- the string is a prefix of a longer one (or is empty)
            t[""] = true
        end
    end
    local n = #list
    if n == 0 then
        for s in next, list do
            add(s)
        end
    else
        for i=1,n do
            add(list[i])
        end
    end
 -- collapse(tree,"") -- needs testing, maybe optional, slightly faster because P("x")*P("X") seems slower than P("xX") (why)
 -- inspect(tree)
    return (insensitive and make2 or make1)(tree)
end
972
lpeg.utfchartabletopattern = utfchartabletopattern

-- Returns a function that runs a substitution over a string: every match of
-- one of the strings known to list (see utfchartabletopattern) is looked up in
-- list, anything else is skipped with utf8character. When the match delivers
-- nothing the string is returned as it was passed.
function lpeg.utfreplacer(list,insensitive)
    local known    = utfchartabletopattern(list,insensitive) / list
    local replacer = Cs((known + utf8character)^0)
    return function(str)
        local result = lpegmatch(replacer,str)
        if result then
            return result
        end
        return str
    end
end
981
982-- local t = { "start", "stoep", "staart", "paard" }
983-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/string.upper + 1)^1)
984
985-- local t = { "a", "abc", "ac", "abe", "abxyz", "xy", "bef","aa" }
986-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/string.upper + 1)^1)
987
988-- inspect(lpegmatch(p,"a")=="A")
989-- inspect(lpegmatch(p,"aa")=="AA")
990-- inspect(lpegmatch(p,"aaaa")=="AAAA")
991-- inspect(lpegmatch(p,"ac")=="AC")
992-- inspect(lpegmatch(p,"bc")=="bc")
993-- inspect(lpegmatch(p,"zzbczz")=="zzbczz")
994-- inspect(lpegmatch(p,"zzabezz")=="zzABEzz")
995-- inspect(lpegmatch(p,"ab")=="Ab")
996-- inspect(lpegmatch(p,"abc")=="ABC")
997-- inspect(lpegmatch(p,"abe")=="ABE")
998-- inspect(lpegmatch(p,"xa")=="xA")
999-- inspect(lpegmatch(p,"bx")=="bx")
1000-- inspect(lpegmatch(p,"bax")=="bAx")
1001-- inspect(lpegmatch(p,"abxyz")=="ABXYZ")
1002-- inspect(lpegmatch(p,"foobarbefcrap")=="foobArBEFcrAp")
1003
1004-- local t = { ["^"] = 1, ["^^"] = 2, ["^^^"] = 3, ["^^^^"] = 4 }
1005-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/t + 1)^1)
1006-- inspect(lpegmatch(p," ^ ^^ ^^^ ^^^^ ^^^^^ ^^^^^^ ^^^^^^^ "))
1007
1008-- local t = { ["^^"] = 2, ["^^^"] = 3, ["^^^^"] = 4 }
1009-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/t + 1)^1)
1010-- inspect(lpegmatch(p," ^ ^^ ^^^ ^^^^ ^^^^^ ^^^^^^ ^^^^^^^ "))
1011
1012-- lpeg.utfchartabletopattern {
1013--     utfchar(0x00A0), -- nbsp
1014--     utfchar(0x2000), -- enquad
1015--     utfchar(0x2001), -- emquad
1016--     utfchar(0x2002), -- enspace
1017--     utfchar(0x2003), -- emspace
1018--     utfchar(0x2004), -- threeperemspace
1019--     utfchar(0x2005), -- fourperemspace
1020--     utfchar(0x2006), -- sixperemspace
1021--     utfchar(0x2007), -- figurespace
1022--     utfchar(0x2008), -- punctuationspace
1023--     utfchar(0x2009), -- breakablethinspace
1024--     utfchar(0x200A), -- hairspace
1025--     utfchar(0x200B), -- zerowidthspace
1026--     utfchar(0x202F), -- narrownobreakspace
1027--     utfchar(0x205F), -- math thinspace
1028-- }
1029
1030-- a few handy ones:
1031--
1032-- faster than find(str,"[\n\r]") when match and # > 7 and always faster when # > 3
1033
-- registered under patterns: the result of lpeg.finder applied to eol
patterns["containseol"] = lpeg.finder(eol) -- (1-eol)^0 * eol
1035
-- The next pattern^n variant is based on an approach suggested
-- by Roberto: constructing a big repetition in chunks.
--
-- Being sparse is not needed; it only complicates matters and
-- the number of redundant entries is not that large.
1041
-- Fills the grammar table result for a repetition of n times the rule "1".
--
-- n      : the number of repetitions that still has to be covered
-- step   : the chunk size handled by this call, a power of two; every call
--          passes half of it on to the next one
-- result : the grammar under construction; result.start collects the product
--          of chunk rules and result["<size>"] defines a chunk as two chunks
--          of half that size
--
-- Returns result.
--
-- The step is kept integral and rule names are derived from that integral
-- value: on Lua 5.3 and later 2^16 and step/2 are floats, and tostring would
-- turn them into names like "2.0" and "1.0" that never match the predefined
-- "1" rule. The recursion ends at step 1, as smaller steps can't contribute.
local function nextstep(n,step,result)
    step = floor(step)
    local m = n % step      -- mod(n,step)
    local d = floor(n/step) -- div(n,step)
    if d > 0 then
        local v = V(tostring(step))
        local s = result.start
        for i=1,d do
            s = s and v * s or v
        end
        result.start = s
    end
    if step > 1 then
        local half = floor(step/2)
        if result.start then
            -- only chunks not larger than the biggest one in use get a rule
            local v = V(tostring(half))
            result[tostring(step)] = v * v
        end
        return nextstep(m,half,result)
    end
    return result
end
1067
-- Returns a pattern that matches pattern n times in a row. The repetition is
-- expressed as a grammar in which rule "1" is the pattern itself and the other
-- rules (added by nextstep) are chunks of a power of two times that rule.
function lpeg.times(pattern,n)
    if n == 0 then
        -- nextstep then defines no "start" rule, so the grammar would lack its
        -- initial rule; zero repetitions is simply the empty match
        return P(true)
    end
    return P(nextstep(n,2^16,{ "start", ["1"] = pattern }))
end
1071
1072-- local p = lpeg.Cs((1 - lpeg.times(lpeg.P("AB"),25))^1)
1073-- local s = "12" .. string.rep("AB",20) .. "34" .. string.rep("AB",30) .. "56"
1074-- inspect(p)
1075-- print(lpeg.match(p,s))
1076
1077-- moved here (before util-str)
1078
do

    -- Variant one handles each run of digits that is followed by a period:
    -- when only zeros follow the period, the period and those zeros are
    -- dropped; otherwise the zeros that end the fraction are dropped. All
    -- other input is copied.

    local zerotail = zero^0 * -digit -- suggested by Roberto
    local fraction = period * zerotail / ""
                   + period * (digit - zerotail)^1 * (zerotail / "")

    lpeg.patterns.stripzeros = Cs((digits * fraction + 1)^0) -- multiple in string

    -- Variant two only looks at what follows the first period and at zeros
    -- that run up to the end of the string.

    local nonzero    = digit - zero
    local zerosatend = zero^1 * endofstring
    local tail       = period * zerosatend/""
                     + period * (nonzero^1 + (zerosatend/"") + zero^1)^0
                     + endofstring

    lpeg.patterns.stripzero = Cs((1-period)^0 * tail) -- slightly more efficient but expects a float !

    -- local sample = "bla 11.00 bla 11 bla 0.1100 bla 1.00100 bla 0.00 bla 0.001 bla 1.1100 bla 0.100100100 bla 0.00100100100"
    -- collectgarbage("collect")
    -- str = string.rep(sample,10000)
    -- local ts = os.clock()
    -- lpegmatch(stripper,str)
    -- print(#str, os.clock()-ts, lpegmatch(stripper,sample))

end
1109
1110-- for practical reasons we keep this here:
1111
-- Lookup tables between a byte (as one character string) and its textual
-- forms: two uppercase hex digits, two lowercase hex digits and three decimal
-- digits. The reverse table accepts both the uppercase and lowercase form.
local byte_to_HEX = { }
local byte_to_hex = { }
local byte_to_dec = { } -- for md5
local hex_to_byte = { }

for code=0,255 do
    local chr = char(code)
    local HEX = format("%02X",code)
    local hex = format("%02x",code)
    byte_to_HEX[chr] = HEX
    byte_to_hex[chr] = hex
    byte_to_dec[chr] = format("%03i",code)
    hex_to_byte[hex] = chr
    hex_to_byte[HEX] = chr
end
1128
-- Converters for a single unit: the matched text (two characters for hex
-- input, one otherwise) is looked up in the corresponding table.

local hextobyte = P(2)/hex_to_byte
local bytetoHEX = P(1)/byte_to_HEX
local bytetohex = P(1)/byte_to_hex
local bytetodec = P(1)/byte_to_dec

patterns.hextobyte = hextobyte
patterns.bytetoHEX = bytetoHEX
patterns.bytetohex = bytetohex
patterns.bytetodec = bytetodec

-- Converters for a whole string: a substitution over a sequence of units.

local hextobytes = Cs(hextobyte^0)
local bytestoHEX = Cs(bytetoHEX^0)
local bytestohex = Cs(bytetohex^0)
local bytestodec = Cs(bytetodec^0)

patterns.hextobytes = hextobytes
patterns.bytestoHEX = bytestoHEX
patterns.bytestohex = bytestohex
patterns.bytestodec = bytestodec
1146
-- Returns the result of matching bytestoHEX (bytes to uppercase hex pairs)
-- against s; nil, false and the empty string are passed back untouched.
function string.toHEX(s)
    if s and s ~= "" then
        return lpegmatch(bytestoHEX,s)
    end
    return s
end
1154
-- Returns the result of matching bytestohex (bytes to lowercase hex pairs)
-- against s; nil, false and the empty string are passed back untouched.
function string.tohex(s)
    if s and s ~= "" then
        return lpegmatch(bytestohex,s)
    end
    return s
end
1162
-- Returns the result of matching bytestodec (bytes to three decimal digits)
-- against s; nil, false and the empty string are passed back untouched.
function string.todec(s)
    if s and s ~= "" then
        return lpegmatch(bytestodec,s)
    end
    return s
end
1170
-- Returns the result of matching hextobytes (hex pairs to bytes) against s;
-- nil, false and the empty string are passed back untouched.
function string.tobytes(s)
    if s and s ~= "" then
        return lpegmatch(hextobytes,s)
    end
    return s
end
1178
1179-- local h = "ADFE0345"
1180-- local b = lpegmatch(patterns.hextobytes,h)
1181-- print(h,b,string.tohex(b),string.toHEX(b))
1182
-- Cache of the patterns made by containsws, keyed by the word looked for.
-- NOTE(review): from here on this local hides the patterns table that is used
-- earlier in the file for registering shared patterns -- confirm that nothing
-- further down expects that other table.
local patterns = { } -- can be made weak

-- Returns (and caches) a pattern that delivers true when what is found as a
-- whitespace delimited word and false otherwise: what has to sit at the start
-- of the subject or after whitespace, and has to be followed by whitespace or
-- the end of the subject.
local function containsws(what)
    local cached = patterns[what]
    if cached then
        return cached
    end
    local atword  = P(what) * (whitespace + endofstring) * Cc(true)
    local afterws = whitespace * atword
    local pattern = atword + (1-afterws)^0 * afterws + Cc(false)
    patterns[what] = pattern
    return pattern
end

lpeg.containsws = containsws
1197
-- Matches the (cached) containsws pattern for what against str and returns
-- what that match delivers.
function string.containsws(str,what)
    local pattern = patterns[what]
    if not pattern then
        pattern = containsws(what)
    end
    return lpegmatch(pattern,str)
end
1201