l-lpeg.lua /size: 37 Kb    last modification: 2021-10-28 13:50
1
if
not
modules
then
modules
=
{
}
end
modules
[
'
l-lpeg
'
]
=
{
2
version
=
1
.
001
,
3
comment
=
"
companion to luat-lib.mkiv
"
,
4
author
=
"
Hans Hagen, PRAGMA-ADE, Hasselt NL
"
,
5
copyright
=
"
PRAGMA ADE / ConTeXt Development Team
"
,
6
license
=
"
see context related readme files
"
7
}
8 9
-- we can get too many captures (e.g. on largexml files) which makes me wonder
10
-- if P(foo)/"" can't be simplified to N(foo) i.e. some direct instruction to the
11
-- lpeg virtual machine to ignore it
12 13
-- lpeg 12 vs lpeg 10: slower compilation, similar parsing speed (i need to check
14
-- if i can use new features like capture / 2 and .B (at first sight the xml
15
-- parser is some 5% slower)
16 17
-- lpeg.P("abc") is faster than lpeg.P("a") * lpeg.P("b") * lpeg.P("c")
18 19
-- a new lpeg fails on a #(1-P(":")) test and really needs a + P(-1)
20 21
-- move utf -> l-unicode
22
-- move string -> l-string or keep it here
23 24
-- lpeg.B : backward without consumption
25
-- lpeg.F = getmetatable(lpeg.P(1)).__len : forward without consumption
26 27 28
lpeg
=
require
(
"
lpeg
"
)
-- does lpeg register itself global?
29 30
local
lpeg
=
lpeg
31 32
-- The latest lpeg doesn't have print any more, and even the new ones are not
33
-- available by default (only when debug mode is enabled), which is a pity
34
-- as it helps nailing down bottlenecks. Performance seems comparable: some 10%
35
-- slower pattern compilation, same parsing speed, although,
36
--
37
-- local p = lpeg.C(lpeg.P(1)^0 * lpeg.P(-1))
38
-- local a = string.rep("123",100)
39
-- lpeg.match(p,a)
40
--
41
-- seems slower and is also still suboptimal (i.e. a match that runs from begin
42
-- to end, one of the cases where string matchers win).
43 44
-- Provide lpeg.print when the loaded lpeg no longer ships it. The dump is
-- produced by lpeg.pcode when that helper is present; when it is missing as
-- well we report that instead of failing with "attempt to call a nil value".
if not lpeg.print then
    function lpeg.print(...)
        local pcode = lpeg.pcode
        if pcode then
            print(pcode(...))
        else
            print("lpeg.print: not supported by this lpeg build (no lpeg.pcode)")
        end
    end
end
45 46
-- tracing (only used when we encounter a problem in integration of lpeg in luatex)
47 48
-- some code will move to unicode and string
49 50
-- local lpmatch = lpeg.match
51
-- local lpprint = lpeg.print
52
-- local lpp = lpeg.P
53
-- local lpr = lpeg.R
54
-- local lps = lpeg.S
55
-- local lpc = lpeg.C
56
-- local lpb = lpeg.B
57
-- local lpv = lpeg.V
58
-- local lpcf = lpeg.Cf
59
-- local lpcb = lpeg.Cb
60
-- local lpcg = lpeg.Cg
61
-- local lpct = lpeg.Ct
62
-- local lpcs = lpeg.Cs
63
-- local lpcc = lpeg.Cc
64
-- local lpcmt = lpeg.Cmt
65
-- local lpcarg = lpeg.Carg
66 67
-- function lpeg.match(l,...) print("LPEG MATCH") lpprint(l) return lpmatch(l,...) end
68 69
-- function lpeg.P (l) local p = lpp (l) print("LPEG P =") lpprint(l) return p end
70
-- function lpeg.R (l) local p = lpr (l) print("LPEG R =") lpprint(l) return p end
71
-- function lpeg.S (l) local p = lps (l) print("LPEG S =") lpprint(l) return p end
72
-- function lpeg.C (l) local p = lpc (l) print("LPEG C =") lpprint(l) return p end
73
-- function lpeg.B (l) local p = lpb (l) print("LPEG B =") lpprint(l) return p end
74
-- function lpeg.V (l) local p = lpv (l) print("LPEG V =") lpprint(l) return p end
75
-- function lpeg.Cf (l) local p = lpcf (l) print("LPEG Cf =") lpprint(l) return p end
76
-- function lpeg.Cb (l) local p = lpcb (l) print("LPEG Cb =") lpprint(l) return p end
77
-- function lpeg.Cg (l) local p = lpcg (l) print("LPEG Cg =") lpprint(l) return p end
78
-- function lpeg.Ct (l) local p = lpct (l) print("LPEG Ct =") lpprint(l) return p end
79
-- function lpeg.Cs (l) local p = lpcs (l) print("LPEG Cs =") lpprint(l) return p end
80
-- function lpeg.Cc (l) local p = lpcc (l) print("LPEG Cc =") lpprint(l) return p end
81
-- function lpeg.Cmt (l) local p = lpcmt (l) print("LPEG Cmt =") lpprint(l) return p end
82
-- function lpeg.Carg (l) local p = lpcarg(l) print("LPEG Carg =") lpprint(l) return p end
83 84
local
type
,
next
,
tostring
=
type
,
next
,
tostring
85
local
byte
,
char
,
gmatch
,
format
=
string
.
byte
,
string
.
char
,
string
.
gmatch
,
string
.
format
86
----- mod, div = math.mod, math.div
87
local
floor
=
math
.
floor
88 89
local
P
,
R
,
S
,
V
,
Ct
,
C
,
Cs
,
Cc
,
Cp
,
Cmt
=
lpeg
.
P
,
lpeg
.
R
,
lpeg
.
S
,
lpeg
.
V
,
lpeg
.
Ct
,
lpeg
.
C
,
lpeg
.
Cs
,
lpeg
.
Cc
,
lpeg
.
Cp
,
lpeg
.
Cmt
90
local
lpegtype
,
lpegmatch
,
lpegprint
=
lpeg
.
type
,
lpeg
.
match
,
lpeg
.
print
91 92
-- let's start with an inspector:
93 94
-- Hook lpeg values into the inspector, but only when the host environment
-- defines setinspector.
if setinspector then
    local function inspectlpeg(value)
        -- only handle values that lpeg.type recognizes; anything else is left
        -- to other inspectors (nothing is returned)
        if not lpegtype(value) then
            return
        end
        lpegprint(value)
        return true
    end
    setinspector("lpeg", inspectlpeg)
end
97 98
-- Beware, we predefine a bunch of patterns here and one reason for doing so
99
-- is that we get consistent behaviour in some of the visualizers.
100 101
lpeg
.
patterns
=
lpeg
.
patterns
or
{
}
-- so that we can share
102
local
patterns
=
lpeg
.
patterns
103 104
local
anything
=
P
(
1
)
105
local
endofstring
=
P
(
-1
)
106
local
alwaysmatched
=
P
(
true
)
107 108
patterns
.
anything
=
anything
109
patterns
.
endofstring
=
endofstring
110
patterns
.
beginofstring
=
alwaysmatched
111
patterns
.
alwaysmatched
=
alwaysmatched
112 113
local
sign
=
S
(
'
+-
'
)
114
local
zero
=
P
(
'
0
'
)
115
local
digit
=
R
(
'
09
'
)
116
local
digits
=
digit
^
1
117
local
octdigit
=
R
(
"
07
"
)
118
local
octdigits
=
octdigit
^
1
119
local
lowercase
=
R
(
"
az
"
)
120
local
uppercase
=
R
(
"
AZ
"
)
121
local
underscore
=
P
(
"
_
"
)
122
local
hexdigit
=
digit
+
lowercase
+
uppercase
123
local
hexdigits
=
hexdigit
^
1
124
local
cr
,
lf
,
crlf
=
P
(
"
\r
"
)
,
P
(
"
\n
"
)
,
P
(
"
\r\n
"
)
125
----- newline = crlf + S("\r\n") -- cr + lf
126
local
newline
=
P
(
"
\r
"
)
*
(
P
(
"
\n
"
)
+
P
(
true
)
)
+
P
(
"
\n
"
)
-- P("\r")^-1 * P("\n")^-1
127
local
escaped
=
P
(
"
\\
"
)
*
anything
128
local
squote
=
P
(
"
'
"
)
129
local
dquote
=
P
(
'
"
'
)
130
local
space
=
P
(
"
"
)
131
local
period
=
P
(
"
.
"
)
132
local
comma
=
P
(
"
,
"
)
133 134
local
utfbom_32_be
=
P
(
'
\000\000\254\255
'
)
-- 00 00 FE FF
135
local
utfbom_32_le
=
P
(
'
\255\254\000\000
'
)
-- FF FE 00 00
136
local
utfbom_16_be
=
P
(
'
\254\255
'
)
-- FE FF
137
local
utfbom_16_le
=
P
(
'
\255\254
'
)
-- FF FE
138
local
utfbom_8
=
P
(
'
\239\187\191
'
)
-- EF BB BF
139
local
utfbom
=
utfbom_32_be
+
utfbom_32_le
140
+
utfbom_16_be
+
utfbom_16_le
141
+
utfbom_8
142
local
utftype
=
utfbom_32_be
*
Cc
(
"
utf-32-be
"
)
+
utfbom_32_le
*
Cc
(
"
utf-32-le
"
)
143
+
utfbom_16_be
*
Cc
(
"
utf-16-be
"
)
+
utfbom_16_le
*
Cc
(
"
utf-16-le
"
)
144
+
utfbom_8
*
Cc
(
"
utf-8
"
)
+
alwaysmatched
*
Cc
(
"
utf-8
"
)
-- assume utf8
145
local
utfstricttype
=
utfbom_32_be
*
Cc
(
"
utf-32-be
"
)
+
utfbom_32_le
*
Cc
(
"
utf-32-le
"
)
146
+
utfbom_16_be
*
Cc
(
"
utf-16-be
"
)
+
utfbom_16_le
*
Cc
(
"
utf-16-le
"
)
147
+
utfbom_8
*
Cc
(
"
utf-8
"
)
148
local
utfoffset
=
utfbom_32_be
*
Cc
(
4
)
+
utfbom_32_le
*
Cc
(
4
)
149
+
utfbom_16_be
*
Cc
(
2
)
+
utfbom_16_le
*
Cc
(
2
)
150
+
utfbom_8
*
Cc
(
3
)
+
Cc
(
0
)
151 152
local
utf8next
=
R
(
"
\128\191
"
)
153 154
patterns
.
utfbom_32_be
=
utfbom_32_be
155
patterns
.
utfbom_32_le
=
utfbom_32_le
156
patterns
.
utfbom_16_be
=
utfbom_16_be
157
patterns
.
utfbom_16_le
=
utfbom_16_le
158
patterns
.
utfbom_8
=
utfbom_8
159 160
patterns
.
utf_16_be_nl
=
P
(
"
\000\r\000\n
"
)
+
P
(
"
\000\r
"
)
+
P
(
"
\000\n
"
)
-- P("\000\r") * (P("\000\n") + P(true)) + P("\000\n")
161
patterns
.
utf_16_le_nl
=
P
(
"
\r\000\n\000
"
)
+
P
(
"
\r\000
"
)
+
P
(
"
\n\000
"
)
-- P("\r\000") * (P("\n\000") + P(true)) + P("\n\000")
162 163
patterns
.
utf_32_be_nl
=
P
(
"
\000\000\000\r\000\000\000\n
"
)
+
P
(
"
\000\000\000\r
"
)
+
P
(
"
\000\000\000\n
"
)
164
patterns
.
utf_32_le_nl
=
P
(
"
\r\000\000\000\n\000\000\000
"
)
+
P
(
"
\r\000\000\000
"
)
+
P
(
"
\n\000\000\000
"
)
165 166
patterns
.
utf8one
=
R
(
"
\000\127
"
)
167
patterns
.
utf8two
=
R
(
"
\194\223
"
)
*
utf8next
168
patterns
.
utf8three
=
R
(
"
\224\239
"
)
*
utf8next
*
utf8next
169
patterns
.
utf8four
=
R
(
"
\240\244
"
)
*
utf8next
*
utf8next
*
utf8next
170
patterns
.
utfbom
=
utfbom
171
patterns
.
utftype
=
utftype
172
patterns
.
utfstricttype
=
utfstricttype
173
patterns
.
utfoffset
=
utfoffset
174 175
local
utf8char
=
patterns
.
utf8one
+
patterns
.
utf8two
+
patterns
.
utf8three
+
patterns
.
utf8four
176
local
validutf8char
=
utf8char
^
0
*
endofstring
*
Cc
(
true
)
+
Cc
(
false
)
177 178
local
utf8character
=
P
(
1
)
*
R
(
"
\128\191
"
)
^
0
-- unchecked but fast
179 180
patterns
.
utf8
=
utf8char
181
patterns
.
utf8char
=
utf8char
182
patterns
.
utf8character
=
utf8character
-- this one can be used in most cases so we might use that one
183
patterns
.
validutf8
=
validutf8char
184
patterns
.
validutf8char
=
validutf8char
185 186
local
eol
=
S
(
"
\n\r
"
)
187
local
spacer
=
S
(
"
\t\f\v
"
)
-- + char(0xc2, 0xa0) if we want utf (cf mail roberto)
188
local
whitespace
=
eol
+
spacer
189
local
nonspacer
=
1
-
spacer
190
local
nonwhitespace
=
1
-
whitespace
191 192
patterns
.
eol
=
eol
193
patterns
.
spacer
=
spacer
194
patterns
.
whitespace
=
whitespace
195
patterns
.
nonspacer
=
nonspacer
196
patterns
.
nonwhitespace
=
nonwhitespace
197 198
local
stripper
=
spacer
^
0
*
C
(
(
spacer
^
0
*
nonspacer
^
1
)
^
0
)
-- from example by roberto
199
local
fullstripper
=
whitespace
^
0
*
C
(
(
whitespace
^
0
*
nonwhitespace
^
1
)
^
0
)
200 201
----- collapser = Cs(spacer^0/"" * ((spacer^1 * endofstring / "") + (spacer^1/" ") + P(1))^0)
202
local
collapser
=
Cs
(
spacer
^
0
/
"
"
*
nonspacer
^
0
*
(
(
spacer
^
0
/
"
"
*
nonspacer
^
1
)
^
0
)
)
203
local
nospacer
=
Cs
(
(
whitespace
^
1
/
"
"
+
nonwhitespace
^
1
)
^
0
)
204 205
local
b_collapser
=
Cs
(
whitespace
^
0
/
"
"
*
(
nonwhitespace
^
1
+
whitespace
^
1
/
"
"
)
^
0
)
206
local
m_collapser
=
Cs
(
(
nonwhitespace
^
1
+
whitespace
^
1
/
"
"
)
^
0
)
207
local
e_collapser
=
Cs
(
(
whitespace
^
1
*
endofstring
/
"
"
+
nonwhitespace
^
1
+
whitespace
^
1
/
"
"
)
^
0
)
208
local
x_collapser
=
Cs
(
(
nonwhitespace
^
1
+
whitespace
^
1
/
"
"
)
^
0
)
209 210
local
b_stripper
=
Cs
(
spacer
^
0
/
"
"
*
(
nonspacer
^
1
+
spacer
^
1
/
"
"
)
^
0
)
211
local
m_stripper
=
Cs
(
(
nonspacer
^
1
+
spacer
^
1
/
"
"
)
^
0
)
212
local
e_stripper
=
Cs
(
(
spacer
^
1
*
endofstring
/
"
"
+
nonspacer
^
1
+
spacer
^
1
/
"
"
)
^
0
)
213
local
x_stripper
=
Cs
(
(
nonspacer
^
1
+
spacer
^
1
/
"
"
)
^
0
)
214 215
patterns
.
stripper
=
stripper
216
patterns
.
fullstripper
=
fullstripper
217
patterns
.
collapser
=
collapser
218
patterns
.
nospacer
=
nospacer
219 220
patterns
.
b_collapser
=
b_collapser
221
patterns
.
m_collapser
=
m_collapser
222
patterns
.
e_collapser
=
e_collapser
223
patterns
.
x_collapser
=
x_collapser
224 225
patterns
.
b_stripper
=
b_stripper
226
patterns
.
m_stripper
=
m_stripper
227
patterns
.
e_stripper
=
e_stripper
228
patterns
.
x_stripper
=
x_stripper
229 230
patterns
.
lowercase
=
lowercase
231
patterns
.
uppercase
=
uppercase
232
patterns
.
letter
=
patterns
.
lowercase
+
patterns
.
uppercase
233
patterns
.
space
=
space
234
patterns
.
tab
=
P
(
"
\t
"
)
235
patterns
.
spaceortab
=
patterns
.
space
+
patterns
.
tab
236
patterns
.
newline
=
newline
237
patterns
.
emptyline
=
newline
^
1
238
patterns
.
equal
=
P
(
"
=
"
)
239
patterns
.
comma
=
comma
240
patterns
.
commaspacer
=
comma
*
spacer
^
0
241
patterns
.
period
=
period
242
patterns
.
colon
=
P
(
"
:
"
)
243
patterns
.
semicolon
=
P
(
"
;
"
)
244
patterns
.
underscore
=
underscore
245
patterns
.
escaped
=
escaped
246
patterns
.
squote
=
squote
247
patterns
.
dquote
=
dquote
248
patterns
.
nosquote
=
(
escaped
+
(
1
-
squote
)
)
^
0
249
patterns
.
nodquote
=
(
escaped
+
(
1
-
dquote
)
)
^
0
250
patterns
.
unsingle
=
(
squote
/
"
"
)
*
patterns
.
nosquote
*
(
squote
/
"
"
)
-- will change to C in the middle
251
patterns
.
undouble
=
(
dquote
/
"
"
)
*
patterns
.
nodquote
*
(
dquote
/
"
"
)
-- will change to C in the middle
252
patterns
.
unquoted
=
patterns
.
undouble
+
patterns
.
unsingle
-- more often undouble
253
patterns
.
unspacer
=
(
(
patterns
.
spacer
^
1
)
/
"
"
)
^
0
254 255
patterns
.
singlequoted
=
squote
*
patterns
.
nosquote
*
squote
256
patterns
.
doublequoted
=
dquote
*
patterns
.
nodquote
*
dquote
257
patterns
.
quoted
=
patterns
.
doublequoted
+
patterns
.
singlequoted
258 259
patterns
.
digit
=
digit
260
patterns
.
digits
=
digits
261
patterns
.
octdigit
=
octdigit
262
patterns
.
octdigits
=
octdigits
263
patterns
.
hexdigit
=
hexdigit
264
patterns
.
hexdigits
=
hexdigits
265
patterns
.
sign
=
sign
266
patterns
.
cardinal
=
digits
267
patterns
.
integer
=
sign
^
-1
*
digits
268
patterns
.
unsigned
=
digit
^
0
*
period
*
digits
269
patterns
.
float
=
sign
^
-1
*
patterns
.
unsigned
270
patterns
.
cunsigned
=
digit
^
0
*
comma
*
digits
271
patterns
.
cpunsigned
=
digit
^
0
*
(
period
+
comma
)
*
digits
272
patterns
.
cfloat
=
sign
^
-1
*
patterns
.
cunsigned
273
patterns
.
cpfloat
=
sign
^
-1
*
patterns
.
cpunsigned
274
patterns
.
number
=
patterns
.
float
+
patterns
.
integer
275
patterns
.
cnumber
=
patterns
.
cfloat
+
patterns
.
integer
276
patterns
.
cpnumber
=
patterns
.
cpfloat
+
patterns
.
integer
277
patterns
.
oct
=
zero
*
octdigits
-- hm is this ok
278
patterns
.
octal
=
patterns
.
oct
279
patterns
.
HEX
=
zero
*
P
(
"
X
"
)
*
(
digit
+
uppercase
)
^
1
280
patterns
.
hex
=
zero
*
P
(
"
x
"
)
*
(
digit
+
lowercase
)
^
1
281
patterns
.
hexadecimal
=
zero
*
S
(
"
xX
"
)
*
hexdigits
282 283
patterns
.
hexafloat
=
sign
^
-1
284
*
zero
*
S
(
"
xX
"
)
285
*
(
hexdigit
^
0
*
period
*
hexdigits
+
hexdigits
*
period
*
hexdigit
^
0
+
hexdigits
)
286
*
(
S
(
"
pP
"
)
*
sign
^
-1
*
hexdigits
)
^
-1
287
patterns
.
decafloat
=
sign
^
-1
288
*
(
digit
^
0
*
period
*
digits
+
digits
*
period
*
digit
^
0
+
digits
)
289
*
S
(
"
eE
"
)
*
sign
^
-1
*
digits
290 291
patterns
.
propername
=
(
uppercase
+
lowercase
+
underscore
)
*
(
uppercase
+
lowercase
+
underscore
+
digit
)
^
0
*
endofstring
292 293
patterns
.
somecontent
=
(
anything
-
newline
-
space
)
^
1
-- (utf8char - newline - space)^1
294
patterns
.
beginline
=
#
(
1
-
newline
)
295 296
patterns
.
longtostring
=
Cs
(
whitespace
^
0
/
"
"
*
(
(
patterns
.
quoted
+
nonwhitespace
^
1
+
whitespace
^
1
/
"
"
*
(
endofstring
+
Cc
(
"
"
)
)
)
^
0
)
)
297 298
-- local function anywhere(pattern) -- slightly adapted from website
299
-- return P { P(pattern) + 1 * V(1) }
300
-- end
301 302
-- Build a pattern that skips everything up to the first occurrence of
-- `pattern` and then matches that occurrence (non-recursive, hence faster
-- than the grammar based variant).
local function anywhere(pattern) -- faster
    local target = P(pattern)
    return (1 - target)^0 * target
end
305 306
lpeg
.
anywhere
=
anywhere
307 308
-- Return a function that tells (as a real boolean) whether `p` occurs
-- somewhere in the string it is given.
function lpeg.instringchecker(p)
    local finder = anywhere(p)
    return function(str)
        if lpegmatch(finder,str) then
            return true
        end
        return false
    end
end
314 315
-- function lpeg.splitter(pattern, action)
316
-- return (((1-P(pattern))^1)/action+1)^0
317
-- end
318 319
-- function lpeg.tsplitter(pattern, action)
320
-- return Ct((((1-P(pattern))^1)/action+1)^0)
321
-- end
322 323
-- Return a pattern that walks over a string and handles every run of text
-- that does not contain `pattern`: the run is passed to `action` when given,
-- otherwise it is captured as a string. Text matching the separator is
-- consumed one character at a time without producing a capture.
function lpeg.splitter(pattern, action)
    local chunk = (1 - P(pattern))^1
    if action then
        chunk = chunk / action
    else
        chunk = Cs(chunk)
    end
    return (chunk + 1)^0
end
330 331
-- Table variant of the splitter: every run of text that does not contain
-- `pattern` is passed to `action` (when given) or captured as a string, and
-- all resulting captures are collected in one table.
function lpeg.tsplitter(pattern, action)
    local chunk = (1 - P(pattern))^1
    if action then
        chunk = chunk / action
    else
        chunk = Cs(chunk)
    end
    return Ct((chunk + 1)^0)
end
338 339
-- problem: separator can be lpeg and that does not hash too well, but
340
-- it's quite okay as the key is then not garbage collected
341 342
local
splitters_s
,
splitters_m
,
splitters_t
=
{
}
,
{
}
,
{
}
343 344
-- Return a (cached) pattern that splits a string at `separator`, which can be
-- a string or an lpeg pattern.
--
--   single == true : two captures at most, the text before the first
--                    separator and everything after it
--   otherwise      : one capture per field
--
-- The cache that is consulted depends on `single`, so a splitter made for the
-- other mode is never handed out by accident. The result is stored under the
-- separator as passed in (not under the pattern made from it), so a later
-- call with the same string finds it again instead of adding one more entry.
local function splitat(separator,single)
    local cache    = single and splitters_s or splitters_m
    local splitter = cache[separator]
    if not splitter then
        local sep   = P(separator)
        local other = C((1 - sep)^0)
        if single then
            splitter = other * (sep * C(anything^0) + "") -- ?
        else
            splitter = other * (sep * other)^0
        end
        cache[separator] = splitter
    end
    return splitter
end
360 361
-- Return a (cached) pattern that splits at `separator` and collects the
-- fields in a table.
local function tsplitat(separator)
    local known = splitters_t[separator]
    if known then
        return known
    end
    local fresh = Ct(splitat(separator))
    splitters_t[separator] = fresh
    return fresh
end
369 370
lpeg
.
splitat
=
splitat
371
lpeg
.
tsplitat
=
tsplitat
372 373
-- Split `str` at `separator` (a comma when none is given) and return the
-- fields as separate values.
function string.splitup(str,separator)
    local sep      = separator or ","
    local splitter = splitters_m[sep]
    if not splitter then
        splitter = splitat(sep)
    end
    -- tail call on purpose: every field is a return value
    return lpegmatch(splitter,str)
end
379 380
-- local p = splitat("->",false) print(lpegmatch(p,"oeps->what->more")) -- oeps what more
381
-- local p = splitat("->",true) print(lpegmatch(p,"oeps->what->more")) -- oeps what->more
382
-- local p = splitat("->",false) print(lpegmatch(p,"oeps")) -- oeps
383
-- local p = splitat("->",true) print(lpegmatch(p,"oeps")) -- oeps
384 385
local
cache
=
{
}
386 387
-- Split `str` at `separator` and return the fields in a table; the splitter
-- is remembered per separator.
function lpeg.split(separator,str)
    local splitter = cache[separator]
    if splitter == nil then
        splitter = tsplitat(separator)
        cache[separator] = splitter
    end
    return lpegmatch(splitter,str)
end
395 396
-- Split `str` at `separator` and return the fields in a table. Without a
-- separator the string itself is returned as the only entry.
function string.split(str,separator)
    if not separator then
        return { str }
    end
    local splitter = cache[separator]
    if not splitter then
        splitter = tsplitat(separator)
        cache[separator] = splitter
    end
    return lpegmatch(splitter,str)
end
408 409
local
spacing
=
patterns
.
spacer
^
0
*
newline
-- sort of strip
410
local
empty
=
spacing
*
Cc
(
"
"
)
411
local
nonempty
=
Cs
(
(
1
-
spacing
)
^
1
)
*
spacing
^
-1
412
local
content
=
(
empty
+
nonempty
)
^
1
413 414
patterns
.
textline
=
content
415 416
local
linesplitter
=
tsplitat
(
newline
)
417 418
patterns
.
linesplitter
=
linesplitter
419 420
-- Split `str` into lines (the line splitter collects them in a table).
function string.splitlines(str)
    local lines = lpegmatch(linesplitter,str)
    return lines
end
423 424
-- lpeg.splitters = cache -- no longer public
425 426
local cache = { }

-- Return the (cached) pattern used by both checkedsplit variants. It skips
-- leading separators, treats a run of separators as one, and collects the
-- non-empty fields in a table; the match fails (nil) when there is no field
-- at all. The pattern is stored under the separator as passed in, so that a
-- string separator is found again on the next call instead of creating a new
-- cache entry each time.
local function checkedsplitter(separator)
    local splitter = cache[separator]
    if not splitter then
        local sep   = P(separator)
        local other = C((1 - sep)^1)
        splitter = Ct(sep^0 * other * (sep^1 * other)^0)
        cache[separator] = splitter
    end
    return splitter
end

-- Split `str` at `separator` and drop empty fields.
function lpeg.checkedsplit(separator,str)
    return lpegmatch(checkedsplitter(separator),str)
end

-- Same as lpeg.checkedsplit but with the string as first argument.
function string.checkedsplit(str,separator)
    return lpegmatch(checkedsplitter(separator),str)
end
449 450
-- from roberto's site:
451 452
-- Turn a two byte utf-8 sequence into its code point: strip the 0xC0 marker
-- from the first byte and the 0x80 marker from the second one.
local function f2(s)
    local b1, b2 = byte(s,1,2)
    return (b1 - 0xC0) * 64 + (b2 - 0x80)
end
453
-- Turn a three byte utf-8 sequence into its code point: strip the 0xE0
-- marker from the first byte and the 0x80 marker from the other two.
local function f3(s)
    local b1, b2, b3 = byte(s,1,3)
    return ((b1 - 0xE0) * 64 + (b2 - 0x80)) * 64 + (b3 - 0x80)
end
454
-- Turn a four byte utf-8 sequence into its code point: strip the 0xF0
-- marker from the first byte and the 0x80 marker from the other three.
local function f4(s)
    local b1, b2, b3, b4 = byte(s,1,4)
    return (((b1 - 0xF0) * 64 + (b2 - 0x80)) * 64 + (b3 - 0x80)) * 64 + (b4 - 0x80)
end
455 456
local
utf8byte
=
patterns
.
utf8one
/
byte
+
patterns
.
utf8two
/
f2
+
patterns
.
utf8three
/
f3
+
patterns
.
utf8four
/
f4
457 458
patterns
.
utf8byte
=
utf8byte
459 460
--~ local str = " a b c d "
461 462
--~ local s = lpeg.stripper(lpeg.R("az")) print("["..lpegmatch(s,str).."]")
463
--~ local s = lpeg.keeper(lpeg.R("az")) print("["..lpegmatch(s,str).."]")
464
--~ local s = lpeg.stripper("ab") print("["..lpegmatch(s,str).."]")
465
--~ local s = lpeg.keeper("ab") print("["..lpegmatch(s,str).."]")
466 467
local
cache
=
{
}
468 469
-- Return a substitution pattern that removes characters from a string. A
-- string argument is taken as a set of characters (the pattern is cached per
-- string); any other argument is used as a pattern as-is.
function lpeg.stripper(str)
    if type(str) ~= "string" then
        return Cs(((str^1)/"" + 1)^0)
    end
    local known = cache[str]
    if known then
        return known
    end
    local fresh = Cs(((S(str)^1)/"" + 1)^0)
    cache[str] = fresh
    return fresh
end
481 482
local
cache
=
{
}
483 484
-- Counterpart of lpeg.stripper: return a substitution pattern that removes
-- everything except what is given. A string argument is taken as a set of
-- characters (the pattern is cached per string); any other argument is used
-- as a pattern as-is.
function lpeg.keeper(str)
    if type(str) ~= "string" then
        return Cs((((1-str)^1)/"" + 1)^0)
    end
    local known = cache[str]
    if known then
        return known
    end
    local fresh = Cs((((1-S(str))^1)/"" + 1)^0)
    cache[str] = fresh
    return fresh
end
496 497
-- Return a pattern that skips `str` when the subject starts with it and
-- captures whatever follows.
function lpeg.frontstripper(str) -- or pattern (yet undocumented)
    local prefix = P(str) + P(true) -- the prefix is optional
    return prefix * Cs(anything^0)
end
500 501
-- Return a pattern that captures the subject up to, but not including, an
-- occurrence of `str` that sits at the very end.
function lpeg.endstripper(str) -- or pattern (yet undocumented)
    local suffix = P(str) * endofstring
    return Cs((1 - suffix)^0)
end
504 505
-- Just for fun I looked at the used bytecode and
506
-- p = (p and p + pp) or pp gets one more (testset).
507 508
-- todo: cache when string
509 510
-- Build a replacer.
--
--   one          : what to replace: a string or pattern, a hash that maps
--                  keys to replacements, or a list of { what, replacement }
--                  pairs
--   two          : the replacement when `one` is not a table (defaults to
--                  the empty string)
--   makefunction : when set, a function that applies the pattern is returned
--                  instead of the pattern itself
--   isutf        : when set, text that is not replaced is skipped per utf-8
--                  character instead of per byte
--
-- in principle we should sort the keys but we have a better one anyway
function lpeg.replacer(one,two,makefunction,isutf)
    local step = isutf and utf8char or 1
    local choice
    if type(one) ~= "table" then
        choice = P(one) / (two or "")
    else
        local count = #one
        if count == 1 then
            local pair = one[1]
            choice = P(pair[1]) / pair[2]
        else
            choice = P(false)
            if count == 0 then
                -- hash: key -> replacement
                for key, value in next, one do
                    choice = choice + P(key) / value
                end
            else
                -- list of pairs, tried in the given order
                for index=1,count do
                    local pair = one[index]
                    choice = choice + P(pair[1]) / pair[2]
                end
            end
        end
    end
    local pattern = Cs((choice + step)^0)
    if not makefunction then
        return pattern
    end
    return function(str)
        return lpegmatch(pattern,str)
    end
end
544 545
-- local pattern1 = P(1-P(pattern))^0 * P(pattern) : test for not nil
546
-- local pattern2 = (P(pattern) * Cc(true) + P(1))^0 : test for true (could be faster, but not much)
547 548
-- Build a finder for the first occurrence of one of several alternatives.
--
--   lst          : a string or pattern, a list of them, or a hash of which
--                  only the keys are used (so a replacer table can be reused)
--   makefunction : when set, a function that applies the pattern is returned
--                  instead of the pattern itself
--   isutf        : when set, text is skipped per utf-8 character instead of
--                  per byte
--
-- beware: slower than find with 'patternless finds'
function lpeg.finder(lst,makefunction,isutf)
    local target
    if type(lst) == "table" then
        target = P(false)
        local count = #lst
        if count == 0 then
            for key in next, lst do
                target = target + P(key)
            end
        else
            for index=1,count do
                target = target + P(lst[index])
            end
        end
    else
        target = P(lst)
    end
    local skip    = isutf and (utf8char or 1) or 1
    local pattern = (skip - target)^0 * target
    if not makefunction then
        return pattern
    end
    return function(str)
        return lpegmatch(pattern,str)
    end
end
577 578
-- print(lpeg.match(lpeg.replacer("e","a"),"test test"))
579
-- print(lpeg.match(lpeg.replacer{{"e","a"}},"test test"))
580
-- print(lpeg.match(lpeg.replacer({ e = "a", t = "x" }),"test test"))
581 582
local
splitters_f
,
splitters_s
=
{
}
,
{
}
583 584
-- Return a (cached) pattern that captures the text before the first
-- `separator`, or the whole string when there is none.
function lpeg.firstofsplit(separator) -- always return value
    local known = splitters_f[separator]
    if known then
        return known
    end
    local fresh = C((1 - P(separator))^0)
    splitters_f[separator] = fresh
    return fresh
end
593 594
-- Return a (cached) pattern that captures the text after the first
-- `separator`; the match fails when the separator is absent.
function lpeg.secondofsplit(separator) -- nil if not split
    local known = splitters_s[separator]
    if known then
        return known
    end
    local sep   = P(separator)
    local fresh = (1 - sep)^0 * sep * C(anything^0)
    splitters_s[separator] = fresh
    return fresh
end
603 604
local
splitters_s
,
splitters_p
=
{
}
,
{
}
605 606
-- Return a (cached) pattern that captures the text in front of `separator`,
-- provided that the first occurrence of the separator ends the string.
function lpeg.beforesuffix(separator) -- nil if nothing but empty is ok
    local known = splitters_s[separator]
    if known then
        return known
    end
    local sep   = P(separator)
    local fresh = C((1 - sep)^0) * sep * endofstring
    splitters_s[separator] = fresh
    return fresh
end
615 616
-- Return a (cached) pattern that captures what follows `separator` when the
-- string starts with it.
function lpeg.afterprefix(separator) -- nil if nothing but empty is ok
    local known = splitters_p[separator]
    if known then
        return known
    end
    local fresh = P(separator) * C(anything^0)
    splitters_p[separator] = fresh
    return fresh
end
625 626
-- Return a grammar that matches a `left` ... `right` pair including properly
-- nested pairs in between.
function lpeg.balancer(left,right)
    local open, close = P(left), P(right)
    -- either a character that is no delimiter or a nested balanced group
    local inner = (1 - open - close) + V(1)
    return P {
        open * inner^0 * close
    }
end
630 631
-- print(1,lpegmatch(lpeg.firstofsplit(":"),"bc:de"))
632
-- print(2,lpegmatch(lpeg.firstofsplit(":"),":de")) -- empty
633
-- print(3,lpegmatch(lpeg.firstofsplit(":"),"bc"))
634
-- print(4,lpegmatch(lpeg.secondofsplit(":"),"bc:de"))
635
-- print(5,lpegmatch(lpeg.secondofsplit(":"),"bc:")) -- empty
636
-- print(6,lpegmatch(lpeg.secondofsplit(":",""),"bc"))
637
-- print(7,lpegmatch(lpeg.secondofsplit(":"),"bc"))
638
-- print(9,lpegmatch(lpeg.secondofsplit(":","123"),"bc"))
639 640
-- this was slower but lpeg has been sped up in the meantime, so we no longer
641
-- use this (still seems somewhat faster on long strings)
642
--
643
-- local nany = utf8char/""
644
--
645
-- function lpeg.counter(pattern)
646
-- pattern = Cs((P(pattern)/" " + nany)^0)
647
-- return function(str)
648
-- return #lpegmatch(pattern,str)
649
-- end
650
-- end
651 652
-- Return a function that counts the occurrences of `pattern` in a string.
-- With `action` the count is handed to that function (and nothing is
-- returned), otherwise the count is the return value.
function lpeg.counter(pattern,action)
    local count = 0
    local function bump()
        count = count + 1
    end
    local scanner = (P(pattern) / bump + anything)^0
    if action then
        return function(str)
            count = 0
            lpegmatch(scanner,str)
            action(count)
        end
    end
    return function(str)
        count = 0
        lpegmatch(scanner,str)
        return count
    end
end
663 664
-- lpeg.print(lpeg.R("ab","cd","gh"))
665
-- lpeg.print(lpeg.P("a","b","c"))
666
-- lpeg.print(lpeg.S("a","b","c"))
667 668
-- print(lpeg.counter(lpeg.P("á") + lpeg.P("à"))("äáàa"))
669
-- print(lpeg.counter(lpeg.UP("áà"))("äáàa"))
670
-- print(lpeg.counter(lpeg.US("àá"))("äáàa"))
671
-- print(lpeg.counter(lpeg.UR("aá"))("äáàa"))
672
-- print(lpeg.counter(lpeg.UR("àá"))("äáàa"))
673
-- print(lpeg.counter(lpeg.UR(0x0000,0xFFFF)))
674 675
-- Tell whether `p` is an lpeg pattern. A false or nil argument is returned
-- unchanged.
function lpeg.is_lpeg(p)
    if not p then
        return p
    end
    return lpegtype(p) == "pattern"
end
678 679
-- Return an ordered choice of the given alternatives, passed either as one
-- table or as separate arguments. The order is kept as given, so the caller
-- has to put longer alternatives first:
--
-- lpeg.oneof("elseif","else","if","then") -- assume proper order
function lpeg.oneof(list,...)
    local words = list
    if type(words) ~= "table" then
        words = { list, ... }
    end
    -- table.sort(words) -- longest match first
    local choice = P(words[1])
    for index=2,#words do
        choice = choice + P(words[index])
    end
    return choice
end
690 691
-- For the moment here, but it might move to utilities. Beware, we need to
692
-- have the longest keyword first, so 'aaa' comes before 'aa' which is why we
693
-- loop back from the end cq. prepend.
694 695
local
sort
=
table
.
sort
696 697
-- Return a shallow copy of the indexed part of `old`, so that the copy can
-- be sorted without touching the original.
local function copyindexed(old)
    local new = { }
    for i=1,#old do
        new[i] = old[i] -- copy the entry, not the table itself
    end
    return new
end
704 705
-- Return all keys of `tab` as a sorted list.
local function sortedkeys(tab)
    local keys = { }
    for key in next, tab do
        keys[#keys+1] = key
    end
    sort(keys)
    return keys
end
714 715
-- Extend the pattern `pp` (can be nil) with alternatives taken from `list`
-- and return the result (nil when there is nothing to combine).
--
--   indexed list : every entry becomes an alternative
--   delayed      : hash, the keys are the alternatives and `list` itself is
--                  attached as capture (pattern / table)
--   checked      : hash, a key is replaced by its value unless both are the
--                  same, in which case the key is matched without capture
--   otherwise    : hash, every key is replaced by its value
--
-- In all cases the keys are sorted and then prepended one by one. That way a
-- longer keyword ends up in front of a keyword that is a prefix of it ('aaa'
-- before 'aa') and `pp` stays the last alternative.
function lpeg.append(list,pp,delayed,checked)
    local p = pp
    if #list > 0 then
        local keys = copyindexed(list)
        sort(keys)
        -- walk forward: prepending already reverses the sorted order
        for i=1,#keys do
            local k = keys[i]
            if p then
                p = P(k) + p
            else
                p = P(k)
            end
        end
    elseif delayed then -- hm, it looks like the lpeg parser resolves anyway
        local keys = sortedkeys(list)
        if p then
            for i=1,#keys do
                p = P(keys[i])/list + p
            end
        else
            for i=1,#keys do
                local k = keys[i]
                if p then
                    p = P(k) + p
                else
                    p = P(k)
                end
            end
            if p then
                p = p / list
            end
        end
    elseif checked then
        -- problem: substitution gives a capture
        local keys = sortedkeys(list)
        for i=1,#keys do
            local k = keys[i]
            local v = list[k]
            if p then
                if k == v then
                    p = P(k) + p
                else
                    p = P(k)/v + p
                end
            else
                if k == v then
                    p = P(k)
                else
                    p = P(k)/v
                end
            end
        end
    else
        local keys = sortedkeys(list)
        for i=1,#keys do
            local k = keys[i]
            local v = list[k]
            if p then
                p = P(k)/v + p
            else
                p = P(k)/v
            end
        end
    end
    return p
end
784 785
-- inspect(lpeg.append({ a = "1", aa = "1", aaa = "1" } ,nil,true))
786
-- inspect(lpeg.append({ ["degree celsius"] = "1", celsius = "1", degree = "1" } ,nil,true))
787 788
-- function lpeg.exact_match(words,case_insensitive)
789
-- local pattern = concat(words)
790
-- if case_insensitive then
791
-- local pattern = S(upper(characters)) + S(lower(characters))
792
-- local list = { }
793
-- for i=1,#words do
794
-- list[lower(words[i])] = true
795
-- end
796
-- return Cmt(pattern^1, function(_,i,s)
797
-- return list[lower(s)] and i
798
-- end)
799
-- else
800
-- local pattern = S(concat(words))
801
-- local list = { }
802
-- for i=1,#words do
803
-- list[words[i]] = true
804
-- end
805
-- return Cmt(pattern^1, function(_,i,s)
806
-- return list[s] and i
807
-- end)
808
-- end
809
-- end
810 811
-- experiment:
812 813
local
p_false
=
P
(
false
)
814
local
p_true
=
P
(
true
)
815 816
-- local function collapse(t,x)
817
-- if type(t) ~= "table" then
818
-- return t, x
819
-- else
820
-- local n = next(t)
821
-- if n == nil then
822
-- return t, x
823
-- elseif next(t,n) == nil then
824
-- -- one entry
825
-- local k = n
826
-- local v = t[k]
827
-- if type(v) == "table" then
828
-- return collapse(v,x..k)
829
-- else
830
-- return v, x .. k
831
-- end
832
-- else
833
-- local tt = { }
834
-- for k, v in next, t do
835
-- local vv, kk = collapse(v,k)
836
-- tt[kk] = vv
837
-- end
838
-- return tt, x
839
-- end
840
-- end
841
-- end
842 843
local
lower
=
utf
and
utf
.
lower
or
string
.
lower
844
local
upper
=
utf
and
utf
.
upper
or
string
.
upper
845 846
-- Install the functions used for lower and upper casing; an argument that is
-- not given leaves the current function in place.
function lpeg.setutfcasers(l,u)
    if l then
        lower = l
    end
    if u then
        upper = u
    end
end
850 851
-- Convert a character tree into a pattern (case sensitive). A key maps to
-- true when a word ends there, or to a subtree when longer words continue;
-- the "" key is only a marker and is skipped. With `rest` set the pattern
-- also succeeds without consuming anything more.
local function make1(t,rest)
    local p    = p_false
    local keys = sortedkeys(t)
    for i=1,#keys do
        local k = keys[i]
        local v = t[k]
        if k == "" or v == false then
            -- the marker, or a value that can't happen
        elseif v == true then
            p = p + P(k) * p_true
        else
            p = p + P(k) * make1(v,v[""])
        end
    end
    if rest then
        p = p + p_true
    end
    return p
end
872 873
-- Convert a character tree into a pattern that accepts every key in its
-- lower as well as its upper cased form. A key maps to true when a word ends
-- there, or to a subtree when longer words continue; the "" key is only a
-- marker and is skipped. With `rest` set the pattern also succeeds without
-- consuming anything more.
local function make2(t,rest) -- only ascii
    local p    = p_false
    local keys = sortedkeys(t)
    for i=1,#keys do
        local k = keys[i]
        local v = t[k]
        if k == "" or v == false then
            -- the marker, or a value that can't happen
        elseif v == true then
            p = p + (P(lower(k)) + P(upper(k))) * p_true
        else
            p = p + (P(lower(k)) + P(upper(k))) * make2(v,v[""])
        end
    end
    if rest then
        p = p + p_true
    end
    return p
end
894 895
-- Convert a set of words into one pattern by first building a tree with one
-- level per byte and then turning that tree into a pattern.
--
--   list        : either an indexed list of words or a hash of which the keys
--                 are the words
--   insensitive : when set the pattern is built by make2 instead of make1
--
-- In the finished tree a byte maps to true when a word ends there and nothing
-- longer follows, or to a subtree when there are longer words; such a subtree
-- has a "" entry when a word ends at that point too. The value false is only
-- used as placeholder while a word is being added.
local function utfchartabletopattern(list,insensitive) -- goes to util-lpg
    local tree = { }

    -- Add one word to the tree (the same code serves both kinds of input).
    local function insert(s)
        local t = tree
        local p, pk -- parent node and the key under which t lives in it
        for c in gmatch(s,".") do
            if t == true then
                -- a shorter word ended here: make it a subtree and keep
                -- the end-of-word marker
                t = { [c] = true, [""] = true }
                p[pk] = t
                p = t
                t = false
            elseif t == false then
                -- placeholder of the word being added: grow a subtree
                t = { [c] = false }
                p[pk] = t
                p = t
                t = false
            else
                local tc = t[c]
                if not tc then
                    tc = false
                    t[c] = false
                end
                p = t
                t = tc
            end
            pk = c
        end
        if t == false then
            p[pk] = true
        elseif t == true then
            -- okay
        else
            t[""] = true
        end
    end

    local n = #list
    if n == 0 then
        for s in next, list do
            insert(s)
        end
    else
        for i=1,n do
            insert(list[i])
        end
    end
 -- collapse(tree,"") -- needs testing, maybe optional, slightly faster because P("x")*P("X") seems slower than P"(xX") (why)
 -- inspect(tree)
    return (insensitive and make2 or make1)(tree)
end
972 973
lpeg
.
utfchartabletopattern
=
utfchartabletopattern
974 975
-- Returns a function that rewrites a string: every substring matched by the
-- pattern built from 'list' is looked up in 'list' (lpeg's pattern/table
-- capture), anything else is stepped over with utf8character. When the match
-- delivers nothing the original string is returned.
function lpeg.utfreplacer(list,insensitive)
    local matcher  = utfchartabletopattern(list,insensitive)
    local replacer = Cs((matcher/list + utf8character)^0)
    return function(str)
        local replaced = lpegmatch(replacer,str)
        if replaced then
            return replaced
        end
        return str
    end
end
981 982
-- local t = { "start", "stoep", "staart", "paard" }
983
-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/string.upper + 1)^1)
984 985
-- local t = { "a", "abc", "ac", "abe", "abxyz", "xy", "bef","aa" }
986
-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/string.upper + 1)^1)
987 988
-- inspect(lpegmatch(p,"a")=="A")
989
-- inspect(lpegmatch(p,"aa")=="AA")
990
-- inspect(lpegmatch(p,"aaaa")=="AAAA")
991
-- inspect(lpegmatch(p,"ac")=="AC")
992
-- inspect(lpegmatch(p,"bc")=="bc")
993
-- inspect(lpegmatch(p,"zzbczz")=="zzbczz")
994
-- inspect(lpegmatch(p,"zzabezz")=="zzABEzz")
995
-- inspect(lpegmatch(p,"ab")=="Ab")
996
-- inspect(lpegmatch(p,"abc")=="ABC")
997
-- inspect(lpegmatch(p,"abe")=="ABE")
998
-- inspect(lpegmatch(p,"xa")=="xA")
999
-- inspect(lpegmatch(p,"bx")=="bx")
1000
-- inspect(lpegmatch(p,"bax")=="bAx")
1001
-- inspect(lpegmatch(p,"abxyz")=="ABXYZ")
1002
-- inspect(lpegmatch(p,"foobarbefcrap")=="foobArBEFcrAp")
1003 1004
-- local t = { ["^"] = 1, ["^^"] = 2, ["^^^"] = 3, ["^^^^"] = 4 }
1005
-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/t + 1)^1)
1006
-- inspect(lpegmatch(p," ^ ^^ ^^^ ^^^^ ^^^^^ ^^^^^^ ^^^^^^^ "))
1007 1008
-- local t = { ["^^"] = 2, ["^^^"] = 3, ["^^^^"] = 4 }
1009
-- local p = lpeg.Cs((lpeg.utfchartabletopattern(t)/t + 1)^1)
1010
-- inspect(lpegmatch(p," ^ ^^ ^^^ ^^^^ ^^^^^ ^^^^^^ ^^^^^^^ "))
1011 1012
-- lpeg.utfchartabletopattern {
1013
-- utfchar(0x00A0), -- nbsp
1014
-- utfchar(0x2000), -- enquad
1015
-- utfchar(0x2001), -- emquad
1016
-- utfchar(0x2002), -- enspace
1017
-- utfchar(0x2003), -- emspace
1018
-- utfchar(0x2004), -- threeperemspace
1019
-- utfchar(0x2005), -- fourperemspace
1020
-- utfchar(0x2006), -- sixperemspace
1021
-- utfchar(0x2007), -- figurespace
1022
-- utfchar(0x2008), -- punctuationspace
1023
-- utfchar(0x2009), -- breakablethinspace
1024
-- utfchar(0x200A), -- hairspace
1025
-- utfchar(0x200B), -- zerowidthspace
1026
-- utfchar(0x202F), -- narrownobreakspace
1027
-- utfchar(0x205F), -- math thinspace
1028
-- }
1029 1030
-- a few handy ones:
1031
--
1032
-- faster than find(str,"[\n\r]") when match and # > 7 and always faster when # > 3
1033 1034
-- Built by lpeg.finder from the eol pattern; going by the sketch at the end of
-- the line it acts like (1-eol)^0 * eol, so it looks for the first end-of-line
-- in the subject.
patterns.containseol = lpeg.finder(eol) -- (1-eol)^0 * eol
1035 1036
-- The next pattern^n variant is based on an approach suggested
1037
-- by Roberto: constructing a big repetition in chunks.
1038
--
1039
-- Being sparse is not needed: it only complicates matters, and
1040
-- the number of redundant entries is not that large.
1041 1042
-- Helper for lpeg.times. It fills the grammar table 'result' so that its
-- "start" rule becomes a sequence of chunk rules that together repeat rule "1"
-- n times. 'step' is the chunk size handled by this call (a power of two); the
-- remainder is passed on to the next call with half that size. Once a "start"
-- rule exists, every rule of size 'step' is defined as two rules of half that
-- size.
--
-- Rule names are made from floor(step) so that they are always integer strings
-- ("2", "1"). With a plain tostring(step) a float step gives "2.0" and "1.0"
-- in Lua 5.3 and later, which never matches the "1" rule set up by the caller.
--
-- The recursion ends after the step of size 1. Smaller (fractional) steps can
-- not contribute anything for an integer n.
local function nextstep(n,step,result)
    local m    = n % step      -- mod(n,step): left over for the smaller steps
    local d    = floor(n/step) -- div(n,step): number of chunks of this size
    local name = tostring(floor(step))
    if d > 0 then
        local v = V(name)
        local s = result.start
        for i=1,d do
            if s then
                s = v * s
            else
                s = v
            end
        end
        result.start = s
    end
    if step > 1 and result.start then
        local v = V(tostring(floor(step/2)))
        result[name] = v * v
    end
    if step > 1 then
        return nextstep(m,step/2,result)
    else
        return result
    end
end
1067 1068
-- Returns a pattern that matches 'pattern' exactly n times. The repetition is
-- built as a grammar in chunks by nextstep, starting with chunks of size 2^16.
--
-- For n < 1 nextstep never creates the "start" rule, so P() would fail on a
-- grammar without initial rule; zero repetitions is the empty match instead.
function lpeg.times(pattern,n)
    if n < 1 then
        return P(true)
    end
    return P(nextstep(n,2^16,{ "start", ["1"] = pattern }))
end
1071 1072
-- local p = lpeg.Cs((1 - lpeg.times(lpeg.P("AB"),25))^1)
1073
-- local s = "12" .. string.rep("AB",20) .. "34" .. string.rep("AB",30) .. "56"
1074
-- inspect(p)
1075
-- print(lpeg.match(p,s))
1076 1077
-- moved here (before util-str)
1078 1079
do

    -- stripzeros: removes trailing zeros from the fraction of every number
    -- found in a string; a fraction with only zeros goes away including the
    -- period.

    local zerosatend = zero^0 * -digit -- suggested by Roberto
    local onlyzeros  = (period * zerosatend) / ""
    local fraction   = period * (digit - zerosatend)^1 * (zerosatend / "")
    local number     = digits * (onlyzeros + fraction)

    lpeg.patterns.stripzeros = Cs((number + 1)^0) -- multiple in string

    -- stripzero: same idea, but for a string that is one number; here the
    -- zeros must run up to the end of the string.

    local nonzero       = digit - zero
    local zerostillend  = zero^1 * endofstring
    local dropfraction  = (period * zerostillend) / ""
    local keepfraction  = period * (nonzero^1 + (zerostillend / "") + zero^1)^0
    local floatstripper = Cs((1 - period)^0 * (dropfraction + keepfraction + endofstring))

    lpeg.patterns.stripzero = floatstripper -- slightly more efficient but expects a float !

    -- local sample = "bla 11.00 bla 11 bla 0.1100 bla 1.00100 bla 0.00 bla 0.001 bla 1.1100 bla 0.100100100 bla 0.00100100100"
    -- collectgarbage("collect")
    -- str = string.rep(sample,10000)
    -- local ts = os.clock()
    -- lpegmatch(floatstripper,str)
    -- print(#str, os.clock()-ts, lpegmatch(floatstripper,sample))

end
1109 1110
-- for practical reasons we keep this here:
1111 1112
-- Lookup tables between single bytes and their textual representations. The
-- byte_to_* tables are indexed by a one character string; hex_to_byte accepts
-- a two digit hexadecimal string in either all lowercase or all uppercase.

local byte_to_HEX = { }
local byte_to_hex = { }
local byte_to_dec = { } -- for md5
local hex_to_byte = { }

for code=0,255 do
    local chr   = char(code)
    local lowerhex = format("%02x",code)
    local upperhex = format("%02X",code)
    byte_to_dec[chr] = format("%03i",code)
    byte_to_hex[chr] = lowerhex
    byte_to_HEX[chr] = upperhex
    hex_to_byte[upperhex] = chr
    hex_to_byte[lowerhex] = chr
end
1128 1129
-- Converters built on the lookup tables above: the single unit variants map
-- one chunk (two characters for hex input, one character otherwise) through
-- a table, the plural variants apply that to a whole string by substitution.

-- hexadecimal pairs -> bytes

local hextobyte  = P(2) / hex_to_byte
local hextobytes = Cs(hextobyte^0)

patterns.hextobyte  = hextobyte
patterns.hextobytes = hextobytes

-- bytes -> uppercase hexadecimal

local bytetoHEX  = P(1) / byte_to_HEX
local bytestoHEX = Cs(bytetoHEX^0)

patterns.bytetoHEX  = bytetoHEX
patterns.bytestoHEX = bytestoHEX

-- bytes -> lowercase hexadecimal

local bytetohex  = P(1) / byte_to_hex
local bytestohex = Cs(bytetohex^0)

patterns.bytetohex  = bytetohex
patterns.bytestohex = bytestohex

-- bytes -> three digit decimal

local bytetodec  = P(1) / byte_to_dec
local bytestodec = Cs(bytetodec^0)

patterns.bytetodec  = bytetodec
patterns.bytestodec = bytestodec
1146 1147
-- Replaces every byte of 's' by its two digit uppercase hexadecimal code. A
-- nil, false or empty 's' is returned as it is.
function string.toHEX(s)
    if s and s ~= "" then
        return lpegmatch(bytestoHEX,s)
    else
        return s
    end
end
1154 1155
-- Replaces every byte of 's' by its two digit lowercase hexadecimal code. A
-- nil, false or empty 's' is returned as it is.
function string.tohex(s)
    if s and s ~= "" then
        return lpegmatch(bytestohex,s)
    else
        return s
    end
end
1162 1163
-- Replaces every byte of 's' by its three digit decimal code. A nil, false or
-- empty 's' is returned as it is.
function string.todec(s)
    if s and s ~= "" then
        return lpegmatch(bytestodec,s)
    else
        return s
    end
end
1170 1171
-- Converts a string of two digit hexadecimal codes into the bytes they stand
-- for (using the hextobytes pattern). A nil, false or empty 's' is returned
-- as it is.
function string.tobytes(s)
    if s and s ~= "" then
        return lpegmatch(hextobytes,s)
    else
        return s
    end
end
1178 1179
-- local h = "ADFE0345"
1180
-- local b = lpegmatch(patterns.hextobytes,h)
1181
-- print(h,b,string.tohex(b),string.toHEX(b))
1182 1183
-- Cache of the patterns made by containsws, indexed by the word looked for.
-- From here on this local hides any 'patterns' declared earlier in the file.

local patterns = { } -- can be made weak

-- Returns (and caches) a pattern that captures true when 'what' is present as
-- a whitespace delimited word: either at the very start of the subject or
-- right after whitespace, and in both cases followed by whitespace or the end
-- of the string. Otherwise it captures false.

local function containsws(what)
    local cached = patterns[what]
    if cached then
        return cached
    end
    local word      = P(what) * (whitespace + endofstring) * Cc(true)
    local spaceword = whitespace * P(word)
    local pattern   = P(word) + P(1 - spaceword)^0 * spaceword + Cc(false)
    patterns[what] = pattern
    return pattern
end

lpeg.containsws = containsws
1197 1198
-- Matches 'str' against the (cached) containsws pattern for 'what' and
-- returns the outcome of that match.
function string.containsws(str,what)
    local pattern = patterns[what]
    if not pattern then
        pattern = containsws(what)
    end
    return lpegmatch(pattern,str)
end
1201