lxml-tab.lua /size: 61 Kb    last modification: 2021-10-28 13:50
1
-- Make sure the global module registry exists, then describe this file in it.
modules = modules or { }

modules['lxml-tab'] = {
    version   = 1.001,
    comment   = "this module is the basis for the lxml-* ones",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}
8 9
-- this module needs a cleanup: check latest lpeg, passing args, (sub)grammar, etc etc
10
-- stripping spaces from e.g. cont-en.xml saves .2 sec runtime so it's not worth the
11
-- trouble
12 13
-- todo: when serializing optionally remap named entities to hex (if known in char-ent.lua)
14
-- maybe when letter -> utf, else name .. then we need an option to the serializer .. a bit
15
-- of work so we delay this till we cleanup
16 17
-- Entity tracing flag; it follows the state of the "xml.entities" tracker.
local trace_entities = false

trackers.register("xml.entities", function(state)
    trace_entities = state
end)
18 19
-- Reporter used for all messages of this module. When the logs library is
-- present its "xml"/"core" reporter is used, otherwise we fall back on a
-- plain formatted print.
local report_xml

if logs then
    report_xml = logs.reporter("xml","core")
end

if not report_xml then
    report_xml = function(...)
        print(string.format(...))
    end
end
20 21
--[[ldx--
<p>The parser used here is inspired by the variant discussed in the lua book, but
handles comments and processing instructions, has a different structure, provides
parent access; a first version used different trickery but was less optimized so we
went this route. First we had a find based parser, now we have an <l n='lpeg'/> based one.
The find based parser can be found in l-xml-edu.lua along with other older code.</p>
--ldx]]--
28 29
-- Deeply nested xml files need more backtrack room than lpeg provides by
-- default; not every lpeg version offers setmaxstack so we test for it.
do
    local setmaxstack = lpeg.setmaxstack
    if setmaxstack then
        setmaxstack(1000)
    end
end
30 31
xml
=
xml
or
{
}
32
local
xml
=
xml
33 34
--~ local xml = xml
35 36
local
concat
,
remove
,
insert
=
table
.
concat
,
table
.
remove
,
table
.
insert
37
local
type
,
next
,
setmetatable
,
getmetatable
,
tonumber
,
rawset
,
select
=
type
,
next
,
setmetatable
,
getmetatable
,
tonumber
,
rawset
,
select
38
local
lower
,
find
,
match
,
gsub
=
string
.
lower
,
string
.
find
,
string
.
match
,
string
.
gsub
39
local
sort
=
table
.
sort
40
local
utfchar
=
utf
.
char
41
local
lpegmatch
,
lpegpatterns
=
lpeg
.
match
,
lpeg
.
patterns
42
local
P
,
S
,
R
,
C
,
V
,
C
,
Cs
=
lpeg
.
P
,
lpeg
.
S
,
lpeg
.
R
,
lpeg
.
C
,
lpeg
.
V
,
lpeg
.
C
,
lpeg
.
Cs
43
local
formatters
=
string
.
formatters
44 45
--[[ldx--
<p>First a hack to enable namespace resolving. A namespace is characterized by
a <l n='url'/>. The following function associates a namespace prefix with a
pattern. We use <l n='lpeg'/>, which in this case is more than twice as fast as a
find based solution where we loop over an array of patterns. Less code and
much cleaner.</p>
--ldx]]--
52 53
do
-- begin of namespace closure (we ran out of locals)
54 55
xml
.
xmlns
=
xml
.
xmlns
or
{
}
56 57
--[[ldx--
<p>The next function associates a namespace prefix with an <l n='url'/>. This
normally happens independent of parsing.</p>

<typing>
xml.registerns("mml","mathml")
</typing>
--ldx]]--
65 66
local
check
=
P
(
false
)
67
local
parse
=
check
68 69
-- Register a namespace: whenever 'pattern' is found somewhere in a (lowercased)
-- url, that url resolves to 'namespace'.
--
-- namespace : the prefix that is returned on a match
-- pattern   : a string (matched case insensitively because urls are lowercased
--             before matching) or an lpeg pattern, which is used as-is
--
-- Only non-lpeg values are passed through 'lower'; calling 'lower' on an lpeg
-- pattern raises an error, which made the documented "pattern can be an lpeg"
-- case unusable.
function xml.registerns(namespace,pattern) -- pattern can be an lpeg
    if lpeg.type(pattern) ~= "pattern" then
        pattern = lower(pattern)
    end
    -- extend the ordered choice of known namespaces
    check = check + C(P(pattern)) / namespace
    -- search for any registered pattern at any position in the url
    parse = P { P(check) + 1 * V(1) }
end
73 74
--[[ldx--
<p>The next function also registers a namespace, but this time we map a
given namespace prefix onto a registered one, using the given
<l n='url'/>. This is used for attributes like <t>xmlns:m</t>.</p>

<typing>
xml.checkns("m","http://www.w3.org/mathml")
</typing>
--ldx]]--
83 84
-- Map the prefix 'namespace' onto the registered namespace that matches 'url'.
-- Nothing is stored when the url is unknown or when the prefix already equals
-- the registered name.
function xml.checkns(namespace,url)
    local registered = lpegmatch(parse,lower(url))
    if not registered or registered == namespace then
        return
    end
    xml.xmlns[namespace] = registered
end
90 91
--[[ldx--
<p>Next we provide a way to turn an <l n='url'/> into a registered
namespace. This is used for the <t>xmlns</t> attribute.</p>

<typing>
resolvedns = xml.resolvens("http://www.w3.org/mathml")
</typing>

This returns <t>mml</t>.
--ldx]]--
101 102
-- Return the registered namespace that matches 'url' (compared in lowercase)
-- or an empty string when no registered pattern matches.
function xml.resolvens(url)
    local registered = lpegmatch(parse,lower(url))
    if registered then
        return registered
    end
    return ""
end
105 106
--[[ldx--
<p>A namespace in an element can be remapped onto the registered
one efficiently by using the <t>xml.xmlns</t> table.</p>
--ldx]]--
110 111
end
-- end of namespace closure
112 113
--[[ldx--
<p>This version uses <l n='lpeg'/>. We follow the same approach as before, stack and top and
such. This version is about twice as fast which is mostly due to the fact that
we don't have to prepare the stream for cdata, doctype etc etc. This variant is
dedicated to Luigi Scarso, who challenged me with 40 megabyte <l n='xml'/> files that
took 12.5 seconds to load (1.5 for file io and the rest for tree building). With
the <l n='lpeg'/> implementation we got that down to less than 7.3 seconds. Loading the 14
<l n='context'/> interface definition files (2.6 meg) went down from 1.05 seconds to 0.55.</p>

<p>Next comes the parser. The rather messy doctype definition comes in many
disguises so it is no surprise that later on we have to dedicate quite some
<l n='lpeg'/> code to it.</p>

<typing>
<!DOCTYPE Something PUBLIC "... ..." "..." [ ... ] >
<!DOCTYPE Something PUBLIC "... ..." "..." >
<!DOCTYPE Something SYSTEM "... ..." [ ... ] >
<!DOCTYPE Something SYSTEM "... ..." >
<!DOCTYPE Something [ ... ] >
<!DOCTYPE Something >
</typing>

<p>The code may look a bit complex but this is mostly due to the fact that we
resolve namespaces and attach metatables. There is only one public function:</p>

<typing>
local x = xml.convert(somestring)
</typing>

<p>An optional second boolean argument tells this function not to create a root
element.</p>

<p>Valid entities are:</p>

<typing>
<!ENTITY xxxx SYSTEM "yyyy" NDATA zzzz>
<!ENTITY xxxx PUBLIC "yyyy" >
<!ENTITY xxxx "yyyy" >
</typing>
--ldx]]--
153 154
-- not just one big nested table capture (lpeg overflow)
155 156
-- Shortcuts to the namespace helpers defined in the namespace closure.
local nsremap, resolvens = xml.xmlns, xml.resolvens

-- Parser state shared by the callbacks below; it is (re)set per conversion by
-- preparexmlstate. The fifth name used to be spelled 'xmlnms', which left every
-- later 'xmlns' access (the stack of active default namespaces) referring to an
-- accidental global. The number of locals is unchanged by this correction.
--
-- NOTE(review): 'reported_at_errors' is also assigned by preparexmlstate without
-- being declared local anywhere in view, so it is a global too; it is not added
-- here because this chunk is known to be tight on locals -- confirm before
-- declaring it.
local stack, level, top, at, xmlns, errorstr
local entities, parameters
local strip, utfize, resolve, cleanup, resolve_predefined, unify_predefined
local dcache, hcache, acache
local mt, dt, nt
local currentfilename, currentline, linenumbers

-- Grammars, assigned elsewhere in the file.
local grammar_parsed_text_one
local grammar_parsed_text_two
local grammar_unparsed_text

-- Entity handlers, defined inside the entity closure further down.
local handle_hex_entity
local handle_dec_entity
local handle_any_entity_dtd
local handle_any_entity_text
173 174
-- Set up the parser state for a new conversion (when a settings table is
-- given) or release all state (when called without settings).
--
-- settings : optional table; the fields read here are linenumbers,
--            strip_cm_and_dt, utfize_entities, resolve_entities,
--            resolve_predefined_entities, unify_predefined_entities,
--            text_cleanup, entities and currentresource
--
-- When utfize_entities or resolve_predefined_entities is nil it defaults to
-- true, and that default is also written back into the settings table.
local function preparexmlstate(settings)
    if not settings then
        -- wipe everything so that nothing of the previous run is kept alive
        linenumbers = false
        stack, level, top, at = nil, nil, nil, nil
        mt, dt, nt = nil, nil, nil
        xmlns = nil
        errorstr = nil
        strip, utfize, resolve = nil, nil, nil
        resolve_predefined, unify_predefined = nil, nil
        cleanup = nil
        entities, parameters = nil, nil
        reported_at_errors = nil
        dcache, hcache, acache = nil, nil, nil
        currentfilename = nil
        currentline = 1
        return
    end
    linenumbers = settings.linenumbers
    stack, level = { }, 0
    top, at = { }, { }
    mt, dt = { }, { }
    nt = 0 -- some 5% faster than #dt on cont-en.xml
    xmlns = { }
    errorstr = nil
    strip = settings.strip_cm_and_dt
    utfize = settings.utfize_entities
    resolve = settings.resolve_entities -- enable this in order to apply the dtd
    resolve_predefined = settings.resolve_predefined_entities -- in case we have escaped entities
    unify_predefined = settings.unify_predefined_entities -- &#038; -> &amp;
    cleanup = settings.text_cleanup
    entities = settings.entities or { }
    currentfilename = settings.currentresource
    currentline = 1
    parameters = { }
    reported_at_errors = { }
    dcache, hcache, acache = { }, { }, { }
    -- both options are on unless explicitly set
    if utfize == nil then
        settings.utfize_entities = true
        utfize = true
    end
    if resolve_predefined == nil then
        settings.resolve_predefined_entities = true
        resolve_predefined = true
    end
end
235 236
-- Create the metatable that is attached to every element of the tree being
-- built: missing keys are looked up in 'root' (will be redefined later).
local function initialize_mt(root)
    local shared = { }
    shared.__index = root
    mt = shared
end
239 240
-- Store key 'k' with value 'v' in the table that serves as __index of the
-- metatable of 'root'.
function xml.setproperty(root,k,v)
    local properties = getmetatable(root).__index
    properties[k] = v
end
243 244
function
xml
.
checkerror
(
top
,
toclose
)
245
return
"
"
-- can be set
246
end
247 248
local
checkns
=
xml
.
checkns
249 250
-- Store one parsed attribute in the pending attribute table 'at'.
--
-- namespace : prefix of the attribute name ("" when there is none)
-- tag       : attribute name without prefix
-- value     : attribute value; run through 'cleanup' first when that is set
--             and the value is not empty
--
-- A plain xmlns attribute also pushes the resolved namespace on the xmlns
-- stack and an xmlns:prefix attribute registers the prefix via checkns.
local function add_attribute(namespace,tag,value)
    if value ~= "" and cleanup then
        value = cleanup(value) -- new
    end
    local key = tag
    if tag == "xmlns" then
        xmlns[#xmlns+1] = resolvens(value)
    elseif namespace == "xmlns" then
        checkns(tag,value)
        key = "xmlns:" .. tag
    elseif namespace ~= "" then
        -- for the moment this way:
        key = namespace .. ":" .. tag
    end
    at[key] = value
end
267 268
-- Handle an empty element (<tag/>): add it to the children of the element
-- that is currently open.
--
-- spacing   : whitespace found before the tag, kept as text when not empty
-- namespace : prefix used in the source ("" when there is none)
-- tag       : element name
--
-- The pending attributes in 'at' become the attributes of the new element,
-- after which 'at' starts fresh.
local function add_empty(spacing,namespace,tag)
    if spacing ~= "" then
        nt = nt + 1
        dt[nt] = spacing
    end
    -- no prefix: use the innermost default namespace, else remap the prefix
    local resolved
    if namespace == "" then
        resolved = xmlns[#xmlns]
    end
    if not resolved then
        resolved = nsremap[namespace] or namespace
    end
    top = stack[level]
    dt = top.dt
    nt = #dt + 1
    local element
    if linenumbers then
        element = {
            ns    = namespace or "",
            rn    = resolved,
            tg    = tag,
            at    = at,
            dt    = { },
            ni    = nt, -- set slot, needed for css filtering
            cf    = currentfilename,
            cl    = currentline,
            __p__ = top,
        }
    else
        element = {
            ns    = namespace or "",
            rn    = resolved,
            tg    = tag,
            at    = at,
            dt    = { },
            ni    = nt, -- set slot, needed for css filtering
            __p__ = top,
        }
    end
    dt[nt] = element
    setmetatable(element,mt)
    -- a default namespace set on an empty element ends with that element
    if at.xmlns then
        remove(xmlns)
    end
    at = { }
end
303 304
-- Handle an opening tag: create the element, make it the current 'top' and
-- push it on the stack. It is attached to its parent when it gets closed.
--
-- spacing   : whitespace found before the tag, kept as text when not empty
-- namespace : prefix used in the source ("" when there is none)
-- tag       : element name
local function add_begin(spacing,namespace,tag)
    if spacing ~= "" then
        nt = nt + 1
        dt[nt] = spacing
    end
    -- no prefix: use the innermost default namespace, else remap the prefix
    local resolved
    if namespace == "" then
        resolved = xmlns[#xmlns]
    end
    if not resolved then
        resolved = nsremap[namespace] or namespace
    end
    -- from here on text and children are collected in a fresh list
    dt = { }
    if linenumbers then
        top = {
            ns    = namespace or "",
            rn    = resolved,
            tg    = tag,
            at    = at,
            dt    = dt,
            ni    = nil, -- preset slot, needed for css filtering
            cf    = currentfilename,
            cl    = currentline,
            __p__ = stack[level],
        }
    else
        top = {
            ns    = namespace or "",
            rn    = resolved,
            tg    = tag,
            at    = at,
            dt    = dt,
            ni    = nil, -- preset slot, needed for css filtering
            __p__ = stack[level],
        }
    end
    setmetatable(top,mt)
    nt = 0
    level = level + 1
    stack[level] = top
    at = { }
end
336 337
-- Handle a closing tag: pop the open element from the stack and append it to
-- the children of its parent.
--
-- spacing   : whitespace found before the tag, kept as text when not empty
-- namespace : prefix of the closing tag (not checked)
-- tag       : element name; a mismatch with the open element is reported and
--             remembered in 'errorstr'
local function add_end(spacing,namespace,tag)
    if spacing ~= "" then
        nt = nt + 1
        dt[nt] = spacing
    end
    local closing = stack[level]
    level = level - 1
    top = stack[level]
    local message
    if level < 1 then
        message = formatters["unable to close %s %s"](tag,xml.checkerror(top,closing) or "")
    elseif closing.tg ~= tag then -- no namespace check
        message = formatters["unable to close %s with %s %s"](closing.tg,tag,xml.checkerror(top,closing) or "")
    end
    if message then
        errorstr = message
        report_xml(message)
    end
    -- continue collecting in the parent
    dt = top.dt
    nt = #dt + 1
    dt[nt] = closing
    closing.ni = nt -- update slot, needed for css filtering
    -- the default namespace set by this element goes out of scope
    if closing.at.xmlns then
        remove(xmlns)
    end
end
360 361
-- local spaceonly = lpegpatterns.whitespace^0 * P(-1)
362
--
363
-- will be an option: dataonly
364
--
365
-- if #text == 0 or lpegmatch(spaceonly,text) then
366
-- return
367
-- end
368 369
-- Add a piece of text to the current child list. Empty text is ignored; when
-- a cleanup function is set the text passes through it first. Text that
-- follows text is appended to the previous string so that adjacent strings
-- end up as one child.
local function add_text(text)
    if text == "" then
        return
    end
    if cleanup then
        text = cleanup(text)
    end
    if nt <= 0 then
        -- first child
        nt = 1
        dt[1] = text
        return
    end
    local previous = dt[nt]
    if type(previous) == "string" then
        dt[nt] = previous .. text
    else
        nt = nt + 1
        dt[nt] = text
    end
end
400 401
-- Add a special node (processing instruction, comment, cdata or doctype) to
-- the current child list.
--
-- what    : kind of node; "@cm@" and "@dt@" nodes are dropped when 'strip' is set
-- spacing : whitespace found before the node, kept as text when not empty
--           (also when the node itself is dropped)
-- text    : content, stored as the only child of the node
local function add_special(what,spacing,text)
    if spacing ~= "" then
        nt = nt + 1
        dt[nt] = spacing
    end
    if strip and (what == "@cm@" or what == "@dt@") then
        return -- forget it
    end
    local node
    if linenumbers then
        node = {
            special = true,
            ns      = "",
            tg      = what,
            ni      = nil, -- preset slot
            dt      = { text },
            cf      = currentfilename,
            cl      = currentline,
        }
    else
        node = {
            special = true,
            ns      = "",
            tg      = what,
            ni      = nil, -- preset slot
            dt      = { text },
        }
    end
    nt = nt + 1
    dt[nt] = node
end
427 428
local
function
set_message
(
txt
)
429
errorstr
=
"
garbage at the end of the file:
"
.
.
gsub
(
txt
,
"
([ \n\r\t]*)
"
,
"
"
)
430
end
431 432
-- Called for an attribute value that is not properly quoted. Each distinct
-- value is reported once and then also flagged in the _error_ field of the
-- pending attributes. The value itself is returned so that it can be used.
local function attribute_value_error(str)
    if reported_at_errors[str] then
        return str
    end
    report_xml("invalid attribute value %a",str)
    reported_at_errors[str] = true
    at._error_ = str
    return str
end
440 441
-- Called for text between the attributes that is not a valid attribute
-- specification. Each distinct string is reported once and then also flagged
-- in the _error_ field of the pending attributes. The string is returned.
local function attribute_specification_error(str)
    if reported_at_errors[str] then
        return str
    end
    report_xml("invalid attribute specification %a",str)
    reported_at_errors[str] = true
    at._error_ = str
    return str
end
449 450
-- I'm sure that this lpeg can be simplified (less captures) but it evolved ...
451
-- so i'm not going to change it now.
452 453
do
454 455
-- In order to overcome lua limitations we wrap entity stuff in a closure.
456 457
local
badentity
=
"
&
"
-- was "&error;"
458 459
-- Replacement text for entities that cannot be converted. Each function gets
-- the entity content (without the leading "&", "&#" or "&#x" and without the
-- trailing ";") and returns what goes into the result. The functions can be
-- replaced or removed by the user.
--
-- The decimal variant used to return "&...;" (dropping the "#") and the
-- named variant used to return "&#x...;" (turning a name into a bogus hex
-- reference). Both now rebuild the reference they were given, in agreement
-- with what the entity handlers in this closure produce when they keep an
-- entity as-is ("&#" .. str .. ";" and "&" .. str .. ";", with 'badentity'
-- for an empty name).
xml.placeholders = {
    unknown_dec_entity = function(str) return formatters["&#%s;"](str) end,
    unknown_hex_entity = function(str) return formatters["&#x%s;"](str) end,
    unknown_any_entity = function(str) return str == "" and badentity or formatters["&%s;"](str) end,
}
464 465
-- Convert a string of hexadecimal digits into the matching utf character.
-- When the string is not a valid hexadecimal number, "h:" followed by the
-- string is returned, plus true as second value to signal the failure.
local function fromhex(s)
    local code = tonumber(s,16)
    if not code then
        return formatters["h:%s"](s), true
    end
    return utfchar(code)
end
473 474
-- Convert a string holding a decimal number into the matching utf character.
-- When the string is not a valid number, "d:" followed by the string is
-- returned, plus true as second value to signal the failure.
local function fromdec(s)
    local code = tonumber(s)
    if not code then
        return formatters["d:%s"](s), true
    end
    return utfchar(code)
end
482 483
-- Pattern that converts a string consisting of one numeric character
-- reference into the character it stands for. Two forms are accepted:
--
--   &#x<hex>;  or  &#<dec>;   a complete reference that spans the whole string
--   #x<hex>    or  #<dec>     only the inner part, running to the end of the string

local p_rest = (1 - P(";"))^0 -- everything up to a semicolon
local p_many = P(1)^0         -- everything that is left

local complete_reference = P("&#") * (P("x") * (p_rest/fromhex) + (p_rest/fromdec)) * P(";") * P(-1)
local partial_reference  = P("#")  * (P("x") * (p_many/fromhex) + (p_many/fromdec))

local parsedentity = complete_reference + partial_reference

xml.parsedentitylpeg = parsedentity
491 492
-- parsing in the xml file
493 494
-- The five predefined xml entities, indexed by character code.
--
-- predefined_unified maps the code of a numeric reference onto the named
-- entity (so that for instance &#38; and &#x26; both become &amp;) and
-- predefined_simplified maps code as well as name onto the character itself.
--
-- The codes are decimal: & = 38, " = 34, ' = 39, < = 60, > = 62. Four of the
-- keys used to be 42, 47, 74 and 76, which are the octal spellings of 34, 39,
-- 60 and 62; read as decimal those are the codes of *, /, J and L, so &#42;
-- was unified into &quot; while &#34; was not recognized at all.

local predefined_unified = {
    [38] = "&amp;",
    [34] = "&quot;",
    [39] = "&apos;",
    [60] = "&lt;",
    [62] = "&gt;",
}

local predefined_simplified = {
    [38] = "&", amp  = "&",
    [34] = '"', quot = '"',
    [39] = "'", apos = "'",
    [60] = "<", lt   = "<",
    [62] = ">", gt   = ">",
}
509 510
local
nofprivates
=
0xF0000
-- shared but seldom used
511 512
-- Mapping tables for the remappers below. The key is the character (or
-- private token) found in a string, the value is what it gets replaced by.
-- The tables grow at runtime: 'unescaped' adds a private token for each
-- named entity it is asked for.

local privates_u = { -- unescaped
    ["&"] = "&amp;",
    ['"'] = "&quot;",
    ["'"] = "&apos;",
    ["<"] = "&lt;",
    [">"] = "&gt;",
}

local privates_p = { } -- needed for roundtrip as well as serialize to tex

local privates_s = { -- for tex
    ['"']  = "&U+22;",
    ["#"]  = "&U+23;",
    ["$"]  = "&U+24;",
    ["%"]  = "&U+25;",
    ["&"]  = "&U+26;",
    ["'"]  = "&U+27;",
    ["<"]  = "&U+3C;",
    [">"]  = "&U+3E;",
    ["\\"] = "&U+5C;",
    ["{"]  = "&U+7B;",
    ["|"]  = "&U+7C;",
    ["}"]  = "&U+7D;",
    ["~"]  = "&U+7E;",
}

local privates_x = { -- for xml
    ['"']  = "&U+22;",
    ["#"]  = "&U+23;",
    ["$"]  = "&U+24;",
    ["%"]  = "&U+25;",
    ["'"]  = "&U+27;",
    ["\\"] = "&U+5C;",
    ["{"]  = "&U+7B;",
    ["|"]  = "&U+7C;",
    ["}"]  = "&U+7D;",
    ["~"]  = "&U+7E;",
}

local privates_n = { } -- keeps track of defined ones

-- One remapper per table, each created with the "dynamic" option.
local escaped       = utf.remapper(privates_u,"dynamic")
local unprivatized  = utf.remapper(privates_p,"dynamic")
local unspecialized = utf.remapper(privates_s,"dynamic")
local despecialized = utf.remapper(privates_x,"dynamic")

xml.unprivatized  = unprivatized
xml.unspecialized = unspecialized
xml.despecialized = despecialized
xml.escaped       = escaped
564 565
-- Return the private token that stands for the named entity 's'. On first
-- use the next free private code point is allocated and the token is
-- registered in the unescape, roundtrip and tex tables, so that those can map
-- it back onto "&" .. s .. ";".
local function unescaped(s)
    local token = privates_n[s]
    if token then
        return token
    end
    nofprivates = nofprivates + 1
    token = utfchar(nofprivates)
    privates_n[s] = token
    local entity = "&" .. s .. ";" -- todo: use char-ent to map to hex
    privates_u[token] = entity
    privates_p[token] = entity
    privates_s[token] = entity
    return token
end
578 579
xml
.
privatetoken
=
unescaped
580
xml
.
privatecodes
=
privates_n
581
xml
.
specialcodes
=
privates_s
582 583
-- Register an extra replacement in the table of special (tex) codes.
--
-- key   : the string to be replaced
-- value : the replacement; when omitted the key is wrapped as an entity,
--         "&" .. key .. ";"
--
-- The default used to be built from an undefined variable 's', so calling
-- this function without a value raised an error instead of using the key.
function xml.addspecialcode(key,value)
    privates_s[key] = value or "&" .. key .. ";"
end
586 587
-- Convert the content of a hexadecimal character reference (the part between
-- "&#x" and ";"). Results are cached per string in hcache.
--
-- * with unify_predefined set, the code of a predefined entity gives the
--   named entity
-- * else, with utfize set, the result is the utf character, or the
--   unknown_hex_entity placeholder when the string is not a number
-- * otherwise the reference is rebuilt unchanged
--
-- The placeholder used to be called as xml.unknown_hex_entity, which does not
-- exist (the function lives in xml.placeholders), so an invalid number raised
-- an error. A removed placeholder now falls through to the empty string.
handle_hex_entity = function(str)
    local h = hcache[str]
    if not h then
        local n = tonumber(str,16)
        h = unify_predefined and predefined_unified[n]
        if h then
            if trace_entities then
                report_xml("utfize, converting hex entity &#x%s; into %a",str,h)
            end
        elseif utfize then
            local unknown = xml.placeholders.unknown_hex_entity
            h = (n and utfchar(n)) or (unknown and unknown(str)) or ""
            if not n then
                report_xml("utfize, ignoring hex entity &#x%s;",str)
            elseif trace_entities then
                report_xml("utfize, converting hex entity &#x%s; into %a",str,h)
            end
        else
            if trace_entities then
                report_xml("found entity &#x%s;",str)
            end
            h = "&#x" .. str .. ";"
        end
        hcache[str] = h
    end
    return h
end
613 614
-- Convert the content of a decimal character reference (the part between
-- "&#" and ";"). Results are cached per string in dcache.
--
-- * with unify_predefined set, the code of a predefined entity gives the
--   named entity
-- * else, with utfize set, the result is the utf character, or the
--   unknown_dec_entity placeholder when the string is not a number
-- * otherwise the reference is rebuilt unchanged
--
-- The placeholder table used to be accessed as 'placeholders', a name that is
-- not in scope inside this closure (the local of that name is declared after
-- it), so an invalid number raised an error. We now go via xml.placeholders;
-- a removed placeholder falls through to the empty string.
handle_dec_entity = function(str)
    local d = dcache[str]
    if not d then
        local n = tonumber(str)
        d = unify_predefined and predefined_unified[n]
        if d then
            if trace_entities then
                report_xml("utfize, converting dec entity &#%s; into %a",str,d)
            end
        elseif utfize then
            local unknown = xml.placeholders.unknown_dec_entity
            d = (n and utfchar(n)) or (unknown and unknown(str)) or ""
            if not n then
                report_xml("utfize, ignoring dec entity &#%s;",str)
            elseif trace_entities then
                report_xml("utfize, converting dec entity &#%s; into %a",str,d)
            end
        else
            if trace_entities then
                report_xml("found entity &#%s;",str)
            end
            d = "&#" .. str .. ";"
        end
        dcache[str] = d
    end
    return d
end
640 641
-- Convert the name of an entity reference (the part between "&" and ";").
-- This is the variant that the grammar attaches to entities found in values.
--
-- With 'resolve' set the name is looked up, in this order: the predefined
-- entities (when resolve_predefined is set), the resolve function (when
-- 'resolve' is a function), the entities table, and finally the
-- unknown_any_entity placeholder. A definition that is a function is called
-- with the name, and a definition that is itself a numeric reference is
-- converted as well. When nothing applies the reference is rebuilt unchanged.
--
-- Without 'resolve' only predefined entities are converted; any other name is
-- replaced by a private token (see 'unescaped') and cached in acache.
--
-- The placeholder table used to be accessed as 'placeholders', a name that is
-- not in scope inside this closure (the local of that name is declared after
-- it), so every unknown entity raised an error when resolving was enabled. We
-- now go via xml.placeholders.
handle_any_entity_dtd = function(str)
    if resolve then
        local a = resolve_predefined and predefined_simplified[str] -- true by default
        if a then
            if trace_entities then
                report_xml("resolving entity &%s; to predefined %a",str,a)
            end
            return a
        end
        if type(resolve) == "function" then
            a = resolve(str,entities) or entities[str]
        else
            a = entities[str]
        end
        if a then
            if type(a) == "function" then
                if trace_entities then
                    report_xml("expanding entity &%s; to function call",str)
                end
                a = a(str) or ""
            end
            a = lpegmatch(parsedentity,a) or a -- for nested
            if trace_entities then
                report_xml("resolving entity &%s; to internal %a",str,a)
            end
            return a
        end
        local unknown_any_entity = xml.placeholders.unknown_any_entity
        if unknown_any_entity then
            a = unknown_any_entity(str) or ""
        end
        if a then
            if trace_entities then
                report_xml("resolving entity &%s; to external %s",str,a)
            end
        else
            -- no placeholder installed: keep the reference as it was
            if trace_entities then
                report_xml("keeping entity &%s;",str)
            end
            if str == "" then
                a = badentity
            else
                a = "&" .. str .. ";"
            end
        end
        return a
    else
        local a = acache[str]
        if not a then
            a = resolve_predefined and predefined_simplified[str]
            if a then
                -- one of the predefined
                if trace_entities then
                    report_xml("entity &%s; becomes %a",str,a)
                end
            elseif str == "" then
                if trace_entities then
                    report_xml("invalid entity &%s;",str)
                end
                a = badentity
            else
                if trace_entities then
                    report_xml("entity &%s; is made private",str)
                end
             -- a = "&" .. str .. ";"
                a = unescaped(str)
            end
            acache[str] = a
        end
        return a
    end
end
715 716
-- Convert the name of an entity reference (the part between "&" and ";").
-- This is the variant that the grammar attaches to entities found in text.
--
-- With 'resolve' set the name is looked up, in this order: the predefined
-- entities (when resolve_predefined is set), the resolve function (when
-- 'resolve' is a function), the entities table, and finally the
-- unknown_any_entity placeholder. A definition that is a function is called
-- with the name. The definition is then matched against
-- grammar_parsed_text_two; when that match gives a number the empty string is
-- returned, otherwise a definition that is itself a numeric reference is
-- converted. When nothing applies the reference is rebuilt unchanged.
--
-- Without 'resolve' only predefined entities are converted; any other name is
-- replaced by a private token (see 'unescaped') and cached in acache.
--
-- Two things were corrected: the placeholder table used to be accessed as
-- 'placeholders', a name that is not in scope inside this closure (the local
-- of that name is declared after it), so every unknown entity raised an error
-- when resolving was enabled; and the "resolving ... to internal" trace
-- message was reported twice for each resolved entity.
handle_any_entity_text = function(str)
    if resolve then
        local a = resolve_predefined and predefined_simplified[str]
        if a then
            if trace_entities then
                report_xml("resolving entity &%s; to predefined %a",str,a)
            end
            return a
        end
        if type(resolve) == "function" then
            a = resolve(str,entities) or entities[str]
        else
            a = entities[str]
        end
        if a then
            if type(a) == "function" then
                if trace_entities then
                    report_xml("expanding entity &%s; to function call",str)
                end
                a = a(str) or ""
            end
            a = lpegmatch(grammar_parsed_text_two,a) or a
            if type(a) == "number" then
                return ""
            end
            a = lpegmatch(parsedentity,a) or a -- for nested
            if trace_entities then
                report_xml("resolving entity &%s; to internal %a",str,a)
            end
            return a
        end
        local unknown_any_entity = xml.placeholders.unknown_any_entity
        if unknown_any_entity then
            a = unknown_any_entity(str) or ""
        end
        if a then
            if trace_entities then
                report_xml("resolving entity &%s; to external %s",str,a)
            end
        else
            -- no placeholder installed: keep the reference as it was
            if trace_entities then
                report_xml("keeping entity &%s;",str)
            end
            if str == "" then
                a = badentity
            else
                a = "&" .. str .. ";"
            end
        end
        return a
    else
        local a = acache[str]
        if not a then
            a = resolve_predefined and predefined_simplified[str]
            if a then
                -- one of the predefined
                if trace_entities then
                    report_xml("entity &%s; becomes %a",str,a)
                end
            elseif str == "" then
                if trace_entities then
                    report_xml("invalid entity &%s;",str)
                end
                a = badentity
            else
                if trace_entities then
                    report_xml("entity &%s; is made private",str)
                end
             -- a = "&" .. str .. ";"
                a = unescaped(str)
            end
            acache[str] = a
        end
        return a
    end
end
798 799
-- for tex
800 801
-- Pattern that converts the inner part of a reference (U+<hex>, #x<hex> or
-- #<dec>, without "&" and ";") into a character. In this variant the
-- characters listed in 'spec' come out as a \Ux{..} command; all other codes
-- become the utf character itself.

local p_rest = (1 - P(";"))^1 -- at least one character, up to a semicolon

local spec = {
    [0x23] = "\\Ux{23}", -- #
    [0x24] = "\\Ux{24}", -- $
    [0x25] = "\\Ux{25}", -- %
    [0x5C] = "\\Ux{5C}", -- \
    [0x7B] = "\\Ux{7B}", -- {
    [0x7C] = "\\Ux{7C}", -- |
    [0x7D] = "\\Ux{7D}", -- }
    [0x7E] = "\\Ux{7E}", -- ~
}

-- codes that are not preset in spec are converted once and then remembered
local hash = table.setmetatableindex(spec, function(cache,code)
    local character = utfchar(code)
    cache[code] = character
    return character
end)

-- The three converters return the character for a valid number; otherwise
-- they return the input prefixed by "u:", "h:" or "d:", plus true.

local function fromuni(s)
    local code = tonumber(s,16)
    if not code then
        return formatters["u:%s"](s), true
    end
    return hash[code]
end

local function fromhex(s)
    local code = tonumber(s,16)
    if not code then
        return formatters["h:%s"](s), true
    end
    return hash[code]
end

local function fromdec(s)
    local code = tonumber(s)
    if not code then
        return formatters["d:%s"](s), true
    end
    return hash[code]
end

local unicode_reference = P("U+") * (p_rest/fromuni)
local numeric_reference = P("#")  * (P("x") * (p_rest/fromhex) + p_rest/fromdec)

local reparsedentity = unicode_reference + numeric_reference
853 854
-- The same pattern once more, but now every code becomes the plain utf
-- character: the cache starts out empty instead of being preset.

local hash = table.setmetatableindex(function(cache,code)
    local character = utfchar(code)
    cache[code] = character
    return character
end)

-- The three converters return the character for a valid number; otherwise
-- they return the input prefixed by "u:", "h:" or "d:", plus true.

local function fromuni(s)
    local code = tonumber(s,16)
    if not code then
        return formatters["u:%s"](s), true
    end
    return hash[code]
end

local function fromhex(s)
    local code = tonumber(s,16)
    if not code then
        return formatters["h:%s"](s), true
    end
    return hash[code]
end

local function fromdec(s)
    local code = tonumber(s)
    if not code then
        return formatters["d:%s"](s), true
    end
    return hash[code]
end

local unicode_reference = P("U+") * (p_rest/fromuni)
local numeric_reference = P("#")  * (P("x") * (p_rest/fromhex) + p_rest/fromdec)

local unescapedentity = unicode_reference + numeric_reference

xml.reparsedentitylpeg  = reparsedentity  -- with \Ux{...} for special tex entities
xml.unescapedentitylpeg = unescapedentity -- normal characters
896 897
end
898 899
-- we use these later on
900 901
local
escaped
=
xml
.
escaped
902
local
unescaped
=
xml
.
unescaped
903
local
placeholders
=
xml
.
placeholders
904 905
--
906 907
-- Called when an ampersand plus entity content is not followed by a
-- semicolon: the problem is reported and the content is passed on as-is.
local function handle_end_entity(str)
    local terminator = ";"
    report_xml("error in entity, %a found without ending %a",str,terminator)
    return str
end
911 912
-- Called for input that fits none of the other rules: the unexpected
-- characters are reported, kept as text in the tree and returned.
local function handle_crap_error(chr)
    report_xml("error in parsing, unexpected %a found",chr)
    add_text(chr)
    return chr
end
917 918
-- Attached to the newline pattern: keeps the current line number up to date.
local function handlenewline()
    local nextline = currentline + 1
    currentline = nextline
end
921 922
-- first = ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#x00F8-#x02FF] |
923
-- [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
924
-- [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
925
-- [#x10000-#xEFFFF]
926
-- rest = "-" | "." | [0-9] | #xB7 | [#x300-#x36F] | [#x203F-#x2040]
927
-- name = first + (first + rest)^1
928
--
929
-- We assume utf and do no real checking!
930 931
-- The basic building blocks of the grammar.
--
-- The two whitespace sets include the space character (U+0020) next to tab
-- (and cr/lf). As given, the sets held only "\t" and "\r\n\t", which cannot
-- work for markup in which names and attributes are separated by spaces; the
-- name 'spacetab' also says that a space belongs in there.

local spacetab         = S(' \t')
local space            = S(' \r\n\t')
local newline          = lpegpatterns.newline / handlenewline -- also counts lines
local anything         = P(1)
local open             = P('<')
local close            = P('>')
local squote           = S("'")
local dquote           = S('"')
local equal            = P('=')
local slash            = P('/')
local colon            = P(':')
local semicolon        = P(';')
local ampersand        = P('&')

-- Names: we assume utf and do no real checking, so every byte above 127 is
-- accepted as part of a name.

----- valid_0          = lpegpatterns.utf8two + lpegpatterns.utf8three + lpegpatterns.utf8four
local valid_0          = R("\128\255") -- basically any encoding without checking (fast)
local valid_1          = R('az', 'AZ') + S('_') + valid_0 -- first character
local valid_2          = valid_1 + R('09') + S('-.')      -- following characters
local valid            = valid_1 * valid_2^0

-- A name always gives two captures: the namespace prefix (empty when there
-- is none) and the name itself.

local name_yes         = C(valid^1) * colon * C(valid^1)
local name_nop         = C(P(true)) * C(valid^1)
local name             = name_yes + name_nop
local utfbom           = lpegpatterns.utfbom -- no capture
local spacing          = C(space^0)

-- Variants that go via 'newline' and therefore keep track of line numbers.

local space_nl         = spacetab + newline
local spacing_nl       = Cs((space_nl)^0)
local anything_nl      = newline + P(1)
958 959
-- Register a parameter entity (<!ENTITY % name "value">): these are kept
-- apart from the normal entities, in the parameters table.
local function weirdentity(k,v)
    if trace_entities then
        report_xml("registering %s entity %a as %a","weird",k,v)
    end
    local registry = parameters
    registry[k] = v
end
965
-- Register a normal entity (<!ENTITY name "value">) in the entities table.
local function normalentity(k,v)
    if trace_entities then
        report_xml("registering %s entity %a as %a","normal",k,v)
    end
    local registry = entities
    registry[k] = v
end
971
-- Register a system entity (<!ENTITY name SYSTEM "value" NDATA n>) in the
-- entities table; the third argument is accepted but not used.
local function systementity(k,v,n)
    if trace_entities then
        report_xml("registering %s entity %a as %a","system",k,v)
    end
    local registry = entities
    registry[k] = v
end
977
-- Register a public entity (<!ENTITY name PUBLIC "value">) in the entities
-- table; the third argument is accepted but not used.
local function publicentity(k,v,n)
    if trace_entities then
        report_xml("registering %s entity %a as %a","public",k,v)
    end
    local registry = entities
    registry[k] = v
end
983
-- Load a file with entity definitions and feed its content to 'pattern'.
--
-- pattern : lpeg pattern that is applied to the loaded data
-- k, v    : only used in the messages
-- n       : name of the file to load
--
-- The file is loaded with resolvers.loadbinfile when that is available and
-- with io.loaddata otherwise (then empty data counts as a failure). When
-- there is no name or loading fails, a message is reported and nothing else
-- happens.
local function entityfile(pattern,k,v,n)
    local okay, data = false, nil
    if n then
        local loadbinfile = resolvers and resolvers.loadbinfile
        if loadbinfile then
            okay, data = loadbinfile(n)
        else
            data = io.loaddata(n)
            okay = data and data ~= ""
        end
    end
    if not okay then
        report_xml("ignoring public entities %a as %a from %a",k,v,n)
        return
    end
    if trace_entities then
        report_xml("loading public entities %a as %a from %a",k,v,n)
    end
    lpegmatch(pattern,data)
end
1003 1004
local
function
install
(
spacenewline
,
spacing
,
anything
)
1005 1006
local
anyentitycontent
=
(
1
-
open
-
semicolon
-
space
-
close
-
ampersand
)
^
0
1007
local
hexentitycontent
=
R
(
"
AF
"
,
"
af
"
,
"
09
"
)
^
1
1008
local
decentitycontent
=
R
(
"
09
"
)
^
1
1009
local
parsedentity
=
P
(
"
#
"
)
/
"
"
*
(
1010
P
(
"
x
"
)
/
"
"
*
(
hexentitycontent
/
handle_hex_entity
)
+
1011
(
decentitycontent
/
handle_dec_entity
)
1012
)
+
(
anyentitycontent
/
handle_any_entity_dtd
)
-- can be Cc(true)
1013
local
parsedentity_text
=
P
(
"
#
"
)
/
"
"
*
(
1014
P
(
"
x
"
)
/
"
"
*
(
hexentitycontent
/
handle_hex_entity
)
+
1015
(
decentitycontent
/
handle_dec_entity
)
1016
)
+
(
anyentitycontent
/
handle_any_entity_text
)
-- can be Cc(false)
1017
local
entity
=
(
ampersand
/
"
"
)
*
parsedentity
*
(
semicolon
/
"
"
)
1018
+
ampersand
*
(
anyentitycontent
/
handle_end_entity
)
1019
local
entity_text
=
(
ampersand
/
"
"
)
*
parsedentity_text
*
(
semicolon
/
"
"
)
1020
+
ampersand
*
(
anyentitycontent
/
handle_end_entity
)
1021 1022
local
text_unparsed
=
Cs
(
(
anything
-
open
)
^
1
)
1023
local
text_parsed
=
(
Cs
(
(
anything
-
open
-
ampersand
)
^
1
)
/
add_text
+
Cs
(
entity_text
)
/
add_text
)
^
1
1024
-- local text_parsed = ((Cs(((anything-open-ampersand)^1) + entity_text))/add_text)^1
1025 1026
local
somespace
=
(
spacenewline
)
^
1
1027
local
optionalspace
=
(
spacenewline
)
^
0
1028 1029
local
value
=
(
squote
*
Cs
(
(
entity
+
(
anything
-
squote
)
)
^
0
)
*
squote
)
+
(
dquote
*
Cs
(
(
entity
+
(
anything
-
dquote
)
)
^
0
)
*
dquote
)
-- ampersand and < also invalid in value
1030 1031
local
endofattributes
=
slash
*
close
+
close
-- recovery of flacky html
1032
local
whatever
=
space
*
name
*
optionalspace
*
equal
1033
local
wrongvalue
=
Cs
(
P
(
entity
+
(
1
-
space
-
endofattributes
)
)
^
1
)
/
attribute_value_error
1034 1035
local
attributevalue
=
value
+
wrongvalue
1036 1037
local
attribute
=
(
somespace
*
name
*
optionalspace
*
equal
*
optionalspace
*
attributevalue
)
/
add_attribute
1038 1039
-- local attributes = (attribute + somespace^-1 * (((1-endofattributes)^1)/attribute_specification_error))^0
1040
local
attributes
=
(
attribute
+
somespace
^
-1
*
(
(
(
anything
-
endofattributes
)
^
1
)
/
attribute_specification_error
)
)
^
0
1041 1042
local
parsedtext
=
text_parsed
-- / add_text
1043
local
unparsedtext
=
text_unparsed
/
add_text
1044
local
balanced
=
P
{
"
[
"
*
(
(
anything
-
S
"
[]
"
)
+
V
(
1
)
)
^
0
*
"
]
"
}
-- taken from lpeg manual, () example
1045 1046
-- todo: combine empty and begin so that we scan attributes only once .. maybe also go for match time captures
1047 1048
local
emptyelement
=
(
spacing
*
open
*
name
*
attributes
*
optionalspace
*
slash
*
close
)
/
add_empty
1049
local
beginelement
=
(
spacing
*
open
*
name
*
attributes
*
optionalspace
*
close
)
/
add_begin
1050
local
endelement
=
(
spacing
*
open
*
slash
*
name
*
optionalspace
*
close
)
/
add_end
1051 1052
-- local commonelement = spacing * open * name * attributes * optionalspace *
1053
-- local cemptyelement = (slash * close) / add_empty
1054
-- local cbeginelement = ( * close) / add_begin
1055 1056
-- todo: combine the opens in:
1057 1058
local
begincomment
=
open
*
P
(
"
!--
"
)
1059
local
endcomment
=
P
(
"
--
"
)
*
close
1060
local
begininstruction
=
open
*
P
(
"
?
"
)
1061
local
endinstruction
=
P
(
"
?
"
)
*
close
1062
local
begincdata
=
open
*
P
(
"
![CDATA[
"
)
1063
local
endcdata
=
P
(
"
]]
"
)
*
close
1064 1065
local
someinstruction
=
C
(
(
anything
-
endinstruction
)
^
0
)
1066
local
somecomment
=
C
(
(
anything
-
endcomment
)
^
0
)
1067
local
somecdata
=
C
(
(
anything
-
endcdata
)
^
0
)
1068 1069
-- todo: separate dtd parser
1070 1071
local
begindoctype
=
open
*
P
(
"
!DOCTYPE
"
)
1072
local
enddoctype
=
close
1073
local
beginset
=
P
(
"
[
"
)
1074
local
endset
=
P
(
"
]
"
)
1075
local
wrdtypename
=
C
(
(
anything
-
somespace
-
P
(
"
;
"
)
)
^
1
)
1076
local
doctypename
=
C
(
(
anything
-
somespace
-
close
)
^
0
)
1077
local
elementdoctype
=
optionalspace
*
P
(
"
<!ELEMENT
"
)
*
(
anything
-
close
)
^
0
*
close
1078 1079
local
basiccomment
=
begincomment
*
(
(
anything
-
endcomment
)
^
0
)
*
endcomment
1080 1081
local
weirdentitytype
=
P
(
"
%
"
)
*
(
somespace
*
doctypename
*
somespace
*
value
)
/
weirdentity
1082
local
normalentitytype
=
(
doctypename
*
somespace
*
value
)
/
normalentity
1083
local
publicentitytype
=
(
doctypename
*
somespace
*
P
(
"
PUBLIC
"
)
*
somespace
*
value
)
/
publicentity
1084 1085
local
systementitytype
=
(
doctypename
*
somespace
*
P
(
"
SYSTEM
"
)
*
somespace
*
value
*
somespace
*
P
(
"
NDATA
"
)
*
somespace
*
doctypename
)
/
systementity
1086
local
entitydoctype
=
optionalspace
*
P
(
"
<!ENTITY
"
)
*
somespace
*
(
systementitytype
+
publicentitytype
+
normalentitytype
+
weirdentitytype
)
*
optionalspace
*
close
1087 1088
local
publicentityfile
=
(
doctypename
*
somespace
*
P
(
"
PUBLIC
"
)
*
somespace
*
value
*
(
somespace
*
value
)
^
0
)
/
function
(
...
)
1089
entityfile
(
entitydoctype
,
...
)
1090
end
1091 1092
local
function
weirdresolve
(
s
)
1093
lpegmatch
(
entitydoctype
,
parameters
[
s
]
)
1094
end
1095 1096
local
function
normalresolve
(
s
)
1097
lpegmatch
(
entitydoctype
,
entities
[
s
]
)
1098
end
1099 1100
local
entityresolve
=
P
(
"
%
"
)
*
(
wrdtypename
/
weirdresolve
)
*
P
(
"
;
"
)
1101
+
P
(
"
&
"
)
*
(
wrdtypename
/
normalresolve
)
*
P
(
"
;
"
)
1102 1103
entitydoctype
=
entitydoctype
+
entityresolve
1104 1105
-- we accept comments in doctypes
1106 1107
local
doctypeset
=
beginset
*
optionalspace
*
P
(
elementdoctype
+
entitydoctype
+
entityresolve
+
basiccomment
+
space
)
^
0
*
optionalspace
*
endset
1108
local
definitiondoctype
=
doctypename
*
somespace
*
doctypeset
1109
local
publicdoctype
=
doctypename
*
somespace
*
P
(
"
PUBLIC
"
)
*
somespace
*
value
*
somespace
*
value
*
somespace
*
doctypeset
1110
local
systemdoctype
=
doctypename
*
somespace
*
P
(
"
SYSTEM
"
)
*
somespace
*
value
*
somespace
*
doctypeset
1111
local
simpledoctype
=
(
anything
-
close
)
^
1
-- * balanced^0
1112
local
somedoctype
=
C
(
(
somespace
*
(
publicentityfile
+
publicdoctype
+
systemdoctype
+
definitiondoctype
+
simpledoctype
)
*
optionalspace
)
^
0
)
1113 1114
local
instruction
=
(
spacing
*
begininstruction
*
someinstruction
*
endinstruction
)
/
function
(
...
)
add_special
(
"
@pi@
"
,
...
)
end
1115
local
comment
=
(
spacing
*
begincomment
*
somecomment
*
endcomment
)
/
function
(
...
)
add_special
(
"
@cm@
"
,
...
)
end
1116
local
cdata
=
(
spacing
*
begincdata
*
somecdata
*
endcdata
)
/
function
(
...
)
add_special
(
"
@cd@
"
,
...
)
end
1117
local
doctype
=
(
spacing
*
begindoctype
*
somedoctype
*
enddoctype
)
/
function
(
...
)
add_special
(
"
@dt@
"
,
...
)
end
1118 1119
local
crap_parsed
=
anything
-
beginelement
-
endelement
-
emptyelement
-
begininstruction
-
begincomment
-
begincdata
-
ampersand
1120
local
crap_unparsed
=
anything
-
beginelement
-
endelement
-
emptyelement
-
begininstruction
-
begincomment
-
begincdata
1121 1122
local
parsedcrap
=
Cs
(
(
crap_parsed
^
1
+
entity_text
)
^
1
)
/
handle_crap_error
1123
local
parsedcrap
=
Cs
(
(
crap_parsed
^
1
+
entity_text
)
^
1
)
/
handle_crap_error
1124
local
unparsedcrap
=
Cs
(
(
crap_unparsed
)
^
1
)
/
handle_crap_error
1125 1126
-- nicer but slower:
1127
--
1128
-- local instruction = (Cc("@pi@") * spacing * begininstruction * someinstruction * endinstruction) / add_special
1129
-- local comment = (Cc("@cm@") * spacing * begincomment * somecomment * endcomment ) / add_special
1130
-- local cdata = (Cc("@cd@") * spacing * begincdata * somecdata * endcdata ) / add_special
1131
-- local doctype = (Cc("@dt@") * spacing * begindoctype * somedoctype * enddoctype ) / add_special
1132 1133
local
trailer
=
space
^
0
*
(
text_unparsed
/
set_message
)
^
0
1134 1135
-- comment + emptyelement + text + cdata + instruction + V("parent"), -- 6.5 seconds on 40 MB database file
1136
-- text + comment + emptyelement + cdata + instruction + V("parent"), -- 5.8
1137
-- text + V("parent") + emptyelement + comment + cdata + instruction, -- 5.5
1138 1139
-- local grammar_parsed_text = P { "preamble",
1140
-- preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer,
1141
-- parent = beginelement * V("children")^0 * endelement,
1142
-- children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction + parsedcrap,
1143
-- }
1144 1145
local
grammar_parsed_text_one
=
P
{
"
preamble
"
,
1146
preamble
=
utfbom
^
0
*
instruction
^
0
*
(
doctype
+
comment
+
instruction
)
^
0
,
1147
}
1148 1149
local
grammar_parsed_text_two
=
P
{
"
followup
"
,
1150
followup
=
V
(
"
parent
"
)
*
trailer
,
1151
parent
=
beginelement
*
V
(
"
children
"
)
^
0
*
endelement
,
1152
children
=
parsedtext
+
V
(
"
parent
"
)
+
emptyelement
+
comment
+
cdata
+
instruction
+
parsedcrap
,
1153
}
1154 1155
-- local grammar_parsed_text_two = P { "followup",
1156
-- followup = beginelement * V("children")^0 * endelement * trailer,
1157
-- children = parsedtext + beginelement * V("children")^0 * endelement + emptyelement + comment + cdata + instruction + parsedcrap,
1158
-- }
1159 1160
-- local grammar_parsed_text_two = P { "followup",
1161
-- followup = commonelement * cbeginelement * V("children")^0 * endelement * trailer,
1162
-- children = parsedtext + commonelement * (cbeginelement * V("children")^0 * endelement + cemptyelement) + comment + cdata + instruction + parsedcrap,
1163
-- }
1164 1165
local
grammar_unparsed_text
=
P
{
"
preamble
"
,
1166
preamble
=
utfbom
^
0
*
instruction
^
0
*
(
doctype
+
comment
+
instruction
)
^
0
*
V
(
"
parent
"
)
*
trailer
,
1167
parent
=
beginelement
*
V
(
"
children
"
)
^
0
*
endelement
,
1168
children
=
unparsedtext
+
V
(
"
parent
"
)
+
emptyelement
+
comment
+
cdata
+
instruction
+
unparsedcrap
,
1169
}
1170 1171
return
grammar_parsed_text_one
,
grammar_parsed_text_two
,
grammar_unparsed_text
1172 1173
end
1174 1175
-- The grammars are built twice: the "_nop" family from the plain space
-- patterns and the "_yes" family from their "_nl" counterparts. The converter
-- picks one family per run, depending on settings.linenumbers.

local grammar_parsed_text_one_nop, grammar_parsed_text_two_nop, grammar_unparsed_text_nop =
    install(space, spacing, anything)

local grammar_parsed_text_one_yes, grammar_parsed_text_two_yes, grammar_unparsed_text_yes =
    install(space_nl, spacing_nl, anything_nl)
1184 1185
-- maybe we will add settings to result as well
1186 1187
-- Converts a string with xml into a tree. The parser state (stack, level, dt,
-- nt, errorstr, mt and the active grammars) lives in variables that are shared
-- with the grammar callbacks; preparexmlstate sets them up and resets them.
--
-- data     : the xml string; the value true flags an already failed attempt
-- settings : optional table; fields read here are linenumbers, preprocessor,
--            parent_root, error_handler, currentresource and no_root
-- detail   : optional message that is used when data == true
--
-- The result is a table. Unless settings.no_root is set it is a "@rt@" root
-- wrapper that also carries the entities and settings. On failure the tree
-- holds one "error" element and result.error is true. In both cases
-- result.statistics is filled in.

local function _xmlconvert_(data,settings,detail)
    settings = settings or { } -- no_root strip_cm_and_dt given_entities parent_root error_handler
    preparexmlstate(settings)
    if settings.linenumbers then
        grammar_parsed_text_one = grammar_parsed_text_one_yes
        grammar_parsed_text_two = grammar_parsed_text_two_yes
        grammar_unparsed_text   = grammar_unparsed_text_yes
    else
        grammar_parsed_text_one = grammar_parsed_text_one_nop
        grammar_parsed_text_two = grammar_parsed_text_two_nop
        grammar_unparsed_text   = grammar_unparsed_text_nop
    end
    local preprocessor = settings.preprocessor
    if data and data ~= "" and type(preprocessor) == "function" then
        data = preprocessor(data,settings) or data -- settings.currentresource
    end
    if settings.parent_root then
        mt = getmetatable(settings.parent_root)
    else
        initialize_mt(top)
    end
    level = level + 1
    stack[level] = top
    top.dt = { }
    dt = top.dt
    nt = 0
    if not data or data == "" then
        errorstr = "empty xml file"
    elseif data == true then
        errorstr = detail or "problematic xml file"
    elseif utfize or resolve then
        -- the preamble and the body are matched in two steps, the second one
        -- continues at the position where the first one stopped
        local m = lpegmatch(grammar_parsed_text_one,data)
        if m then
            m = lpegmatch(grammar_parsed_text_two,data,m)
        end
        if m then
         -- errorstr = "" can be set!
        else
            errorstr = "invalid xml file - parsed text"
        end
    elseif type(data) == "string" then
        if lpegmatch(grammar_unparsed_text,data) then
            errorstr = ""
        else
            errorstr = "invalid xml file - unparsed text"
        end
    else
        errorstr = "invalid xml file - no text at all"
    end
    local result
    if errorstr and errorstr ~= "" then
        result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at = { }, er = true } } }
        setmetatable(result,mt)
        setmetatable(result.dt[1],mt)
        setmetatable(stack,mt)
        local errorhandler = settings.error_handler
        if errorhandler == false then
            -- no error message
        else
            -- a handler function given in the settings is the one that gets
            -- called, anything else falls back on the default handler
            if type(errorhandler) ~= "function" then
                errorhandler = xml.errorhandler
            end
            if errorhandler then
                local currentresource = settings.currentresource
                if currentresource and currentresource ~= "" then
                    errorhandler(formatters["load error in [%s]: %s"](currentresource,errorstr),currentresource)
                else
                    errorhandler(formatters["load error: %s"](errorstr))
                end
            end
        end
    else
        result = stack[1]
    end
    if not settings.no_root then
        result = { special = true, ns = "", tg = '@rt@', dt = result.dt, at = { }, entities = entities, settings = settings }
        setmetatable(result,mt)
        local rdt = result.dt
        for k=1,#rdt do
            local v = rdt[k]
            if type(v) == "table" and not v.special then -- always table -)
                result.ri = k -- rootindex
                v.__p__ = result -- new, experiment, else we cannot go back to settings, we need to test this !
                break
            end
        end
    end
    if errorstr and errorstr ~= "" then
        result.error = true
    else
        errorstr = nil
    end
    result.statistics = {
        errormessage = errorstr,
        entities = {
            decimals      = dcache,
            hexadecimals  = hcache,
            names         = acache,
            intermediates = parameters,
        }
    }
    preparexmlstate() -- resets
    return result
end
1290 1291
-- Because we can have a crash (stack issues) with faulty xml, we wrap this one
1292
-- in a protector:
1293 1294
-- Protected variant of the converter. When the real conversion raises an
-- error we convert again with data set to true, which gives a tree with an
-- error element; a string error message is passed along as detail.

local function xmlconvert(data,settings)
    local ok, result = pcall(_xmlconvert_,data,settings)
    if ok then
        return result
    end
    if type(result) == "string" then
        return _xmlconvert_(true,settings,result)
    end
    return _xmlconvert_(true,settings)
end
1304 1305
-- the public converter is the pcall protected one
xml.convert = xmlconvert
1306 1307
-- Converts data with the settings of an existing tree (xmldata), which is
-- registered as parent_root so that the new tree gets the metatable of the
-- parent. With cleanup set, the text of a leading processing instruction that
-- starts with "xml" is removed.
-- NOTE(review): only the content (dt[1]) of that instruction is removed, the
-- "@pi@" element itself stays in the tree - confirm that this is intended.

function xml.inheritedconvert(data,xmldata,cleanup) -- xmldata is parent
    local settings = xmldata.settings
    if settings then
        settings.parent_root = xmldata -- to be tested
    end
    local xc = xmlconvert(data,settings) -- hm, we might need to locate settings
    if not cleanup then
        return xc
    end
    local children = xc.dt
    local first    = children and children[1]
    if first and first.tg == "@pi@" then
        local content = first.dt
        local pi      = content and content[1]
        if type(pi) == "string" and find(pi,"^xml") then
            remove(content,1)
        end
    end
    return xc
end
1334 1335
--[[ldx-- 1336<p>Packaging data in an xml like table is done with the following 1337function. Maybe it will go away (when not used).</p> 1338--ldx]]
--
1339 1340
-- A tree counts as valid when its first child is a table without the er
-- (error) flag. NOTE: xml.is_valid is defined a second time further down in
-- this file and that later definition is the one that ends up being used.

function xml.is_valid(root)
    local first = root and root.dt and root.dt[1]
    return first and type(first) == "table" and not first.er
end
1343 1344
-- Wraps data in an element table. The tag may carry a namespace prefix
-- ("ns:tag"); without a colon the namespace becomes an empty string. The
-- element gets the currently active metatable.

function xml.package(tag,attributes,data)
    local namespace, name = match(tag,"^(.-):?([^:]+)$")
    local element = {
        ns = namespace,
        tg = name,
        dt = data or "",
        at = attributes or { },
    }
    setmetatable(element,mt)
    return element
end
1350 1351
-- A tree is valid when it exists and its error field is not set; a false or
-- nil root is handed back as it is.

function xml.is_valid(root)
    if not root then
        return root
    end
    return not root.error
end
1354 1355
-- default reporter for load errors, used by the converter
xml.errorhandler = report_xml
1356 1357
--[[ldx-- 1358<p>We cannot load an <l n='lpeg'/> from a filehandle so we need to load 1359the whole file first. The function accepts a string representing 1360a filename or a file handle.</p> 1361--ldx]]
--
1362 1363
-- Loads and converts a file. The first argument is a filename (string) or an
-- object with a read method (file handle). A file that cannot be opened gives
-- an empty string and therefore an error tree. While converting,
-- settings.currentresource holds the first argument; in a settings table
-- passed by the caller it is cleared again afterwards.

function xml.load(filename,settings)
    local data = ""
    if type(filename) == "string" then
        local f = io.open(filename,'r') -- why not 'rb'
        if f then
            data = f:read("*all") -- io.readall(f) ... only makes sense for large files
            f:close()
        end
    elseif filename then -- filehandle
        data = filename:read("*all")
    end
    if not settings then
        return xmlconvert(data,{ currentresource = filename })
    end
    settings.currentresource = filename
    local result = xmlconvert(data,settings)
    settings.currentresource = nil
    return result
end
1384 1385
--[[ldx-- 1386<p>When we inject new elements, we need to convert strings to 1387valid trees, which is what the next function does.</p> 1388--ldx]]
--
1389 1390
local no_root = { no_root = true }

-- Turns a string into a tree (without root wrapper); anything that is not a
-- string is returned untouched. Multiple results of the converter would be
-- returned as a list, a single one as it is.

function xml.toxml(data)
    if type(data) ~= "string" then
        return data
    end
    local root = { xmlconvert(data,no_root) }
    if #root > 1 then
        return root
    end
    return root[1]
end
1400 1401
--[[ldx-- 1402<p>For copying a tree we use a dedicated function instead of the 1403generic table copier. Since we know what we're dealing with we 1404can speed up things a bit. The second argument is not to be used!</p> 1405--ldx]]
--
1406 1407
-- local function copy(old)
1408
-- if old then
1409
-- local new = { }
1410
-- for k,v in next, old do
1411
-- if type(v) == "table" then
1412
-- new[k] = table.copy(v)
1413
-- else
1414
-- new[k] = v
1415
-- end
1416
-- end
1417
-- local mt = getmetatable(old)
1418
-- if mt then
1419
-- setmetatable(new,mt)
1420
-- end
1421
-- return new
1422
-- else
1423
-- return { }
1424
-- end
1425
-- end
1426
--
1427
-- We need to prevent __p__ recursion, so:
1428 1429
-- Dedicated tree copier. The attribute table (at) is copied one level deep,
-- the data table (dt) is copied recursively, all other fields are shared with
-- the original. The metatable of the original is reused. The second argument
-- is for internal use (recursion) only.
--
-- Beware: the __p__ field of the dt table of the original is cleared, and
-- the dt table of the copy gets p in its __p__ field.

local function copy(old,p)
    if not old then
        return { }
    end
    local new = { }
    for key, value in next, old do
        if key == "at" then
            local attributes = { }
            for name, content in next, value do
                attributes[name] = content
            end
            new[key] = attributes
        elseif key == "dt" then
            value.__p__ = nil
            local children = { }
            for i=1,#value do
                local child = value[i]
                if type(child) == "table" then
                    children[i] = copy(child,new)
                else
                    children[i] = child
                end
            end
            new[key] = children
            children.__p__ = p
        else
            new[key] = value -- so we also share entities, etc in root
        end
    end
    local meta = getmetatable(old)
    if meta then
        setmetatable(new,meta)
    end
    return new
end
1466 1467
-- the tree copier is also available to users
xml.copy = copy
1468 1469
--[[ldx--
<p>In <l n='context'/> serializing the tree or parts of the tree is a major
activity which is why the following function is pretty optimized resulting
in a few more lines of code than needed. The variant that uses the formatting
function for all components is about 15% slower than the concatenating
alternative.</p>
--ldx]]--
1476 1477
-- todo: add <?xml version='1.0' standalone='yes'?> when not present
1478 1479
-- Makes sure that a tree with a root index starts with an xml declaration.
-- When no "@pi@" child with an xml version is found, a declaration plus a
-- newline is inserted at the front of the data table. Because that moves all
-- existing children two slots up, the root index (ri) is moved along,
-- otherwise xml.body would return the wrong child afterwards.

function xml.checkbom(root) -- can be made faster
    local ri = root.ri
    if ri then
        local dt = root.dt
        for k=1,#dt do
            local v = dt[k]
            if type(v) == "table" and v.special and v.tg == "@pi@" and find(v.dt[1],"xml.*version=") then
                return
            end
        end
        insert(dt, 1, { special = true, ns = "", tg = "@pi@", dt = { "xml version='1.0' standalone='yes'" } } )
        insert(dt, 2, "\n" )
        if type(ri) == "number" then
            root.ri = ri + 2
        end
    end
end
1492 1493
--[[ldx-- 1494<p>At the cost of some 25% runtime overhead you can first convert the tree to a string 1495and then handle the lot.</p> 1496--ldx]]
--
1497 1498
-- new experimental reorganized serialize
1499 1500
-- formats one attribute as key=value (the %q directive presumably takes care
-- of quoting the value, see the formatters module)
local f_attribute = formatters['%s=%q']
1501 1502
-- we could reuse ats .. for high performance we could also
1503
-- have a multiple handle calls instead of multiple arguments
1504
-- but it's not that critical
1505 1506
-- Serializes one element: the opening tag with (sorted) attributes, the
-- children and the closing tag, or an empty element tag when there are no
-- children. Strings are passed through escaped, child tables go to
-- handlers.serialize. The pieces are handed to handlers.handle as separate
-- arguments and the argument lists are kept exactly as they are because a
-- handler (like print) can be sensitive to them. The third argument is not
-- used.

local function verbose_element(e,handlers,escape) -- options
    local handle    = handlers.handle
    local serialize = handlers.serialize
    local ens, etg, eat, edt, ern = e.ns, e.tg, e.at, e.dt, e.rn
    local ats
    if eat and next(eat) then
        -- we now sort attributes
        local list, n = { }, 0
        for k in next, eat do
            n = n + 1
            list[n] = k
        end
        if n > 1 then
            sort(list)
        end
        for i=1,n do
            local k = list[i]
            list[i] = f_attribute(k,escaped(eat[k]))
        end
        if n == 1 then
            ats = list[1]
        else
            ats = concat(list," ")
        end
    end
    if ern and trace_entities and ern ~= ens then
        ens = ern
    end
    local n         = edt and #edt
    local filled    = n and n > 0
    local prefixed  = ens ~= ""
    local closer    = filled and ">" or "/>"
    -- opening or empty element tag
    if prefixed then
        if ats then
            handle("<",ens,":",etg," ",ats,closer)
        else
            handle("<",ens,":",etg,closer)
        end
    elseif ats then
        handle("<",etg," ",ats,closer)
    else
        handle("<",etg,closer)
    end
    if filled then
        for i=1,n do
            local child = edt[i]
            if type(child) == "string" then
                handle(escaped(child)) -- option: hexify escaped entities
            else
                serialize(child,handlers)
            end
        end
        if prefixed then
            handle("</",ens,":",etg,">")
        else
            handle("</",etg,">")
        end
    end
end
1582 1583
-- Serializes a processing instruction: the first data entry between <? and ?>.

local function verbose_pi(e,handlers)
    local handle = handlers.handle
    handle("<?",e.dt[1],"?>")
end
1586 1587
-- Serializes a comment: the first data entry between <!-- and -->.

local function verbose_comment(e,handlers)
    local handle = handlers.handle
    handle("<!--",e.dt[1],"-->")
end
1590 1591
-- Serializes a cdata section: the first data entry between <![CDATA[ and ]]>.

local function verbose_cdata(e,handlers)
    local handle = handlers.handle
    handle("<![CDATA[",e.dt[1],"]]>")
end
1594 1595
-- Serializes a doctype declaration: the first data entry between <!DOCTYPE
-- and >.

local function verbose_doctype(e,handlers)
    local handle = handlers.handle
    handle("<!DOCTYPE",e.dt[1],">") -- has space at end of string
end
1598 1599
-- The root wrapper itself produces no output, only its data table is
-- serialized.

local function verbose_root(e,handlers)
    local serialize = handlers.serialize
    serialize(e.dt,handlers)
end
1602 1603
-- Text is passed to the handler after escaping.

local function verbose_text(e,handlers)
    local handle = handlers.handle
    handle(escaped(e))
end
1606 1607
-- Serializes a plain list of nodes: strings go to the "@tx@" function, all
-- other entries to handlers.serialize.

local function verbose_document(e,handlers)
    local serialize = handlers.serialize
    local functions = handlers.functions
    for i=1,#e do
        local item = e[i]
        if type(item) ~= "string" then
            serialize(item,handlers)
        else
            functions["@tx@"](item,handlers)
        end
    end
end
1619 1620
-- Main entry of the serializer. When the handler set has an initialize
-- function it is called with the extra arguments; a false or nil result
-- aborts and is returned. A node with a tag is dispatched to the function
-- registered for that tag (or to "@el@"), a node without tag to "@dc@". When
-- there is a finalize function its result is returned.

local function serialize(e,handlers,...)
    if not e then
        return
    end
    local initialize = handlers.initialize
    local finalize   = handlers.finalize
    local functions  = handlers.functions
    if initialize then
        local state = initialize(...)
        if not state then
            return state
        end
    end
    local etg = e.tg
    if etg then
        local action = functions[etg] or functions["@el@"]
        action(e,handlers)
    else
        functions["@dc@"](e,handlers) -- dc ?
    end
    if finalize then
        return finalize()
    end
end
1644 1645
-- Dispatcher for nested nodes: the same as serialize but without the
-- initialize and finalize steps.

local function xserialize(e,handlers)
    if not e then
        return
    end
    local functions = handlers.functions
    local etg       = e.tg
    if etg then
        local action = functions[etg] or functions["@el@"]
        action(e,handlers)
    else
        functions["@dc@"](e,handlers)
    end
end
1658 1659
local handlers = { }

-- Creates a new set of handlers. The set starts as a copy of the set named by
-- settings.parent (default "verbose"). The settings are then merged in:
-- subtables are merged key by key, other values overwrite. With
-- settings.name given, the new set is registered under that name.

local function newhandlers(settings)
    local parent = settings and settings.parent or "verbose"
    local t = table.copy(handlers[parent] or { }) -- merge
    if settings then
        for key, value in next, settings do
            if type(value) ~= "table" then
                t[key] = value
            else
                local target = t[key]
                if not target then
                    target = { }
                    t[key] = target
                end
                for kk, vv in next, value do
                    target[kk] = vv
                end
            end
        end
        local name = settings.name
        if name then
            handlers[name] = t
        end
    end
    utilities.storage.mark(t)
    return t
end
1681 1682
local nofunction = function() end

-- Registers fnc under name in the functions table of a handler set; without
-- fnc a function that does nothing is registered.

function xml.sethandlersfunction(handler,name,fnc)
    local functions = handler.functions
    functions[name] = fnc or nofunction
end
1687 1688
-- Returns the function that a handler set has registered under name.

function xml.gethandlersfunction(handler,name)
    local functions = handler.functions
    return functions[name]
end
1691 1692
-- Returns the handler set that was registered under name (or nil).

function xml.gethandlers(name)
    local set = handlers[name]
    return set
end
1695 1696
-- The "verbose" set is the default parent of all other handler sets: it
-- prints what it gets and has no initialize and finalize steps.

newhandlers({
    name       = "verbose",
    handle     = print,
    serialize  = xserialize,
    initialize = false, -- faster than nil and mt lookup
    finalize   = false, -- faster than nil and mt lookup
    functions  = {
        ["@tx@"] = verbose_text,
        ["@el@"] = verbose_element,
        ["@rt@"] = verbose_root,
        ["@dc@"] = verbose_document,
        ["@dt@"] = verbose_doctype,
        ["@pi@"] = verbose_pi,
        ["@cm@"] = verbose_comment,
        ["@cd@"] = verbose_cdata,
    },
})
1713 1714
--[[ldx-- 1715<p>How you deal with saving data depends on your preferences. For a 40 MB database 1716file the timing on a 2.3 Core Duo are as follows (time in seconds):</p> 1717 1718<lines> 17191.3 : load data from file to string 17206.1 : convert string into tree 17215.3 : saving in file using xmlsave 17226.8 : converting to string using xml.tostring 17233.6 : saving converted string in file 1724</lines> 1725 1726<p>Beware, these were timing with the old routine but measurements will not be that 1727much different I guess.</p> 1728--ldx]]
--
1729 1730
-- maybe this will move to lxml-xml
1731 1732
local result

-- Handler set that writes to a file. The initialize step opens the file with
-- the given name in "wb" mode and returns the handle (nil when that fails,
-- which makes serialize quit), finalize closes it again.

local xmlfilehandler = newhandlers {
    name       = "file",
    handle     = function(...)
        result:write(...)
    end,
    initialize = function(name)
        local handle = io.open(name,"wb")
        result = handle
        return handle
    end,
    finalize   = function()
        result:close()
        return true
    end,
}
1748 1749
-- no checking on writeability here but not faster either
1750
--
1751
-- local xmlfilehandler = newhandlers {
1752
-- initialize = function(name)
1753
-- io.output(name,"wb")
1754
-- return true
1755
-- end,
1756
-- finalize = function()
1757
-- io.close()
1758
-- return true
1759
-- end,
1760
-- handle = io.write,
1761
-- }
1762 1763
-- Saves a tree in the file with the given name. The outcome of the
-- serializer is passed on: true when the file has been written and closed,
-- nil when the file could not be opened (or when there is no root), so a
-- caller can now check for failure.

function xml.save(root,name)
    return serialize(root,xmlfilehandler,name)
end
1766 1767
-- local result
1768
--
1769
-- local xmlstringhandler = newhandlers {
1770
-- name = "string",
1771
-- initialize = function()
1772
-- result = { }
1773
-- return result
1774
-- end,
1775
-- finalize = function()
1776
-- return concat(result)
1777
-- end,
1778
-- handle = function(...)
1779
-- result[#result+1] = concat { ... }
1780
-- end,
1781
-- }
1782 1783
-- The string handler collects all pieces in a shared buffer (result) with r
-- as fill counter; finalize concatenates the filled part.

local result, r, threshold = { }, 0, 512

local xmlstringhandler = newhandlers {
    name       = "string",
    initialize = function()
        r = 0
        return result
    end,
    finalize   = function()
        local done = concat(result,"",1,r)
        -- a buffer that grew beyond the threshold is dropped so that one large
        -- tree does not keep memory occupied; this test has to come before
        -- the counter is reset, otherwise it can never succeed
        if r > threshold then
            result = { }
        end
        r = 0
        return done
    end,
    handle     = function(...)
        for i=1,select("#",...) do
            r = r + 1
            result[r] = select(i,...)
        end
    end,
}
1806 1807
-- Converts a tree to a string: nothing gives an empty string, a string is
-- returned as it is, anything else goes through the string handler.

local function xmltostring(root) -- 25% overhead due to collecting
    if not root then
        return ""
    end
    if type(root) == "string" then
        return root
    end
    return serialize(root,xmlstringhandler) or ""
end
1816 1817
-- The tostring metamethod of trees; it always returns a string.

local function __tostring(root) -- inline
    local str = root and xmltostring(root)
    return str or ""
end
1820 1821
-- Redefinition: sets the shared metatable (mt) to a fresh one that serializes
-- on tostring and that falls back on root when a field is looked up.

function initialize_mt(root) -- redefinition
    mt = {
        __index    = root,
        __tostring = __tostring,
    }
end
1824 1825
-- the serializer and its handler sets are also available to users

xml.tostring        = xmltostring
xml.serialize       = serialize
xml.newhandlers     = newhandlers
xml.defaulthandlers = handlers
1829 1830
--[[ldx-- 1831<p>The next function operated on the content only and needs a handle function 1832that accepts a string.</p> 1833--ldx]]
--
1834 1835
-- Passes all text in a tree, in document order, to the handle function.
-- Special nodes other than the root wrapper are skipped, and without a
-- handle nothing happens at all.

local function xmlstring(e,handle)
    if not handle then
        return
    end
    if e.special and e.tg ~= "@rt@" then
        return
    end
    if not e.tg then
        handle(e)
        return
    end
    local edt = e.dt
    if edt then
        for i=1,#edt do
            xmlstring(edt[i],handle)
        end
    end
end
1849 1850
-- the text collector is also available to users
xml.string = xmlstring
1851 1852
--[[ldx-- 1853<p>A few helpers:</p> 1854--ldx]]
--
1855 1856
--~ xmlsetproperty(root,"settings",settings)
1857 1858
-- Returns the first settings table that is found when going from e upwards
-- along the __p__ (parent) links, or nil when there is none.

function xml.settings(e)
    local node = e
    while node do
        local found = node.settings
        if found then
            return found
        end
        node = node.__p__
    end
    return nil
end
1869 1870
-- Follows the __p__ (parent) links upwards and returns the last node that
-- has none; a node without parent is returned as it is.

function xml.root(e)
    local top    = e
    local parent = top and top.__p__
    while parent do
        top    = parent
        parent = top.__p__
    end
    return top
end
1880 1881
-- Returns what is stored in the __p__ (parent) field of a node.

function xml.parent(root)
    local parent = root.__p__
    return parent
end
1884 1885
-- Returns the child at the root index (ri) when there is one, else the node
-- itself.

function xml.body(root)
    local ri   = root.ri
    local body = ri and root.dt[ri]
    return body or root -- not ok yet
end
1888 1889
-- Returns the name of an element: the tag, prefixed by "namespace:" when the
-- namespace is not empty. Without element an empty string is returned.

function xml.name(root)
    if not root then
        return ""
    end
    local ns, tg = root.ns, root.tg
    if ns ~= "" then
        return ns .. ":" .. tg
    end
    return tg
end
1901 1902
--[[ldx-- 1903<p>The next helper erases an element but keeps the table as it is, 1904and since empty strings are not serialized (effectively) it does 1905not harm. Copying the table would take more time. Usage:</p> 1906--ldx]]
--
1907 1908
-- Erases content in place by storing empty strings: with k given only entry
-- k of dt is blanked, otherwise every entry is. The table itself and its
-- size stay as they are. Without dt nothing happens.

function xml.erase(dt,k)
    if dt then
        if k then
            dt[k] = ""
        else
            -- each slot has to be blanked, so the loop index must be used
            for i=1,#dt do
                dt[i] = ""
            end
        end
    end
end
1917 1918
--[[ldx-- 1919<p>The next helper assigns a tree (or string). Usage:</p> 1920 1921<typing> 1922dt[k] = xml.assign(root) or xml.assign(dt,k,root) 1923</typing> 1924--ldx]]
--
1925 1926
-- Assigns a tree (or string). There are two ways to call this function:
--
--   xml.assign(dt,k,root) : stores the body of root (or root itself when it
--                           is not a table) in dt[k] and returns that value
--   xml.assign(root)      : returns the body of root so that the caller can
--                           do the assignment
--
-- In the single argument form the tree arrives in the first parameter, which
-- is why we fall back on it when no root is given.

function xml.assign(dt,k,root)
    if dt and k then
        dt[k] = type(root) == "table" and xml.body(root) or root
        return dt[k]
    end
    if root == nil then
        root = dt
    end
    if type(root) == "table" then
        return xml.body(root)
    else
        return root
    end
end
1934 1935
-- the following helpers may move
1936 1937
--[[ldx--
<p>The next helper replaces the content of an element by one cdata section that
holds the serialized content, optionally wrapped in an element. Usage:</p>
<typing>
xml.tocdata(e)
xml.tocdata(e,"error")
</typing>
--ldx]]--
1944 1945
-- Replaces the data of e by a single "@cd@" node that holds the serialized
-- old content. When wrapper is given, the content is first put between
-- <wrapper> and </wrapper>. The new node gets the metatable of e and has e
-- as parent.

function xml.tocdata(e,wrapper) -- a few more in the aux module
    local content
    if type(e) == "table" then
        content = xmltostring(e.dt) or e
    else
        content = e or ""
    end
    if wrapper then
        content = formatters["<%s>%s</%s>"](wrapper,content,wrapper)
    end
    local cdata = {
        special = true,
        ns      = "",
        tg      = "@cd@",
        at      = { },
        rn      = "",
        dt      = { content },
        __p__   = e,
    }
    setmetatable(cdata,getmetatable(e))
    e.dt = { cdata }
end
1954 1955
-- Marks the xml declaration of a tree (one with a root index) as standalone.
-- The first "@pi@" child that has an xml version is used. When it already
-- has a standalone attribute its value is set to yes, otherwise the attribute
-- is appended; that way we never end up with the attribute twice. The root is
-- returned.

function xml.makestandalone(root)
    if root.ri then
        local dt = root.dt
        for k=1,#dt do
            local v = dt[k]
            if type(v) == "table" and v.special and v.tg == "@pi@" then
                local txt = v.dt[1]
                if type(txt) == "string" and find(txt,"xml.*version=") then
                    if find(txt,"standalone%s*=") then
                        -- keep the quote style that is used, only change the value
                        v.dt[1] = gsub(txt,"(standalone%s*=%s*)([\"'])[^\"']*%2","%1%2yes%2")
                    else
                        v.dt[1] = txt .. " standalone='yes'"
                    end
                    break
                end
            end
        end
    end
    return root
end
1971 1972
-- Classifies the content of an element: "empty" (no data), "mixed" (more
-- than one child), or, for a single child, "cdata", "comment", "instruction",
-- "declaration", "text" or "element".

function xml.kind(e)
    local dt = e and e.dt
    if not dt then
        return "empty"
    end
    local n = #dt
    if n ~= 1 then
        return n > 0 and "mixed" or "empty"
    end
    local d = dt[1]
    if d.special then
        local tg = d.tg
        if tg == "@cd@" then
            return "cdata"
        elseif tg == "@cm@" then
            return "comment"
        elseif tg == "@pi@" then
            return "instruction"
        elseif tg == "@dt@" then
            return "declaration"
        end
        return "element"
    end
    if type(d) == "string" then
        return "text"
    end
    return "element"
end
1999