lxml-tab.lua /size: 61 Kb    last modification: 2021-10-28 13:50
1if not modules then modules = { } end modules ['lxml-tab'] = {
2    version   = 1.001,
3    comment   = "this module is the basis for the lxml-* ones",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9-- this module needs a cleanup: check latest lpeg, passing args, (sub)grammar, etc etc
10-- stripping spaces from e.g. cont-en.xml saves .2 sec runtime so it's not worth the
11-- trouble
12
13-- todo: when serializing optionally remap named entities to hex (if known in char-ent.lua)
14-- maybe when letter -> utf, else name .. then we need an option to the serializer .. a bit
15-- of work so we delay this till we cleanup
16
17local trace_entities = false  trackers.register("xml.entities", function(v) trace_entities = v end)
18
19local report_xml = logs and logs.reporter("xml","core") or function(...) print(string.format(...)) end
20
21--[[ldx--
22<p>The parser used here is inspired by the variant discussed in the lua book, but
23handles comment and processing instructions, has a different structure, provides
parent access; a first version used different trickery but was less optimized so we
25went this route. First we had a find based parser, now we have an <l n='lpeg'/> based one.
26The find based parser can be found in l-xml-edu.lua along with other older code.</p>
27--ldx]]--
28
29if lpeg.setmaxstack then lpeg.setmaxstack(1000) end -- deeply nested xml files
30
31xml = xml or { }
32local xml = xml
33
34--~ local xml = xml
35
36local concat, remove, insert = table.concat, table.remove, table.insert
37local type, next, setmetatable, getmetatable, tonumber, rawset, select = type, next, setmetatable, getmetatable, tonumber, rawset, select
38local lower, find, match, gsub = string.lower, string.find, string.match, string.gsub
39local sort = table.sort
40local utfchar = utf.char
41local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
42local P, S, R, C, V, C, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.C, lpeg.Cs
43local formatters = string.formatters
44
45--[[ldx--
46<p>First a hack to enable namespace resolving. A namespace is characterized by
47a <l n='url'/>. The following function associates a namespace prefix with a
48pattern. We use <l n='lpeg'/>, which in this case is more than twice as fast as a
49find based solution where we loop over an array of patterns. Less code and
50much cleaner.</p>
51--ldx]]--
52
53do -- begin of namespace closure (we ran out of locals)
54
55xml.xmlns = xml.xmlns or { }
56
57--[[ldx--
58<p>The next function associates a namespace prefix with an <l n='url'/>. This
59normally happens independent of parsing.</p>
60
61<typing>
62xml.registerns("mml","mathml")
63</typing>
64--ldx]]--
65
66local check = P(false)
67local parse = check
68
-- Register a namespace: when the (lowercased) pattern is found, the match
-- yields the given namespace prefix. The search pattern is rebuilt so that a
-- registered pattern may show up anywhere in the string being checked.
-- NOTE(review): the original remark said the pattern can be an lpeg, but it
-- is always passed through lower() first -- confirm.
function xml.registerns(namespace, pattern)
    local matcher = C(P(lower(pattern))) / namespace
    check = check + matcher
    parse = P { P(check) + 1 * V(1) }
end
73
74--[[ldx--
75<p>The next function also registers a namespace, but this time we map a
given namespace prefix onto a registered one, using the given
<l n='url'/>. This is used for attributes like <t>xmlns:m</t>.</p>
78
79<typing>
80xml.checkns("m","http://www.w3.org/mathml")
81</typing>
82--ldx]]--
83
-- Map the given prefix onto the registered namespace that matches the
-- (lowercased) url; nothing happens when no registered pattern matches or
-- when the prefix already equals the registered one.
function xml.checkns(namespace,url)
    local registered = lpegmatch(parse,lower(url))
    if not registered or namespace == registered then
        return
    end
    xml.xmlns[namespace] = registered
end
90
91--[[ldx--
92<p>Next we provide a way to turn an <l n='url'/> into a registered
namespace. This is used for the <t>xmlns</t> attribute.</p>
94
95<typing>
96resolvedns = xml.resolvens("http://www.w3.org/mathml")
97</typing>
98
99This returns <t>mml</t>.
100--ldx]]--
101
-- Return the registered namespace that matches the (lowercased) url, or an
-- empty string when none does.
function xml.resolvens(url)
    local registered = lpegmatch(parse,lower(url))
    return registered or ""
end
105
106--[[ldx--
107<p>A namespace in an element can be remapped onto the registered
108one efficiently by using the <t>xml.xmlns</t> table.</p>
109--ldx]]--
110
111end -- end of namespace closure
112
113--[[ldx--
114<p>This version uses <l n='lpeg'/>. We follow the same approach as before, stack and top and
115such. This version is about twice as fast which is mostly due to the fact that
we don't have to prepare the stream for cdata, doctype etc etc. This variant
is dedicated to Luigi Scarso, who challenged me with 40 megabyte <l n='xml'/> files that
took 12.5 seconds to load (1.5 for file io and the rest for tree building). With
the <l n='lpeg'/> implementation we got that down to less than 7.3 seconds. Loading the 14
120<l n='context'/> interface definition files (2.6 meg) went down from 1.05 seconds to 0.55.</p>
121
122<p>Next comes the parser. The rather messy doctype definition comes in many
disguises so it is no surprise that later on we have to dedicate quite some
124<l n='lpeg'/> code to it.</p>
125
126<typing>
127<!DOCTYPE Something PUBLIC "... ..." "..." [ ... ] >
128<!DOCTYPE Something PUBLIC "... ..." "..." >
129<!DOCTYPE Something SYSTEM "... ..." [ ... ] >
130<!DOCTYPE Something SYSTEM "... ..." >
131<!DOCTYPE Something [ ... ] >
132<!DOCTYPE Something >
133</typing>
134
135<p>The code may look a bit complex but this is mostly due to the fact that we
136resolve namespaces and attach metatables. There is only one public function:</p>
137
138<typing>
139local x = xml.convert(somestring)
140</typing>
141
142<p>An optional second boolean argument tells this function not to create a root
143element.</p>
144
145<p>Valid entities are:</p>
146
147<typing>
148<!ENTITY xxxx SYSTEM "yyyy" NDATA zzzz>
149<!ENTITY xxxx PUBLIC "yyyy" >
150<!ENTITY xxxx "yyyy" >
151</typing>
152--ldx]]--
153
154-- not just one big nested table capture (lpeg overflow)
155
local nsremap, resolvens = xml.xmlns, xml.resolvens

-- State shared by the parser callbacks below; it is set up and released again
-- by preparexmlstate. The namespace stack is used as xmlns by all callbacks,
-- so it has to be declared under that name: it used to be spelled xmlnms here,
-- which silently turned every use of xmlns into a global. The number of locals
-- is unchanged.

local stack, level, top, at, xmlns, errorstr
local entities, parameters
local strip, utfize, resolve, cleanup, resolve_predefined, unify_predefined
local dcache, hcache, acache
local mt, dt, nt
local currentfilename, currentline, linenumbers

-- grammars and entity handlers are assigned later on (forward declarations)

local grammar_parsed_text_one
local grammar_parsed_text_two
local grammar_unparsed_text

local handle_hex_entity
local handle_dec_entity
local handle_any_entity_dtd
local handle_any_entity_text
173
-- Set up the shared parser state from a settings table, or release it again
-- when no settings are given. When utfize_entities or
-- resolve_predefined_entities is not set (nil) it defaults to true and that
-- default is also written back into the settings table.
local function preparexmlstate(settings)
    if not settings then
        -- release everything so that nothing of the last run lingers around
        linenumbers                          = false
        stack, level, top, at                = nil, nil, nil, nil
        mt, dt, nt                           = nil, nil, nil
        xmlns, errorstr                      = nil, nil
        strip, utfize, resolve               = nil, nil, nil
        resolve_predefined, unify_predefined = nil, nil
        cleanup, entities, parameters        = nil, nil, nil
        reported_at_errors                   = nil
        dcache, hcache, acache               = nil, nil, nil
        currentfilename                      = nil
        currentline                          = 1
        return
    end
    -- what the caller asked for
    linenumbers        = settings.linenumbers
    strip              = settings.strip_cm_and_dt
    utfize             = settings.utfize_entities
    resolve            = settings.resolve_entities            -- enable this in order to apply the dtd
    resolve_predefined = settings.resolve_predefined_entities -- in case we have escaped entities
    unify_predefined   = settings.unify_predefined_entities   -- &#038; -> &amp;
    cleanup            = settings.text_cleanup
    entities           = settings.entities or { }
    currentfilename    = settings.currentresource
    -- a fresh tree and fresh bookkeeping
    stack, level       = { }, 0
    top, at, mt        = { }, { }, { }
    dt, nt             = { }, 0 -- a counter is some 5% faster than #dt on cont-en.xml
    xmlns              = { }
    errorstr           = nil
    currentline        = 1
    parameters         = { }
    reported_at_errors = { }
    dcache, hcache, acache = { }, { }, { }
    -- both options are on unless explicitly disabled
    if utfize == nil then
        utfize = true
        settings.utfize_entities = true
    end
    if resolve_predefined == nil then
        resolve_predefined = true
        settings.resolve_predefined_entities = true
    end
end
235
-- Create the metatable that elements get; lookups of missing keys end up in
-- the given root.
local function initialize_mt(root)
    local meta = { }
    meta.__index = root -- will be redefined later
    mt = meta
end
239
-- Store v under key k in whatever the metatable of root uses as __index.
-- NOTE(review): assumes root has a metatable whose __index is a table.
function xml.setproperty(root,k,v)
    local shared = getmetatable(root).__index
    shared[k] = v
end
243
-- Hook that provides extra text for the "unable to close" messages; this
-- default adds nothing and can be overloaded.
xml.checkerror = function(top,toclose)
    return ""
end
247
248local checkns = xml.checkns
249
-- Store one attribute in the pending attribute table. A plain xmlns attribute
-- pushes the resolved namespace on the namespace stack, an xmlns:tag one is
-- passed to checkns; prefixed attributes are stored as "namespace:tag".
local function add_attribute(namespace,tag,value)
    if value ~= "" and cleanup then
        value = cleanup(value) -- new
    end
    local key = tag
    if tag == "xmlns" then
        xmlns[#xmlns+1] = resolvens(value)
    elseif namespace == "xmlns" then
        checkns(tag,value)
        key = "xmlns:" .. tag
    elseif namespace ~= "" then
        -- for the moment this way:
        key = namespace .. ":" .. tag
    end
    at[key] = value
end
267
-- Add an element without content to the current parent: preceding spacing is
-- kept as text, the element takes over the pending attributes and is appended
-- to the data table of the element on top of the stack.
local function add_empty(spacing, namespace, tag)
    if spacing ~= "" then
        nt = nt + 1
        dt[nt] = spacing
    end
    -- no prefix: innermost default namespace, otherwise the remapped prefix
    local resolved
    if namespace == "" then
        resolved = xmlns[#xmlns]
    end
    resolved = resolved or nsremap[namespace] or namespace
    top = stack[level]
    dt = top.dt
    nt = #dt + 1
    local element
    if linenumbers then
        element = {
            ns = namespace or "",
            rn = resolved,
            tg = tag,
            at = at,
            dt = { },
            ni = nt, -- set slot, needed for css filtering
            cf = currentfilename,
            cl = currentline,
            __p__ = top,
        }
    else
        element = {
            ns = namespace or "",
            rn = resolved,
            tg = tag,
            at = at,
            dt = { },
            ni = nt, -- set slot, needed for css filtering
            __p__ = top,
        }
    end
    dt[nt] = element
    setmetatable(element, mt)
    if at.xmlns then
        -- the namespace pushed by this element's xmlns attribute ends here
        remove(xmlns)
    end
    at = { }
end
303
-- Open an element: preceding spacing is kept as text in the current data
-- table, then a new element with the pending attributes is pushed on the
-- stack and becomes the one that collects content.
local function add_begin(spacing, namespace, tag)
    if spacing ~= "" then
        nt = nt + 1
        dt[nt] = spacing
    end
    -- no prefix: innermost default namespace, otherwise the remapped prefix
    local resolved
    if namespace == "" then
        resolved = xmlns[#xmlns]
    end
    resolved = resolved or nsremap[namespace] or namespace
    local parent = stack[level]
    dt = { }
    if linenumbers then
        top = {
            ns = namespace or "",
            rn = resolved,
            tg = tag,
            at = at,
            dt = dt,
            ni = nil, -- preset slot, needed for css filtering
            cf = currentfilename,
            cl = currentline,
            __p__ = parent,
        }
    else
        top = {
            ns = namespace or "",
            rn = resolved,
            tg = tag,
            at = at,
            dt = dt,
            ni = nil, -- preset slot, needed for css filtering
            __p__ = parent,
        }
    end
    setmetatable(top, mt)
    nt = 0
    level = level + 1
    stack[level] = top
    at = { }
end
336
-- Close the element on top of the stack and append it to its parent. A
-- mismatch between the closing tag and the open element (or closing more
-- than was opened) is reported and remembered in errorstr.
-- NOTE(review): when level drops below 1 the parent is whatever stack[0]
-- holds; how the stack is seeded is not visible here -- confirm that top
-- cannot be nil at the point where top.dt is accessed.
local function add_end(spacing, namespace, tag)
    if spacing ~= "" then
        nt = nt + 1
        dt[nt] = spacing
    end
    local closing = stack[level]
    level = level - 1
    top = stack[level]
    local message
    if level < 1 then
        message = formatters["unable to close %s %s"](tag,xml.checkerror(top,closing) or "")
    elseif closing.tg ~= tag then -- no namespace check
        message = formatters["unable to close %s with %s %s"](closing.tg,tag,xml.checkerror(top,closing) or "")
    end
    if message then
        errorstr = message
        report_xml(message)
    end
    dt = top.dt
    nt = #dt + 1
    dt[nt] = closing
    closing.ni = nt -- update slot, needed for css filtering
    if closing.at.xmlns then
        remove(xmlns)
    end
end
360
361-- local spaceonly = lpegpatterns.whitespace^0 * P(-1)
362--
363-- will be an option: dataonly
364--
365-- if #text == 0 or     lpegmatch(spaceonly,text) then
366--     return
367-- end
368
-- Add text to the current data table. Empty text is ignored; text is passed
-- through the cleanup function when one is set. When the last entry is a
-- string the new text is appended to it, otherwise it gets its own slot.
local function add_text(text)
    if text == "" then
        return
    end
    local slot, previous = 1, nil
    if nt > 0 then
        local last = dt[nt]
        if type(last) == "string" then
            slot, previous = nt, last
        else
            slot = nt + 1
        end
    end
    if cleanup then
        text = cleanup(text)
    end
    if previous then
        text = previous .. text
    end
    nt = slot
    dt[slot] = text
end
400
-- Add a special node (tagged with what) that wraps the given text. Preceding
-- spacing is always kept; "@cm@" and "@dt@" nodes are dropped when stripping
-- is enabled.
local function add_special(what, spacing, text)
    if spacing ~= "" then
        nt = nt + 1
        dt[nt] = spacing
    end
    if strip and (what == "@cm@" or what == "@dt@") then
        return -- forget it
    end
    local node
    if linenumbers then
        node = {
            special = true,
            ns      = "",
            tg      = what,
            ni      = nil, -- preset slot
            dt      = { text },
            cf      = currentfilename,
            cl      = currentline,
        }
    else
        node = {
            special = true,
            ns      = "",
            tg      = what,
            ni      = nil, -- preset slot
            dt      = { text },
        }
    end
    nt = nt + 1
    dt[nt] = node
end
427
-- Remember, in errorstr, the trailing garbage that was found with all
-- spaces, tabs and line endings removed.
local function set_message(txt)
    local stripped = gsub(txt,"([ \n\r\t]*)","")
    errorstr = "garbage at the end of the file: " .. stripped
end
431
-- Report an invalid attribute value; each distinct string is reported only
-- once and flagged in the pending attributes as _error_. The string itself is
-- always returned.
local function attribute_value_error(str)
    if reported_at_errors[str] then
        return str
    end
    report_xml("invalid attribute value %a",str)
    reported_at_errors[str] = true
    at._error_ = str
    return str
end
440
-- Report an invalid attribute specification; each distinct string is reported
-- only once and flagged in the pending attributes as _error_. The string
-- itself is always returned.
local function attribute_specification_error(str)
    if reported_at_errors[str] then
        return str
    end
    report_xml("invalid attribute specification %a",str)
    reported_at_errors[str] = true
    at._error_ = str
    return str
end
449
450-- I'm sure that this lpeg can be simplified (less captures) but it evolved ...
451-- so i'm not going to change it now.
452
453do
454
455    -- In order to overcome lua limitations we wrap entity stuff in a closure.
456
457    local badentity = "&" -- was "&error;"
458
459    xml.placeholders = {
460        unknown_dec_entity = function(str) return str == "" and badentity or formatters["&%s;"](str) end,
461        unknown_hex_entity = function(str) return formatters["&#x%s;"](str) end,
462        unknown_any_entity = function(str) return formatters["&#x%s;"](str) end,
463    }
464
465    local function fromhex(s)
466        local n = tonumber(s,16)
467        if n then
468            return utfchar(n)
469        else
470            return formatters["h:%s"](s), true
471        end
472    end
473
474    local function fromdec(s)
475        local n = tonumber(s)
476        if n then
477            return utfchar(n)
478        else
479            return formatters["d:%s"](s), true
480        end
481    end
482
483    local p_rest = (1-P(";"))^0
484    local p_many = P(1)^0
485
486    local parsedentity =
487        P("&#") * (P("x")*(p_rest/fromhex) + (p_rest/fromdec)) * P(";") * P(-1) +
488        P ("#") * (P("x")*(p_many/fromhex) + (p_many/fromdec))
489
490    xml.parsedentitylpeg = parsedentity
491
492    -- parsing in the xml file
493
494    local predefined_unified = {
495        [38] = "&amp;",
496        [42] = "&quot;",
497        [47] = "&apos;",
498        [74] = "&lt;",
499        [76] = "&gt;",
500    }
501
502    local predefined_simplified = {
503        [38] = "&", amp  = "&",
504        [42] = '"', quot = '"',
505        [47] = "'", apos = "'",
506        [74] = "<", lt   = "<",
507        [76] = ">", gt   = ">",
508    }
509
510    local nofprivates = 0xF0000 -- shared but seldom used
511
512    local privates_u = { -- unescaped
513        [ [[&]] ] = "&amp;",
514        [ [["]] ] = "&quot;",
515        [ [[']] ] = "&apos;",
516        [ [[<]] ] = "&lt;",
517        [ [[>]] ] = "&gt;",
518    }
519
520    local privates_p = { -- needed for roundtrip as well as serialize to tex
521    }
522
523    local privates_s = { -- for tex
524        [ [["]] ] = "&U+22;",
525        [ [[#]] ] = "&U+23;",
526        [ [[$]] ] = "&U+24;",
527        [ [[%]] ] = "&U+25;",
528        [ [[&]] ] = "&U+26;",
529        [ [[']] ] = "&U+27;",
530        [ [[<]] ] = "&U+3C;",
531        [ [[>]] ] = "&U+3E;",
532        [ [[\]] ] = "&U+5C;",
533        [ [[{]] ] = "&U+7B;",
534        [ [[|]] ] = "&U+7C;",
535        [ [[}]] ] = "&U+7D;",
536        [ [[~]] ] = "&U+7E;",
537    }
538
539    local privates_x = { -- for xml
540        [ [["]] ] = "&U+22;",
541        [ [[#]] ] = "&U+23;",
542        [ [[$]] ] = "&U+24;",
543        [ [[%]] ] = "&U+25;",
544        [ [[']] ] = "&U+27;",
545        [ [[\]] ] = "&U+5C;",
546        [ [[{]] ] = "&U+7B;",
547        [ [[|]] ] = "&U+7C;",
548        [ [[}]] ] = "&U+7D;",
549        [ [[~]] ] = "&U+7E;",
550    }
551
552    local privates_n = { -- keeps track of defined ones
553    }
554
555    local escaped       = utf.remapper(privates_u,"dynamic")
556    local unprivatized  = utf.remapper(privates_p,"dynamic")
557    local unspecialized = utf.remapper(privates_s,"dynamic")
558    local despecialized = utf.remapper(privates_x,"dynamic")
559
560    xml.unprivatized  = unprivatized
561    xml.unspecialized = unspecialized
562    xml.despecialized = despecialized
563    xml.escaped       = escaped
564
565    local function unescaped(s)
566        local p = privates_n[s]
567        if not p then
568            nofprivates = nofprivates + 1
569            p = utfchar(nofprivates)
570            privates_n[s] = p
571            s = "&" .. s .. ";" -- todo: use char-ent to map to hex
572            privates_u[p] = s
573            privates_p[p] = s
574            privates_s[p] = s
575        end
576        return p
577    end
578
579    xml.privatetoken = unescaped
580    xml.privatecodes = privates_n
581    xml.specialcodes = privates_s
582
583    function xml.addspecialcode(key,value)
584        privates_s[key] = value or "&" .. s .. ";"
585    end
586
587    handle_hex_entity = function(str)
588        local h = hcache[str]
589        if not h then
590            local n = tonumber(str,16)
591            h = unify_predefined and predefined_unified[n]
592            if h then
593                if trace_entities then
594                    report_xml("utfize, converting hex entity &#x%s; into %a",str,h)
595                end
596            elseif utfize then
597                h = (n and utfchar(n)) or xml.unknown_hex_entity(str) or ""
598                if not n then
599                    report_xml("utfize, ignoring hex entity &#x%s;",str)
600                elseif trace_entities then
601                    report_xml("utfize, converting hex entity &#x%s; into %a",str,h)
602                end
603            else
604                if trace_entities then
605                    report_xml("found entity &#x%s;",str)
606                end
607                h = "&#x" .. str .. ";"
608            end
609            hcache[str] = h
610        end
611        return h
612    end
613
614    handle_dec_entity = function(str)
615        local d = dcache[str]
616        if not d then
617            local n = tonumber(str)
618            d = unify_predefined and predefined_unified[n]
619            if d then
620                if trace_entities then
621                    report_xml("utfize, converting dec entity &#%s; into %a",str,d)
622                end
623            elseif utfize then
624                d = (n and utfchar(n)) or placeholders.unknown_dec_entity(str) or ""
625                if not n then
626                    report_xml("utfize, ignoring dec entity &#%s;",str)
627                elseif trace_entities then
628                    report_xml("utfize, converting dec entity &#%s; into %a",str,d)
629                end
630            else
631                if trace_entities then
632                    report_xml("found entity &#%s;",str)
633                end
634                d = "&#" .. str .. ";"
635            end
636            dcache[str] = d
637        end
638        return d
639    end
640
641    handle_any_entity_dtd = function(str)
642        if resolve then
643            local a = resolve_predefined and predefined_simplified[str] -- true by default
644            if a then
645                if trace_entities then
646                    report_xml("resolving entity &%s; to predefined %a",str,a)
647                end
648            else
649                if type(resolve) == "function" then
650                    a = resolve(str,entities) or entities[str]
651                else
652                    a = entities[str]
653                end
654                if a then
655                    if type(a) == "function" then
656                        if trace_entities then
657                            report_xml("expanding entity &%s; to function call",str)
658                        end
659                        a = a(str) or ""
660                    end
661                    a = lpegmatch(parsedentity,a) or a -- for nested
662                    if trace_entities then
663                        report_xml("resolving entity &%s; to internal %a",str,a)
664                    end
665                else
666                    local unknown_any_entity = placeholders.unknown_any_entity
667                    if unknown_any_entity then
668                        a = unknown_any_entity(str) or ""
669                    end
670                    if a then
671                        if trace_entities then
672                            report_xml("resolving entity &%s; to external %s",str,a)
673                        end
674                    else
675                        if trace_entities then
676                            report_xml("keeping entity &%s;",str)
677                        end
678                        if str == "" then
679                            a = badentity
680                        else
681                            a = "&" .. str .. ";"
682                        end
683                    end
684                end
685            end
686            return a
687        else
688            local a = acache[str]
689            if not a then
690                a = resolve_predefined and predefined_simplified[str]
691                if a then
692                    -- one of the predefined
693                    acache[str] = a
694                    if trace_entities then
695                        report_xml("entity &%s; becomes %a",str,a)
696                    end
697                elseif str == "" then
698                    if trace_entities then
699                        report_xml("invalid entity &%s;",str)
700                    end
701                    a = badentity
702                    acache[str] = a
703                else
704                    if trace_entities then
705                        report_xml("entity &%s; is made private",str)
706                    end
707                 -- a = "&" .. str .. ";"
708                    a = unescaped(str)
709                    acache[str] = a
710                end
711            end
712            return a
713        end
714    end
715
716    handle_any_entity_text = function(str)
717        if resolve then
718            local a = resolve_predefined and predefined_simplified[str]
719            if a then
720                if trace_entities then
721                    report_xml("resolving entity &%s; to predefined %a",str,a)
722                end
723            else
724                if type(resolve) == "function" then
725                    a = resolve(str,entities) or entities[str]
726                else
727                    a = entities[str]
728                end
729                if a then
730                    if type(a) == "function" then
731                        if trace_entities then
732                            report_xml("expanding entity &%s; to function call",str)
733                        end
734                        a = a(str) or ""
735                    end
736                    a = lpegmatch(grammar_parsed_text_two,a) or a
737                    if type(a) == "number" then
738                        return ""
739                    else
740                        a = lpegmatch(parsedentity,a) or a -- for nested
741                        if trace_entities then
742                            report_xml("resolving entity &%s; to internal %a",str,a)
743                        end
744                    end
745                    if trace_entities then
746                        report_xml("resolving entity &%s; to internal %a",str,a)
747                    end
748                else
749                    local unknown_any_entity = placeholders.unknown_any_entity
750                    if unknown_any_entity then
751                        a = unknown_any_entity(str) or ""
752                    end
753                    if a then
754                        if trace_entities then
755                            report_xml("resolving entity &%s; to external %s",str,a)
756                        end
757                    else
758                        if trace_entities then
759                            report_xml("keeping entity &%s;",str)
760                        end
761                        if str == "" then
762                            a = badentity
763                        else
764                            a = "&" .. str .. ";"
765                        end
766                    end
767                end
768            end
769            return a
770        else
771            local a = acache[str]
772            if not a then
773                a = resolve_predefined and predefined_simplified[str]
774                if a then
775                    -- one of the predefined
776                    acache[str] = a
777                    if trace_entities then
778                        report_xml("entity &%s; becomes %a",str,a)
779                    end
780                elseif str == "" then
781                    if trace_entities then
782                        report_xml("invalid entity &%s;",str)
783                    end
784                    a = badentity
785                    acache[str] = a
786                else
787                    if trace_entities then
788                        report_xml("entity &%s; is made private",str)
789                    end
790                 -- a = "&" .. str .. ";"
791                    a = unescaped(str)
792                    acache[str] = a
793                end
794            end
795            return a
796        end
797    end
798
799    -- for tex
800
801    local p_rest = (1-P(";"))^1
802
803    local spec = {
804        [0x23] = "\\Ux{23}", -- #
805        [0x24] = "\\Ux{24}", -- $
806        [0x25] = "\\Ux{25}", -- %
807        [0x5C] = "\\Ux{5C}", -- \
808        [0x7B] = "\\Ux{7B}", -- {
809        [0x7C] = "\\Ux{7C}", -- |
810        [0x7D] = "\\Ux{7D}", -- }
811        [0x7E] = "\\Ux{7E}", -- ~
812    }
813
814    local hash = table.setmetatableindex(spec,function(t,k)
815        local v = utfchar(k)
816        t[k] = v
817        return v
818    end)
819
820    local function fromuni(s)
821        local n = tonumber(s,16)
822        if n then
823            return hash[n]
824        else
825            return formatters["u:%s"](s), true
826        end
827    end
828
829    local function fromhex(s)
830        local n = tonumber(s,16)
831        if n then
832            return hash[n]
833        else
834            return formatters["h:%s"](s), true
835        end
836    end
837
838    local function fromdec(s)
839        local n = tonumber(s)
840        if n then
841            return hash[n]
842        else
843            return formatters["d:%s"](s), true
844        end
845    end
846
847    local reparsedentity =
848        P("U+") * (p_rest/fromuni)
849      + P("#")  * (
850            P("x") * (p_rest/fromhex)
851          + p_rest/fromdec
852        )
853
854    local hash = table.setmetatableindex(function(t,k)
855        local v = utfchar(k)
856        t[k] = v
857        return v
858    end)
859
860    local function fromuni(s)
861        local n = tonumber(s,16)
862        if n then
863            return hash[n]
864        else
865            return formatters["u:%s"](s), true
866        end
867    end
868
869    local function fromhex(s)
870        local n = tonumber(s,16)
871        if n then
872            return hash[n]
873        else
874            return formatters["h:%s"](s), true
875        end
876    end
877
878    local function fromdec(s)
879        local n = tonumber(s)
880        if n then
881            return hash[n]
882        else
883            return formatters["d:%s"](s), true
884        end
885    end
886
887    local unescapedentity =
888        P("U+") * (p_rest/fromuni)
889      + P("#")  * (
890            P("x") * (p_rest/fromhex)
891          + p_rest/fromdec
892        )
893
894    xml.reparsedentitylpeg  = reparsedentity   -- with \Ux{...} for special tex entities
895    xml.unescapedentitylpeg = unescapedentity  -- normal characters
896
897end
898
899-- we use these later on
900
901local escaped      = xml.escaped
902local unescaped    = xml.unescaped
903local placeholders = xml.placeholders
904
905--
906
-- Report an entity that lacks its terminating semicolon; the offending string
-- is handed back unchanged.
local function handle_end_entity(str)
    local terminator = ";"
    report_xml("error in entity, %a found without ending %a",str,terminator)
    return str
end
911
-- Report an unexpected character but keep it: it is added as text and also
-- handed back to the caller.
local function handle_crap_error(chr)
    local unexpected = chr
    report_xml("error in parsing, unexpected %a found ",unexpected)
    add_text(unexpected)
    return unexpected
end
917
-- Bump the current line number; attached to the newline pattern.
local function handlenewline()
    local nextline = currentline + 1
    currentline = nextline
end
921
922-- first = ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#x00F8-#x02FF] |
923--         [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
924--         [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
925--         [#x10000-#xEFFFF]
926-- rest  = "-" | "." | [0-9] | #xB7 | [#x300-#x36F] | [#x203F-#x2040]
927-- name  = first + (first + rest)^1
928--
929-- We assume utf and do no real checking!
930
-- Primitive patterns shared by the grammars built in install(). The order of
-- the declarations matters because later patterns are composed from earlier
-- ones.

-- whitespace: spacetab excludes line endings, space includes them
local spacetab         = S(' \t')
local space            = S(' \r\n\t')
-- a newline that also calls handlenewline (which bumps the line counter)
local newline          = lpegpatterns.newline / handlenewline
local anything         = P(1)
local open             = P('<')
local close            = P('>')
local squote           = S("'")
local dquote           = S('"')
local equal            = P('=')
local slash            = P('/')
local colon            = P(':')
local semicolon        = P(';')
local ampersand        = P('&')
-- name characters: any byte in the 128-255 range is accepted unchecked
----- valid_0          = lpegpatterns.utf8two + lpegpatterns.utf8three + lpegpatterns.utf8four
local valid_0          = R("\128\255") -- basically any encoding without checking (fast)
local valid_1          = R('az', 'AZ') + S('_') + valid_0
local valid_2          = valid_1 + R('09') + S('-.')
local valid            = valid_1 * valid_2^0
-- a name always yields two captures: the part before the colon (an empty
-- string when there is no colon) and the part after it
local name_yes         = C(valid^1) * colon * C(valid^1)
local name_nop         = C(P(true)) * C(valid^1)
local name             = name_yes + name_nop
local utfbom           = lpegpatterns.utfbom -- no capture
-- captures the (possibly empty) run of whitespace
local spacing          = C(space^0)

-- variants that route line endings through 'newline' so that handlenewline
-- gets called for them
local space_nl         = spacetab + newline
local spacing_nl       = Cs((space_nl)^0)
local anything_nl      = newline + P(1)
958
-- Stores an entity that was declared with a leading % in the parameters
-- table (optionally traced).
local function weirdentity(k,v)
    if trace_entities then report_xml("registering %s entity %a as %a","weird",k,v) end
    parameters[k] = v
end
-- Stores a regular name/value entity in the entities table (optionally
-- traced).
local function normalentity(k,v)
    if trace_entities then report_xml("registering %s entity %a as %a","normal",k,v) end
    entities[k] = v
end
-- Stores a SYSTEM entity in the entities table (optionally traced); the
-- third argument is accepted but not used.
local function systementity(k,v,n)
    if trace_entities then report_xml("registering %s entity %a as %a","system",k,v) end
    entities[k] = v
end
-- Stores a PUBLIC entity in the entities table (optionally traced); the
-- third argument is accepted but not used.
local function publicentity(k,v,n)
    if trace_entities then report_xml("registering %s entity %a as %a","public",k,v) end
    entities[k] = v
end
-- Loads the file n and runs pattern over its content. The file is fetched
-- with resolvers.loadbinfile when that is available and with io.loaddata
-- otherwise. When no file is given or nothing could be loaded this is
-- reported and nothing else happens.
local function entityfile(pattern,k,v,n)
    local okay, data = false, nil
    if n then
        local loader = resolvers and resolvers.loadbinfile
        if loader then
            okay, data = loader(n)
        else
            data = io.loaddata(n)
            okay = data and data ~= ""
        end
    end
    if not okay then
        report_xml("ignoring public entities %a as %a from %a",k,v,n)
        return
    end
    if trace_entities then
        report_xml("loading public entities %a as %a from %a",k,v,n)
    end
    lpegmatch(pattern,data)
end
1003
-- Builds the grammars used by the converter. The three arguments are the
-- patterns to use for a single whitespace character, for the captured run of
-- whitespace in front of an element and for one arbitrary character; this
-- makes it possible to create a plain and a newline counting set of grammars
-- from the same definitions.
--
-- Returns three patterns: the preamble grammar and the document grammar that
-- are used when entities are handled, and a single grammar for the case that
-- text is taken as-is.

local function install(spacenewline,spacing,anything)

    -- entities: &#xHEX; &#DEC; and &name; each alternative calls its own handler

    local anyentitycontent = (1-open-semicolon-space-close-ampersand)^0
    local hexentitycontent = R("AF","af","09")^1
    local decentitycontent = R("09")^1
    local parsedentity     = P("#")/"" * (
                                    P("x")/"" * (hexentitycontent/handle_hex_entity) +
                                                (decentitycontent/handle_dec_entity)
                                ) +             (anyentitycontent/handle_any_entity_dtd) -- can be Cc(true)
    local parsedentity_text= P("#")/"" * (
                                    P("x")/"" * (hexentitycontent/handle_hex_entity) +
                                                (decentitycontent/handle_dec_entity)
                                ) +             (anyentitycontent/handle_any_entity_text) -- can be Cc(false)
    local entity           = (ampersand/"") * parsedentity   * (semicolon/"")
                           + ampersand * (anyentitycontent / handle_end_entity)
    local entity_text      = (ampersand/"") * parsedentity_text * (semicolon/"")
                           + ampersand * (anyentitycontent / handle_end_entity)

    -- text between elements

    local text_unparsed    = Cs((anything-open)^1)
    local text_parsed      = (Cs((anything-open-ampersand)^1)/add_text + Cs(entity_text)/add_text)^1
--     local text_parsed      = ((Cs(((anything-open-ampersand)^1) + entity_text))/add_text)^1

    local somespace        = (spacenewline)^1
    local optionalspace    = (spacenewline)^0

    -- attributes

    local value            = (squote * Cs((entity + (anything - squote))^0) * squote) + (dquote * Cs((entity + (anything - dquote))^0) * dquote) -- ampersand and < also invalid in value

    local endofattributes  = slash * close + close -- recovery of flacky html
    local whatever         = space * name * optionalspace * equal
    local wrongvalue       = Cs(P(entity + (1-space-endofattributes))^1) / attribute_value_error

    local attributevalue   = value + wrongvalue

    local attribute        = (somespace * name * optionalspace * equal * optionalspace * attributevalue) / add_attribute

 -- local attributes       = (attribute + somespace^-1 * (((1-endofattributes)^1)/attribute_specification_error))^0
    local attributes       = (attribute + somespace^-1 * (((anything-endofattributes)^1)/attribute_specification_error))^0

    local parsedtext       = text_parsed   -- / add_text
    local unparsedtext     = text_unparsed / add_text
    local balanced         = P { "[" * ((anything - S"[]") + V(1))^0 * "]" } -- taken from lpeg manual, () example

    -- todo: combine empty and begin so that we scan attributes only once .. maybe also go for match time captures

    local emptyelement     = (spacing * open         * name * attributes * optionalspace * slash * close) / add_empty
    local beginelement     = (spacing * open         * name * attributes * optionalspace         * close) / add_begin
    local endelement       = (spacing * open * slash * name              * optionalspace         * close) / add_end

--     local commonelement    =  spacing * open         * name * attributes * optionalspace *
--     local cemptyelement    = (slash * close) / add_empty
--     local cbeginelement    = (      * close) / add_begin

    -- todo: combine the opens in:

    local begincomment     = open * P("!--")
    local endcomment       = P("--") * close
    local begininstruction = open * P("?")
    local endinstruction   = P("?") * close
    local begincdata       = open * P("![CDATA[")
    local endcdata         = P("]]") * close

    local someinstruction  = C((anything - endinstruction)^0)
    local somecomment      = C((anything - endcomment    )^0)
    local somecdata        = C((anything - endcdata      )^0)

    -- todo: separate dtd parser

    local begindoctype     = open * P("!DOCTYPE")
    local enddoctype       = close
    local beginset         = P("[")
    local endset           = P("]")
    local wrdtypename      = C((anything-somespace-P(";"))^1)
    local doctypename      = C((anything-somespace-close)^0)
    local elementdoctype   = optionalspace * P("<!ELEMENT") * (anything-close)^0 * close

    local basiccomment     = begincomment * ((anything - endcomment)^0) * endcomment

    local weirdentitytype  = P("%") * (somespace * doctypename * somespace * value) / weirdentity
    local normalentitytype = (doctypename * somespace * value) / normalentity
    local publicentitytype = (doctypename * somespace * P("PUBLIC") * somespace * value) / publicentity

    local systementitytype = (doctypename * somespace * P("SYSTEM") * somespace * value * somespace * P("NDATA") * somespace * doctypename)/systementity
    local entitydoctype    = optionalspace * P("<!ENTITY") * somespace * (systementitytype + publicentitytype + normalentitytype + weirdentitytype) * optionalspace * close

    -- the functions below refer to the variable entitydoctype, so they use
    -- the extended pattern that is assigned to it further down

    local publicentityfile = (doctypename * somespace * P("PUBLIC") * somespace * value * (somespace * value)^0) / function(...)
        entityfile(entitydoctype,...)
    end

    local function weirdresolve(s)
        lpegmatch(entitydoctype,parameters[s])
    end

    local function normalresolve(s)
        lpegmatch(entitydoctype,entities[s])
    end

    local entityresolve    = P("%") * (wrdtypename/weirdresolve ) * P(";")
                           + P("&") * (wrdtypename/normalresolve) * P(";")

    entitydoctype          = entitydoctype + entityresolve

    -- we accept comments in doctypes

    local doctypeset       = beginset * optionalspace * P(elementdoctype + entitydoctype + entityresolve + basiccomment + space)^0 * optionalspace * endset
    local definitiondoctype= doctypename * somespace * doctypeset
    local publicdoctype    = doctypename * somespace * P("PUBLIC") * somespace * value * somespace * value * somespace * doctypeset
    local systemdoctype    = doctypename * somespace * P("SYSTEM") * somespace * value * somespace * doctypeset
    local simpledoctype    = (anything-close)^1 -- * balanced^0
    local somedoctype      = C((somespace * (publicentityfile + publicdoctype + systemdoctype + definitiondoctype + simpledoctype) * optionalspace)^0)

    local instruction      = (spacing * begininstruction * someinstruction * endinstruction) / function(...) add_special("@pi@",...) end
    local comment          = (spacing * begincomment     * somecomment     * endcomment    ) / function(...) add_special("@cm@",...) end
    local cdata            = (spacing * begincdata       * somecdata       * endcdata      ) / function(...) add_special("@cd@",...) end
    local doctype          = (spacing * begindoctype     * somedoctype     * enddoctype    ) / function(...) add_special("@dt@",...) end

    -- whatever matches none of the regular alternatives ends up in the error handler

    local crap_parsed     = anything - beginelement - endelement - emptyelement - begininstruction - begincomment - begincdata - ampersand
    local crap_unparsed   = anything - beginelement - endelement - emptyelement - begininstruction - begincomment - begincdata

    local parsedcrap      = Cs((crap_parsed^1 + entity_text)^1) / handle_crap_error
    local unparsedcrap    = Cs((crap_unparsed              )^1) / handle_crap_error

    --  nicer but slower:
    --
    --  local instruction = (Cc("@pi@") * spacing * begininstruction * someinstruction * endinstruction) / add_special
    --  local comment     = (Cc("@cm@") * spacing * begincomment     * somecomment     * endcomment    ) / add_special
    --  local cdata       = (Cc("@cd@") * spacing * begincdata       * somecdata       * endcdata      ) / add_special
    --  local doctype     = (Cc("@dt@") * spacing * begindoctype     * somedoctype     * enddoctype    ) / add_special

    local trailer = space^0 * (text_unparsed/set_message)^0

    --  comment + emptyelement + text + cdata + instruction + V("parent"), -- 6.5 seconds on 40 MB database file
    --  text + comment + emptyelement + cdata + instruction + V("parent"), -- 5.8
    --  text + V("parent") + emptyelement + comment + cdata + instruction, -- 5.5

    -- local grammar_parsed_text = P { "preamble",
    --     preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer,
    --     parent   = beginelement * V("children")^0 * endelement,
    --     children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction + parsedcrap,
    -- }

    local grammar_parsed_text_one = P { "preamble",
        preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0,
    }

    local grammar_parsed_text_two = P { "followup",
        followup = V("parent") * trailer,
        parent   = beginelement * V("children")^0 * endelement,
        children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction + parsedcrap,
    }

--     local grammar_parsed_text_two = P { "followup",
--         followup = beginelement * V("children")^0 * endelement * trailer,
--         children = parsedtext + beginelement * V("children")^0 * endelement + emptyelement + comment + cdata + instruction + parsedcrap,
--     }

-- local grammar_parsed_text_two = P { "followup",
--     followup = commonelement * cbeginelement * V("children")^0 * endelement * trailer,
--     children = parsedtext + commonelement * (cbeginelement * V("children")^0 * endelement + cemptyelement) + comment + cdata + instruction + parsedcrap,
-- }

    local grammar_unparsed_text = P { "preamble",
        preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer,
        parent   = beginelement * V("children")^0 * endelement,
        children = unparsedtext + V("parent") + emptyelement + comment + cdata + instruction + unparsedcrap,
    }

    return grammar_parsed_text_one, grammar_parsed_text_two, grammar_unparsed_text

end
1174
-- Two sets of grammars are created: the "nop" set uses the plain whitespace
-- and any-character patterns, the "yes" set uses the variants that pass
-- line endings through the newline pattern (and thereby handlenewline).

local
    grammar_parsed_text_one_nop ,
    grammar_parsed_text_two_nop ,
    grammar_unparsed_text_nop   = install(space, spacing, anything)

local
    grammar_parsed_text_one_yes ,
    grammar_parsed_text_two_yes ,
    grammar_unparsed_text_yes   = install(space_nl, spacing_nl, anything_nl)
1184
1185-- maybe we will add settings to result as well
1186
-- Converts data into an xml tree.
--
-- data     : the string to parse; the value true signals that a problem was
--            already detected by the caller, in which case only an error
--            tree is produced
-- settings : optional table; the fields used here are linenumbers,
--            preprocessor, parent_root, error_handler, currentresource and
--            no_root
-- detail   : optional message used as error string when data is true
--
-- The returned table is the tree; it gets a statistics subtable and, when
-- something went wrong, an error field set to true. Unless settings.no_root
-- is set the result is wrapped in a '@rt@' root element.
--
-- Two things differ from the previous version: the error handler that was
-- resolved from the settings is now the one that gets called (before,
-- xml.errorhandler was always used so a handler passed in the settings was
-- ignored) and the preprocessor is no longer applied to the true sentinel.

local function _xmlconvert_(data,settings,detail)
    settings = settings or { } -- no_root strip_cm_and_dt given_entities parent_root error_handler
    preparexmlstate(settings)
    -- NOTE(review): the three grammar variables are assigned without a local
    -- here, presumably they are declared earlier in the file -- confirm
    if settings.linenumbers then
        grammar_parsed_text_one = grammar_parsed_text_one_yes
        grammar_parsed_text_two = grammar_parsed_text_two_yes
        grammar_unparsed_text   = grammar_unparsed_text_yes
    else
        grammar_parsed_text_one = grammar_parsed_text_one_nop
        grammar_parsed_text_two = grammar_parsed_text_two_nop
        grammar_unparsed_text   = grammar_unparsed_text_nop
    end
    local preprocessor = settings.preprocessor
    -- true is the internal error marker, there is nothing to preprocess then
    if data and data ~= "" and data ~= true and type(preprocessor) == "function" then
        data = preprocessor(data,settings) or data -- settings.currentresource
    end
    if settings.parent_root then
        mt = getmetatable(settings.parent_root)
    else
        initialize_mt(top)
    end
    level = level + 1
    stack[level] = top
    top.dt = { }
    dt = top.dt
    nt = 0
    if not data or data == "" then
        errorstr = "empty xml file"
    elseif data == true then
        errorstr = detail or "problematic xml file"
    elseif utfize or resolve then
        -- first the preamble, then the document from where the preamble ended
        local m = lpegmatch(grammar_parsed_text_one,data)
        if m then
            m = lpegmatch(grammar_parsed_text_two,data,m)
        end
     -- local m = lpegmatch(grammar_parsed_text,data)
        if m then
         -- errorstr = "" can be set!
        else
            errorstr = "invalid xml file - parsed text"
        end
    elseif type(data) == "string" then
        if lpegmatch(grammar_unparsed_text,data) then
            errorstr = ""
        else
            errorstr = "invalid xml file - unparsed text"
        end
    else
        errorstr = "invalid xml file - no text at all"
    end
    local result
    if errorstr and errorstr ~= "" then
        -- replace whatever was built by a tree with one error element
        result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at = { }, er = true } } }
        setmetatable(result, mt)
        setmetatable(result.dt[1], mt)
        setmetatable(stack, mt)
        local errorhandler = settings.error_handler
        if errorhandler == false then
            -- no error message
        else
            -- anything that is not a function falls back on the default
            if type(errorhandler) ~= "function" then
                errorhandler = xml.errorhandler
            end
            if errorhandler then
                local currentresource = settings.currentresource
                if currentresource and currentresource ~= "" then
                    errorhandler(formatters["load error in [%s]: %s"](currentresource,errorstr),currentresource)
                else
                    errorhandler(formatters["load error: %s"](errorstr))
                end
            end
        end
    else
        result = stack[1]
    end
    if not settings.no_root then
        result = { special = true, ns = "", tg = '@rt@', dt = result.dt, at={ }, entities = entities, settings = settings }
        setmetatable(result, mt)
        local rdt = result.dt
        for k=1,#rdt do
            local v = rdt[k]
            if type(v) == "table" and not v.special then -- always table -)
                result.ri = k -- rootindex
                v.__p__ = result  -- new, experiment, else we cannot go back to settings, we need to test this !
                break
            end
        end
    end
    if errorstr and errorstr ~= "" then
        result.error = true
    else
        errorstr = nil
    end
    result.statistics = {
        errormessage = errorstr,
        entities = {
            decimals      = dcache,
            hexadecimals  = hcache,
            names         = acache,
            intermediates = parameters,
        }
    }
    preparexmlstate() -- resets
    return result
end
1290
1291-- Because we can have a crash (stack issues) with faulty xml, we wrap this one
1292-- in a protector:
1293
-- Protected variant of the converter: when the conversion itself raises an
-- error, the converter is run once more in error mode (data set to true) so
-- that the caller always gets a tree back. A string error message is passed
-- along as detail.
local function xmlconvert(data,settings)
    local function attempt()
        return _xmlconvert_(data,settings)
    end
    local ok, result = pcall(attempt)
    if ok then
        return result
    end
    if type(result) == "string" then
        return _xmlconvert_(true,settings,result)
    end
    return _xmlconvert_(true,settings)
end

xml.convert = xmlconvert
1306
-- Converts data using the settings of an already converted tree (xmldata),
-- which is registered as parent_root in those settings. With cleanup set,
-- the text of a leading processing instruction that starts with "xml" is
-- removed from the result.
function xml.inheritedconvert(data,xmldata,cleanup) -- xmldata is parent
    local settings = xmldata.settings
    if settings then
        settings.parent_root = xmldata -- to be tested
    end
 -- settings.no_root = true
    local xc = xmlconvert(data,settings) -- hm, we might need to locate settings
    if not cleanup then
        return xc
    end
    local first = xc.dt
    if first then
        first = first[1]
    end
    if first and first.tg == "@pi@" then
        local content = first.dt
        local pi = content and content[1]
        if type(pi) == "string" and find(pi,"^xml") then
            remove(content,1)
        end
    end
 -- xc.settings = nil
 -- xc.entities = nil
 -- xc.special = nil
 -- xc.ri = nil
 -- print(xc.tg)
    return xc
end
1334
1335--[[ldx--
1336<p>Packaging data in an xml like table is done with the following
1337function. Maybe it will go away (when not used).</p>
1338--ldx]]--
1339
-- Wraps data in an element table. The tag may carry a namespace prefix
-- ("ns:tag"), which is split off into the ns field; attributes default to an
-- empty table and data to an empty string. The element gets the current
-- metatable.
function xml.package(tag,attributes,data)
    local ns, tg = match(tag,"^(.-):?([^:]+)$")
    local t = { ns = ns, tg = tg, dt = data or "", at = attributes or {} }
    setmetatable(t, mt)
    return t
end

-- A tree is valid when it exists and has not been flagged with an error
-- field. (An older definition that inspected root.dt[1].er used to precede
-- xml.package; it was overwritten by this one when the file was loaded and
-- has therefore been dropped.)
function xml.is_valid(root)
    return root and not root.error
end
1354
-- the reporter of this module doubles as the default handler for load errors
xml.errorhandler = report_xml
1356
1357--[[ldx--
1358<p>We cannot load an <l n='lpeg'/> from a filehandle so we need to load
1359the whole file first. The function accepts a string representing
1360a filename or a file handle.</p>
1361--ldx]]--
1362
-- Loads and converts a file. The first argument is either a filename or
-- something with a read method (an open file); a file that cannot be opened
-- results in the conversion of an empty string. While converting, the
-- currentresource field of the settings holds the first argument.
function xml.load(filename,settings)
    local data = ""
    local kind = type(filename)
    if kind == "string" then
     -- local data = io.loaddata(filename) -- todo: check type in io.loaddata
        local f = io.open(filename,'r') -- why not 'rb'
        if f then
            data = f:read("*all") -- io.readall(f) ... only makes sense for large files
            f:close()
        end
    elseif filename then -- filehandle
        data = filename:read("*all") -- io.readall(f) ... only makes sense for large files
    end
    if not settings then
        return xmlconvert(data,{ currentresource = filename })
    end
    settings.currentresource = filename
    local result = xmlconvert(data,settings)
    settings.currentresource = nil
    return result
end
1384
1385--[[ldx--
1386<p>When we inject new elements, we need to convert strings to
1387valid trees, which is what the next function does.</p>
1388--ldx]]--
1389
local no_root = { no_root = true }

-- Strings are converted (without a root wrapper), anything else is returned
-- as it is.
function xml.toxml(data)
    if type(data) ~= "string" then
        return data
    end
    local root = { xmlconvert(data,no_root) }
    if #root > 1 then
        return root
    end
    return root[1]
end
1400
1401--[[ldx--
1402<p>For copying a tree we use a dedicated function instead of the
1403generic table copier. Since we know what we're dealing with we
1404can speed up things a bit. The second argument is not to be used!</p>
1405--ldx]]--
1406
1407-- local function copy(old)
1408--     if old then
1409--         local new = { }
1410--         for k,v in next, old do
1411--             if type(v) == "table" then
1412--                 new[k] = table.copy(v)
1413--             else
1414--                 new[k] = v
1415--             end
1416--         end
1417--         local mt = getmetatable(old)
1418--         if mt then
1419--             setmetatable(new,mt)
1420--         end
1421--         return new
1422--     else
1423--         return { }
1424--     end
1425-- end
1426--
-- We need to prevent __p__ recursion, so:
1428
-- Copies an element table.
--
-- old : the table to copy; when it is nil (or false) an empty table is
--       returned
-- p   : only used in the recursive calls (the copy of the parent); it ends
--       up in the __p__ field of the copied dt table
--
-- The at table is copied key by key, the dt table is copied with its table
-- members copied recursively, all other fields are shared with the original.
-- The copy gets the metatable of the original. Beware: the __p__ field of
-- the dt table of the original is cleared in the process.

local function copy(old,p)
    if old then
        local new = { }
        for k, v in next, old do
            if k == "at" then
                local at = { }
                for ak, av in next, v do
                    at[ak] = av
                end
                new[k] = at
            elseif k == "dt" then
                v.__p__ = nil
                local dt = { }
                for i=1,#v do
                    local vi = v[i]
                    if type(vi) == "table" then
                        dt[i] = copy(vi,new)
                    else
                        dt[i] = vi
                    end
                end
                new[k] = dt
                dt.__p__ = p
            else
                new[k] = v -- so we also share entities, etc in root
            end
        end
        local mt = getmetatable(old)
        if mt then
            setmetatable(new,mt)
        end
        return new
    else
        return { }
    end
end
1466
1467xml.copy = copy
1468
1469--[[ldx--
1470<p>In <l n='context'/> serializing the tree or parts of the tree is a major
activity which is why the following function is pretty optimized resulting
in a few more lines of code than needed. The variant that uses the formatting
function for all components is about 15% slower than the concatenating
1474alternative.</p>
1475--ldx]]--
1476
1477-- todo: add <?xml version='1.0' standalone='yes'?> when not present
1478
-- Makes sure that a tree with a root index starts with an xml declaration:
-- when none of the processing instructions at the top level looks like one,
-- a default declaration followed by a newline is put in front.
function xml.checkbom(root) -- can be made faster
    if not root.ri then
        return
    end
    local dt = root.dt
    for k=1,#dt do
        local v = dt[k]
        local ispi = type(v) == "table" and v.special and v.tg == "@pi@"
        if ispi and find(v.dt[1],"xml.*version=") then
            return -- already there
        end
    end
    insert(dt, 1, { special = true, ns = "", tg = "@pi@", dt = { "xml version='1.0' standalone='yes'" } } )
    insert(dt, 2, "\n" )
end
1492
1493--[[ldx--
1494<p>At the cost of some 25% runtime overhead you can first convert the tree to a string
1495and then handle the lot.</p>
1496--ldx]]--
1497
1498-- new experimental reorganized serialize
1499
local f_attribute = formatters['%s=%q']

-- we could reuse ats .. for high performance we could also
-- have a multiple handle calls instead of multiple arguments
-- but it's not that critical

-- Serializes a regular element: the begin tag with sorted and escaped
-- attributes, the content (strings are escaped, anything else goes through
-- the serializer of the handler) and the end tag. Elements without content
-- become an empty element. The pieces are passed to the handle function of
-- the handler as separate arguments.

local function verbose_element(e,handlers,escape) -- options
    local handle = handlers.handle
    local serialize = handlers.serialize
    local ens, etg, eat, edt, ern = e.ns, e.tg, e.at, e.dt, e.rn
    -- attributes are collected in one string, sorted by name
    local ats
    if eat and next(eat) then
        local list, n = { }, 0
        for k in next, eat do
            n = n + 1
            list[n] = k
        end
        if n > 1 then
            sort(list)
        end
        for i=1,n do
            local k = list[i]
            list[i] = f_attribute(k,escaped(eat[k]))
        end
        ats = n == 1 and list[1] or concat(list," ")
    end
    if ern and trace_entities and ern ~= ens then
        ens = ern
    end
    local n = edt and #edt
    local prefixed = ens ~= ""
    if n and n > 0 then
        -- begin tag
        if prefixed then
            if ats then
                handle("<",ens,":",etg," ",ats,">")
            else
                handle("<",ens,":",etg,">")
            end
        elseif ats then
            handle("<",etg," ",ats,">")
        else
            handle("<",etg,">")
        end
        -- content
        for i=1,n do
            local child = edt[i]
            if type(child) == "string" then
                handle(escaped(child)) -- option: hexify escaped entities
            else
                serialize(child,handlers)
            end
        end
        -- end tag
        if prefixed then
            handle("</",ens,":",etg,">")
        else
            handle("</",etg,">")
        end
    elseif prefixed then
        if ats then
            handle("<",ens,":",etg," ",ats,"/>")
        else
            handle("<",ens,":",etg,"/>")
        end
    elseif ats then
        handle("<",etg," ",ats,"/>")
    else
        handle("<",etg,"/>")
    end
end
1582
-- Serializes a processing instruction; the text is the first entry of dt.
local function verbose_pi(e,handlers)
    local handle = handlers.handle
    handle("<?",e.dt[1],"?>")
end
1586
-- Serializes a comment; the text is the first entry of dt.
local function verbose_comment(e,handlers)
    local handle = handlers.handle
    handle("<!--",e.dt[1],"-->")
end
1590
-- Serializes a cdata section; the text is the first entry of dt.
local function verbose_cdata(e,handlers)
    local handle = handlers.handle
    handle("<![CDATA[", e.dt[1],"]]>")
end
1594
-- Serializes a doctype; the text is the first entry of dt.
local function verbose_doctype(e,handlers)
    local handle = handlers.handle
    handle("<!DOCTYPE",e.dt[1],">") -- has space at end of string
end
1598
-- Serializes a root element by passing its dt table to the serializer.
local function verbose_root(e,handlers)
    local serialize = handlers.serialize
    serialize(e.dt,handlers)
end
1602
-- Serializes a string: it is escaped and handed to the handle function.
local function verbose_text(e,handlers)
    local handle = handlers.handle
    handle(escaped(e))
end
1606
-- Serializes a plain list: strings go to the "@tx@" function of the handler,
-- everything else to its serializer.
local function verbose_document(e,handlers)
    local serialize = handlers.serialize
    local functions = handlers.functions
    for index=1,#e do
        local entry = e[index]
        if type(entry) ~= "string" then
            serialize(entry,handlers)
        else
            functions["@tx@"](entry,handlers)
        end
    end
end
1619
-- Serializes e with the given handler table.
--
-- e        : an element (a table with a tg field) or a plain list; nothing
--            happens when it is nil or false
-- handlers : table with a functions subtable and optional initialize and
--            finalize functions
-- ...      : passed to the initialize function of the handler
--
-- When initialize returns nil or false the serialization is aborted and that
-- value is returned. Otherwise the function registered for the tag (with
-- "@el@" as fallback, and "@dc@" for tables without tag) is called and the
-- result of finalize, if present, is returned.

local function serialize(e,handlers,...)
    if e then
        local initialize = handlers.initialize
        local finalize   = handlers.finalize
        local functions  = handlers.functions
        if initialize then
            local state = initialize(...)
            -- this used to read "not state == true", which lua parses as
            -- "(not state) == true": a plain falsy test, so any truthy state
            -- (true, a file handle, a table) means that we can go on
            if not state then
                return state
            end
        end
        local etg = e.tg
        if etg then
            (functions[etg] or functions["@el@"])(e,handlers)
     -- elseif type(e) == "string" then
     --     functions["@tx@"](e,handlers)
        else
            functions["@dc@"](e,handlers) -- dc ?
        end
        if finalize then
            return finalize()
        end
    end
end
1644
-- The serializer used for nested calls: it only dispatches on the tag (with
-- "@el@" as fallback and "@dc@" for tables without tag) and does not call
-- the initialize and finalize functions of the handler.
local function xserialize(e,handlers)
    if not e then
        return
    end
    local functions = handlers.functions
    local tag = e.tg
    local action
    if tag then
        action = functions[tag] or functions["@el@"]
 -- elseif type(e) == "string" then
 --     action = functions["@tx@"]
    else
        action = functions["@dc@"]
    end
    action(e,handlers)
end
1658
local handlers = { }

-- Creates a handler table. It starts out as a copy of the handler named by
-- settings.parent ("verbose" when not given); fields from settings overwrite
-- those of the copy, where table fields are merged key by key. When the
-- settings have a name the result is registered under that name.
local function newhandlers(settings)
    local parent = settings and settings.parent or "verbose"
    local t = table.copy(handlers[parent] or { }) -- merge
    if settings then
        for k, v in next, settings do
            if type(v) ~= "table" then
                t[k] = v
            else
                local tk = t[k]
                if not tk then
                    tk = { }
                    t[k] = tk
                end
                for kk, vv in next, v do
                    tk[kk] = vv
                end
            end
        end
        local name = settings.name
        if name then
            handlers[name] = t
        end
    end
    utilities.storage.mark(t)
    return t
end
1681
local nofunction = function() end

-- Registers fnc under name in the functions table of a handler; without a
-- function a dummy is registered.
function xml.sethandlersfunction(handler,name,fnc)
    local functions = handler.functions
    functions[name] = fnc or nofunction
end
1687
-- Returns the function registered under name in the given handler.
function xml.gethandlersfunction(handler,name)
    local functions = handler.functions
    return functions[name]
end
1691
-- Returns the handler table registered under name (nil when unknown).
function xml.gethandlers(name)
    local found = handlers[name]
    return found
end
1695
-- The "verbose" handler is what newhandlers copies when no parent is given.
-- It passes the pieces to print; nested elements go through xserialize, so
-- the initialize and finalize hooks are not involved there.

newhandlers {
    name       = "verbose",
    initialize = false, -- faster than nil and mt lookup
    finalize   = false, -- faster than nil and mt lookup
    serialize  = xserialize,
    handle     = print,
    functions  = {
        ["@dc@"]   = verbose_document,
        ["@dt@"]   = verbose_doctype,
        ["@rt@"]   = verbose_root,
        ["@el@"]   = verbose_element,
        ["@pi@"]   = verbose_pi,
        ["@cm@"]   = verbose_comment,
        ["@cd@"]   = verbose_cdata,
        ["@tx@"]   = verbose_text,
    }
}
1713
1714--[[ldx--
1715<p>How you deal with saving data depends on your preferences. For a 40 MB database
1716file the timing on a 2.3 Core Duo are as follows (time in seconds):</p>
1717
1718<lines>
17191.3 : load data from file to string
17206.1 : convert string into tree
17215.3 : saving in file using xmlsave
17226.8 : converting to string using xml.tostring
17233.6 : saving converted string in file
1724</lines>
1725
<p>Beware, these timings were made with the old routine, but the measurements will
not be much different, I guess.</p>
1728--ldx]]--
1729
1730-- maybe this will move to lxml-xml
1731
-- The "file" handler writes the pieces straight to a file. The file handle
-- is kept in a variable that is private to the three functions.

local xmlfilehandler  do

    local filehandle

    xmlfilehandler = newhandlers {
        name       = "file",
        -- opens the file; a failed open returns nil, which makes the
        -- serializer quit
        initialize = function(name)
            filehandle = io.open(name,"wb")
            return filehandle
        end,
        finalize   = function()
            filehandle:close()
            return true
        end,
        handle     = function(...)
            filehandle:write(...)
        end,
    }

end
1748
1749-- no checking on writeability here but not faster either
1750--
1751-- local xmlfilehandler = newhandlers {
1752--     initialize = function(name)
1753--         io.output(name,"wb")
1754--         return true
1755--     end,
1756--     finalize   = function()
1757--         io.close()
1758--         return true
1759--     end,
1760--     handle     = io.write,
1761-- }
1762
-- Serializes root into the file called name, using the "file" handler.
-- The outcome of the serializer is passed on: true when the file has been
-- written and closed, nil when the file could not be opened for writing
-- (before, nothing was returned so a caller could not tell the difference).
function xml.save(root,name)
    return serialize(root,xmlfilehandler,name)
end
1766
1767-- local result
1768--
1769-- local xmlstringhandler = newhandlers {
1770--     name       = "string",
1771--     initialize = function()
1772--         result = { }
1773--         return result
1774--     end,
1775--     finalize   = function()
1776--         return concat(result)
1777--     end,
1778--     handle     = function(...)
1779--         result[#result+1] = concat { ... }
1780--     end,
1781-- }
1782
-- The "string" handler collects the pieces in a table that is reused between
-- runs; r is the number of slots used in the current run. When a run needed
-- more than threshold slots, the table is replaced by a fresh one afterwards
-- so that a big buffer does not stay around forever.

local result, r, threshold = { }, 0, 512

local xmlstringhandler = newhandlers {
    name       = "string",
    initialize = function()
        r = 0
        return result
    end,
    finalize   = function()
        local done = concat(result,"",1,r)
        -- the size has to be checked before the counter is reset, otherwise
        -- the test can never succeed and the buffer is kept no matter what
        if r > threshold then
            result = { }
        end
        r = 0
        return done
    end,
    handle     = function(...)
        for i=1,select("#",...) do
            r = r + 1
            result[r] = select(i,...)
        end
    end,
}
1806
-- Converts <root> into a string: a string is returned as-is, nil/false gives
-- an empty string and anything else goes through the string handler (the
-- collecting costs about 25% overhead).

local function xmltostring(root)
    if type(root) == "string" then
        return root
    end
    if not root then
        return ""
    end
    return serialize(root,xmlstringhandler) or ""
end
1816
-- Metamethod variant of xmltostring: always returns a string, also when
-- there is nothing to convert.

local function __tostring(root)
    if not root then
        return ""
    end
    return xmltostring(root) or ""
end
1820
-- Redefinition: from here on the shared metatable also provides __tostring,
-- next to the __index that points to the given root.

initialize_mt = function(root)
    mt = {
        __index    = root,
        __tostring = __tostring,
    }
end
1824
-- public interface to the serializer

xml.serialize       = serialize
xml.tostring        = xmltostring
xml.newhandlers     = newhandlers
xml.defaulthandlers = handlers
1829
1830--[[ldx--
<p>The next function operates on the content only and needs a handle function
that accepts a string.</p>
1833--ldx]]--
1834
-- Passes the text found in and below <e> to <handle>, one call per string.
-- Special nodes other than the root ("@rt@") are skipped, and so is
-- everything when no handle is given.

local function xmlstring(e,handle)
    if not handle then
        return
    end
    local tag = e.tg
    if e.special and tag ~= "@rt@" then
        return
    end
    if not tag then
        -- no tag: this is the content itself
        handle(e)
        return
    end
    local children = e.dt
    if children then
        for index=1,#children do
            xmlstring(children[index],handle)
        end
    end
end

xml.string = xmlstring
1851
1852--[[ldx--
1853<p>A few helpers:</p>
1854--ldx]]--
1855
1856--~ xmlsetproperty(root,"settings",settings)
1857
-- Returns the first "settings" field found on <e> or one of its ancestors
-- (following the __p__ links), or nil when there is none.

function xml.settings(e)
    local node = e
    while node do
        local found = node.settings
        if found then
            return found
        end
        node = node.__p__
    end
    return nil
end
1869
-- Follows the __p__ links upwards and returns the topmost node; the argument
-- itself is returned when it has no parent (or is nil).

function xml.root(e)
    local top = e
    local up = e and e.__p__
    while up do
        top = up
        up = up.__p__
    end
    return top
end
1880
-- Returns the parent (__p__) of <root>, or nil when there is no parent or
-- no node was passed at all.

function xml.parent(root)
    if root then
        return root.__p__
    end
    return nil
end
1884
-- Returns the child that the "ri" index of <root> points to, or <root>
-- itself when there is no such index or child (not ok yet).

function xml.body(root)
    local index = root.ri
    local body = index and root.dt[index]
    return body or root
end
1888
-- Returns the name of an element: "tg" when the namespace is empty or not
-- set, "ns:tg" otherwise. An empty string is returned when no element is
-- passed.

function xml.name(root)
    if not root then
        return ""
    end
    local ns = root.ns
    local tg = root.tg
    if not ns or ns == "" then
        -- a missing namespace is treated like an empty one
        return tg
    else
        return ns .. ":" .. tg
    end
end
1901
1902--[[ldx--
1903<p>The next helper erases an element but keeps the table as it is,
1904and since empty strings are not serialized (effectively) it does
1905not harm. Copying the table would take more time. Usage:</p>
1906--ldx]]--
1907
-- Erases entry <k> of the data table <dt>, or all entries when no <k> is
-- given. The table itself is kept and each erased slot becomes an empty
-- string. Nothing happens when <dt> is nil.

function xml.erase(dt,k)
    if dt then
        if k then
            dt[k] = ""
        else
            for i=1,#dt do
                dt[i] = ""
            end
        end
    end
end
1917
1918--[[ldx--
1919<p>The next helper assigns a tree (or string). Usage:</p>
1920
1921<typing>
1922dt[k] = xml.assign(root) or xml.assign(dt,k,root)
1923</typing>
1924--ldx]]--
1925
-- Either stores a tree (or string) in dt[k] and returns what was stored, or,
-- when no slot is given, just returns the value to assign. For a tree the
-- body (see xml.body) is used. The short form xml.assign(root) passes the
-- tree as first argument, so in that case the first argument is used when
-- no third one is given.

function xml.assign(dt,k,root)
    if dt and k then
        dt[k] = type(root) == "table" and xml.body(root) or root
        return dt[k]
    else
        local tree = root
        if tree == nil then
            tree = dt -- xml.assign(root)
        end
        return type(tree) == "table" and xml.body(tree) or tree
    end
end
1934
1935-- the following helpers may move
1936
1937--[[ldx--
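<p>The next helper replaces the content of an element by a cdata section that holds
the serialized content, optionally wrapped in an element with the given name. Usage:</p>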
1939<typing>
1940xml.tocdata(e)
1941xml.tocdata(e,"error")
1942</typing>
1943--ldx]]--
1944
-- Replaces the content of <e> by a single cdata node holding the serialized
-- content; with a <wrapper> that text is first enclosed in an element of
-- that name (a few more helpers are in the aux module).

function xml.tocdata(e,wrapper)
    local content
    if type(e) == "table" then
        content = xmltostring(e.dt)
    end
    content = content or e or ""
    if wrapper then
        content = formatters["<%s>%s</%s>"](wrapper,content,wrapper)
    end
    local cdata = {
        special = true,
        ns      = "",
        tg      = "@cd@",
        at      = { },
        rn      = "",
        dt      = { content },
        __p__   = e,
    }
    setmetatable(cdata,getmetatable(e))
    e.dt = { cdata }
end
1954
-- Marks a document as standalone: the first processing instruction under
-- <root> that looks like an xml declaration gets standalone='yes'. When the
-- declaration already has a standalone attribute its value is set to yes
-- instead of adding a second attribute, so calling this twice is harmless.
-- Only trees with a "ri" field are touched. Returns <root>.

function xml.makestandalone(root)
    if root.ri then
        local dt = root.dt
        for k=1,#dt do
            local v = dt[k]
            if type(v) == "table" and v.special and v.tg == "@pi@" then
                local txt = v.dt[1]
                if type(txt) == "string" and find(txt,"xml.*version=") then
                    -- keep the quote style of an existing attribute
                    local new, n = gsub(txt,"(standalone%s*=%s*)([\"'])[^\"']*%2","%1%2yes%2",1)
                    if n == 0 then
                        new = txt .. " standalone='yes'"
                    end
                    v.dt[1] = new
                    break
                end
            end
        end
    end
    return root
end
1971
do

    -- special tags and the kind that they represent
    local specials = {
        ["@cd@"] = "cdata",
        ["@cm@"] = "comment",
        ["@pi@"] = "instruction",
        ["@dt@"] = "declaration",
    }

    -- Classifies the content of <e>: "empty" when there is no content,
    -- "mixed" when there is more than one child, and otherwise the kind of
    -- the single child: "text", "cdata", "comment", "instruction",
    -- "declaration" or "element".

    function xml.kind(e)
        local dt = e and e.dt
        if not dt then
            return "empty"
        end
        local n = #dt
        if n == 0 then
            return "empty"
        elseif n > 1 then
            return "mixed"
        end
        local d = dt[1]
        if type(d) == "string" then
            return "text"
        end
        return d.special and specials[d.tg] or "element"
    end

end
1999