lxml-tab.lua /size: 60 Kb    last modification: 2024-01-16 09:02
-- Register this file in the global modules table (created on demand).

modules = modules or { }

modules['lxml-tab'] = {
    version   = 1.001,
    comment   = "this module is the basis for the lxml-* ones",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9-- this module needs a cleanup: check latest lpeg, passing args, (sub)grammar, etc etc
10-- stripping spaces from e.g. cont-en.xml saves .2 sec runtime so it's not worth the
11-- trouble
12
13-- todo: when serializing optionally remap named entities to hex (if known in char-ent.lua)
14-- maybe when letter -> utf, else name .. then we need an option to the serializer .. a bit
15-- of work so we delay this till we cleanup
16
-- Entity tracing is switched with the "xml.entities" tracker.

local trace_entities = false

trackers.register("xml.entities", function(v)
    trace_entities = v
end)

-- Without the logs namespace we fall back on a plain formatted print.

local report_xml = logs and logs.reporter("xml","core")

if not report_xml then
    report_xml = function(...)
        print(string.format(...))
    end
end

-- The parser used here is inspired by the variant discussed in the lua book, but
-- handles comment and processing instructions, has a different structure, provides
-- parent access; a first version used different trickery but was less optimized so
-- we went this route. First we had a find based parser, now we have an LPEG based
-- one. The find based parser can be found in l-xml-edu.lua along with other older
-- code.

if lpeg.setmaxstack then
    lpeg.setmaxstack(1000) -- deeply nested xml files
end

xml = xml or { }
local xml = xml
32
33--~ local xml = xml
34
35local concat, remove, insert = table.concat, table.remove, table.insert
36local type, next, setmetatable, getmetatable, tonumber, rawset, select = type, next, setmetatable, getmetatable, tonumber, rawset, select
37local lower, find, match, gsub = string.lower, string.find, string.match, string.gsub
38local sort = table.sort
39local utfchar = utf.char
40local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
41local P, S, R, C, V, C, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.C, lpeg.Cs
42local formatters = string.formatters
43
44-- First a hack to enable namespace resolving. A namespace is characterized by a
45-- URL. The following function associates a namespace prefix with a pattern. We use
46-- LPEG, which in this case is more than twice as fast as a find based solution
47-- where we loop over an array of patterns. Less code and much cleaner.
48
49do -- begin of namespace closure (we ran out of locals)
50
xml.xmlns = xml.xmlns or { }

-- The next function associates a namespace prefix with an URL. This normally
-- happens independent of parsing.
--
--   xml.registerns("mml","mathml")

local check = P(false) -- grows by one alternative per registration
local parse = check

-- Each registration adds the lowercased pattern as an alternative that maps onto
-- the given namespace, and rebuilds the grammar that looks for any alternative at
-- successive positions of its subject.
--
-- NOTE(review): an older remark said the pattern can be an lpeg, but the pattern
-- is passed through string.lower before it reaches P, so pass a string.

function xml.registerns(namespace, pattern)
    local alternative = C(P(lower(pattern))) / namespace
    check = check + alternative
    parse = P { P(check) + 1 * V(1) }
end
65
-- The next function also registers a namespace, but this time we map a given
-- namespace prefix onto a registered one, using the given URL. This is used for
-- attributes like 'xmlns:m'.
69--
70--   xml.checkns("m","http://www.w3.org/mathml")
71
-- Looks up the registered namespace for the (lowercased) url and, when there is
-- one that differs from the given prefix, records the remapping in xml.xmlns.

function xml.checkns(namespace,url)
    local registered = lpegmatch(parse,lower(url))
    if not registered or registered == namespace then
        return
    end
    xml.xmlns[namespace] = registered
end
78
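-- Next we provide a way to turn an URL into a registered namespace. This is used for
-- the 'xmlns' attribute.
--
--  resolvedns = xml.resolvens("http://www.w3.org/mathml")
--
-- This returns the namespace registered for the matching pattern (mml with the
-- registration shown earlier), or an empty string when nothing matches.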
85
-- Returns the namespace registered for the (lowercased) url or an empty string
-- when no registered pattern is found in it.

function xml.resolvens(url)
    local registered = lpegmatch(parse,lower(url))
    return registered or ""
end
89
90-- A namespace in an element can be remapped onto the registered one efficiently by
91-- using the 'xml.xmlns' table.
92
93end -- end of namespace closure
94
-- This version uses LPEG. We follow the same approach as before, stack and top and
-- such. This version is about twice as fast which is mostly due to the fact that we
-- don't have to prepare the stream for cdata, doctype etc etc. This variant is
-- dedicated to Luigi Scarso, who challenged me with 40 megabyte XML files that took
-- 12.5 seconds to load (1.5 for file io and the rest for tree building). With the
-- LPEG implementation we got that down to less than 7.3 seconds. Loading the 14 ConTeXt
-- interface definition files (2.6 meg) went down from 1.05 seconds to 0.55.
102--
-- Next comes the parser. The rather messy doctype definition comes in many
-- disguises so it is no surprise that later on we have to dedicate quite some LPEG
-- code to it.
106--
107--  <!DOCTYPE Something PUBLIC "... ..." "..." [ ... ] >
108--  <!DOCTYPE Something PUBLIC "... ..." "..." >
109--  <!DOCTYPE Something SYSTEM "... ..." [ ... ] >
110--  <!DOCTYPE Something SYSTEM "... ..." >
111--  <!DOCTYPE Something [ ... ] >
112--  <!DOCTYPE Something >
113--
114-- The code may look a bit complex but this is mostly due to the fact that we
115-- resolve namespaces and attach metatables. There is only one public function:
116--
117--   local x = xml.convert(somestring)
118--
119-- An optional second boolean argument tells this function not to create a root
120-- element.
121--
122-- Valid entities are:
123--
124--   <!ENTITY xxxx SYSTEM "yyyy" NDATA zzzz>
125--   <!ENTITY xxxx PUBLIC "yyyy" >
126--   <!ENTITY xxxx "yyyy" >
127
128-- not just one big nested table capture (lpeg overflow)
129
local nsremap, resolvens = xml.xmlns, xml.resolvens

-- Parser state shared by the callbacks that follow; preparexmlstate (re)sets it.
--
-- The namespace stack must be declared as xmlns, because that is the name the
-- state setup and the element callbacks read and write. It used to be declared
-- under a misspelled name, which silently turned the stack into a global.
--
-- NOTE(review): reported_at_errors is assigned by preparexmlstate too but has no
-- local declaration here; it is left alone to keep the number of locals unchanged.

local stack, level, top, at, xmlns, errorstr
local entities, parameters
local strip, utfize, resolve, cleanup, resolve_predefined, unify_predefined
local dcache, hcache, acache
local mt, dt, nt
local currentfilename, currentline, linenumbers

-- These are assigned later on; they are declared here so that the entity handlers
-- can refer to them.

local grammar_parsed_text_one
local grammar_parsed_text_two
local grammar_unparsed_text

local handle_hex_entity
local handle_dec_entity
local handle_any_entity_dtd
local handle_any_entity_text
146local handle_any_entity_text
147
-- Sets up the shared parser state for a run (when settings are given) or wipes
-- it (when called without settings). Two settings default to true when they are
-- nil and that default is also written back into the settings table.

local function preparexmlstate(settings)
    if not settings then
        -- wipe: only the line tracking keeps a non-nil value
        linenumbers        = false
        currentline        = 1
        stack, level, top, at  = nil, nil, nil, nil
        mt, dt, nt             = nil, nil, nil
        xmlns                  = nil
        errorstr               = nil
        strip, utfize, resolve = nil, nil, nil
        resolve_predefined     = nil
        unify_predefined       = nil
        cleanup                = nil
        entities, parameters   = nil, nil
        reported_at_errors     = nil
        dcache, hcache, acache = nil, nil, nil
        currentfilename        = nil
        return
    end
    -- options taken from the settings
    linenumbers        = settings.linenumbers
    strip              = settings.strip_cm_and_dt
    utfize             = settings.utfize_entities
    resolve            = settings.resolve_entities            -- enable this in order to apply the dtd
    resolve_predefined = settings.resolve_predefined_entities -- in case we have escaped entities
    unify_predefined   = settings.unify_predefined_entities   -- &#038; -> &amp;
    cleanup            = settings.text_cleanup
    entities           = settings.entities or { }
    currentfilename    = settings.currentresource
    -- fresh tree building state
    stack              = { }
    level              = 0
    top                = { }
    at                 = { }
    mt                 = { }
    dt                 = { }
    nt                 = 0   -- some 5% faster than #dt on cont-en.xml
    xmlns              = { }
    errorstr           = nil
    currentline        = 1
    parameters         = { }
    reported_at_errors = { }
    -- fresh entity caches
    dcache, hcache, acache = { }, { }, { }
    -- defaults
    if utfize == nil then
        utfize = true
        settings.utfize_entities = true
    end
    if resolve_predefined == nil then
        resolve_predefined = true
        settings.resolve_predefined_entities = true
    end
end
209
-- Installs a new shared metatable whose __index is the given root.

local function initialize_mt(root)
    local metatable = { }
    metatable.__index = root -- will be redefined later
    mt = metatable
end
213
-- Stores v under key k in the table that the metatable of root points to with
-- __index.

function xml.setproperty(root,k,v)
    local properties = getmetatable(root).__index
    properties[k] = v
end
217
-- Hook that add_end calls for extra text in its "unable to close" messages; this
-- default adds nothing.

function xml.checkerror(top,toclose)
    local extra = "" -- can be set
    return extra
end
221
local checkns = xml.checkns

-- Stores one attribute in the pending attribute table. A plain xmlns attribute
-- also pushes the resolved namespace, and an xmlns:prefix attribute is passed to
-- checkns. Namespaced attributes are keyed as "namespace:tag".

local function add_attribute(namespace,tag,value)
    if cleanup and value ~= "" then
        value = cleanup(value) -- new
    end
    local key = tag
    if tag == "xmlns" then
        xmlns[#xmlns+1] = resolvens(value)
    elseif namespace ~= "" then
        if namespace == "xmlns" then
            checkns(tag,value)
        end
        -- for the moment this way:
        key = namespace .. ":" .. tag
    end
    at[key] = value
end
241
-- Adds an empty element as next child of the element on top of the stack. The
-- attributes collected so far become those of the element, after which a new
-- attribute table is started.

local function add_empty(spacing, namespace, tag)
    if spacing ~= "" then
        nt = nt + 1
        dt[nt] = spacing
    end
    local resolved = namespace == "" and xmlns[#xmlns] or nsremap[namespace] or namespace
    local parent   = stack[level]
    local siblings = parent.dt
    local slot     = #siblings + 1
    top, dt, nt = parent, siblings, slot
    local element
    if linenumbers then
        element = {
            ns = namespace or "",
            rn = resolved,
            tg = tag,
            at = at,
            dt = { },
            ni = slot, -- set slot, needed for css filtering
            cf = currentfilename,
            cl = currentline,
            __p__ = parent,
        }
    else
        element = {
            ns = namespace or "",
            rn = resolved,
            tg = tag,
            at = at,
            dt = { },
            ni = slot, -- set slot, needed for css filtering
            __p__ = parent,
        }
    end
    siblings[slot] = element
    setmetatable(element, mt)
    if at.xmlns then
        remove(xmlns) -- the namespace pushed for this element ends here
    end
    at = { }
end
277
-- Opens an element: it is pushed on the stack and becomes the current top, with
-- an empty child list. Its slot in the parent is only set when it gets closed.

local function add_begin(spacing, namespace, tag)
    if spacing ~= "" then
        nt = nt + 1
        dt[nt] = spacing
    end
    local resolved = namespace == "" and xmlns[#xmlns] or nsremap[namespace] or namespace
    local parent   = stack[level]
    local children = { }
    local element
    if linenumbers then
        element = {
            ns = namespace or "",
            rn = resolved,
            tg = tag,
            at = at,
            dt = children,
            ni = nil, -- preset slot, needed for css filtering
            cf = currentfilename,
            cl = currentline,
            __p__ = parent,
        }
    else
        element = {
            ns = namespace or "",
            rn = resolved,
            tg = tag,
            at = at,
            dt = children,
            ni = nil, -- preset slot, needed for css filtering
            __p__ = parent,
        }
    end
    setmetatable(element, mt)
    top, dt, nt = element, children, 0
    level = level + 1
    stack[level] = element
    at = { }
end
310
-- Closes the element on top of the stack and appends it to the child list of the
-- element below it. A mismatch in tag (namespaces are not compared) or closing
-- more than was opened is reported and kept in errorstr.
--
-- NOTE(review): when level drops below 1 the new top is whatever sits in
-- stack[0]; how the stack is seeded is not visible in this part of the file.

local function add_end(spacing, namespace, tag)
    if spacing ~= "" then
        nt = nt + 1
        dt[nt] = spacing
    end
    local closing = stack[level]
    level = level - 1
    local parent = stack[level]
    top = parent
    local message
    if level < 1 then
        message = formatters["unable to close %s %s"](tag,xml.checkerror(top,closing) or "")
    elseif closing.tg ~= tag then -- no namespace check
        message = formatters["unable to close %s with %s %s"](closing.tg,tag,xml.checkerror(top,closing) or "")
    end
    if message then
        errorstr = message
        report_xml(message)
    end
    local siblings = parent.dt
    local slot     = #siblings + 1
    dt, nt = siblings, slot
    siblings[slot] = closing
    closing.ni = slot -- update slot, needed for css filtering
    if closing.at.xmlns then
        remove(xmlns)
    end
end
334
335-- local spaceonly = lpegpatterns.whitespace^0 * P(-1)
336--
337-- will be an option: dataonly
338--
339-- if #text == 0 or     lpegmatch(spaceonly,text) then
340--     return
341-- end
342
-- Adds text to the current child list. Empty text is ignored, the optional
-- cleanup function is applied first, and text that follows a string is glued to
-- that string instead of getting a slot of its own.

local function add_text(text)
    if text == "" then
        return
    end
    if cleanup then
        text = cleanup(text)
    end
    if nt > 0 and type(dt[nt]) == "string" then
        dt[nt] = dt[nt] .. text
    else
        nt = nt > 0 and nt + 1 or 1
        dt[nt] = text
    end
end
374
-- Adds a special node (the kind is passed in what) to the current child list.
-- With stripping enabled the "@cm@" and "@dt@" kinds are dropped, but preceding
-- spacing is kept in any case.

local function add_special(what, spacing, text)
    if spacing ~= "" then
        nt = nt + 1
        dt[nt] = spacing
    end
    if strip and (what == "@cm@" or what == "@dt@") then
        return -- forget it
    end
    local special
    if linenumbers then
        special = {
            special = true,
            ns      = "",
            tg      = what,
            ni      = nil, -- preset slot
            dt      = { text },
            cf      = currentfilename,
            cl      = currentline,
        }
    else
        special = {
            special = true,
            ns      = "",
            tg      = what,
            ni      = nil, -- preset slot
            dt      = { text },
        }
    end
    nt = nt + 1
    dt[nt] = special
end
401
-- Records trailing garbage (with spaces, tabs and line endings removed) as the
-- error string.

local function set_message(txt)
    local garbage = gsub(txt,"([ \n\r\t]*)","")
    errorstr = "garbage at the end of the file: " .. garbage
end
405
-- Reports an invalid attribute value once per distinct string and flags the
-- pending attribute table; the string itself is always passed on.

local function attribute_value_error(str)
    if reported_at_errors[str] then
        return str
    end
    report_xml("invalid attribute value %a",str)
    reported_at_errors[str] = true
    at._error_ = str
    return str
end
414
-- Reports an invalid attribute specification once per distinct string and flags
-- the pending attribute table; the string itself is always passed on.

local function attribute_specification_error(str)
    if reported_at_errors[str] then
        return str
    end
    report_xml("invalid attribute specification %a",str)
    reported_at_errors[str] = true
    at._error_ = str
    return str
end
423
424-- I'm sure that this lpeg can be simplified (less captures) but it evolved ...
425-- so i'm not going to change it now.
426
427do
428
429    -- In order to overcome lua limitations we wrap entity stuff in a closure.
430
431    local badentity = "&" -- was "&error;"
432
433    xml.placeholders = {
434        unknown_dec_entity = function(str) return str == "" and badentity or formatters["&%s;"](str) end,
435        unknown_hex_entity = function(str) return formatters["&#x%s;"](str) end,
436        unknown_any_entity = function(str) return formatters["&#x%s;"](str) end,
437    }
438
439    local function fromhex(s)
440        local n = tonumber(s,16)
441        if n then
442            return utfchar(n)
443        else
444            return formatters["h:%s"](s), true
445        end
446    end
447
448    local function fromdec(s)
449        local n = tonumber(s)
450        if n then
451            return utfchar(n)
452        else
453            return formatters["d:%s"](s), true
454        end
455    end
456
457    local p_rest = (1-P(";"))^0
458    local p_many = P(1)^0
459
460    local parsedentity =
461        P("&#") * (P("x")*(p_rest/fromhex) + (p_rest/fromdec)) * P(";") * P(-1) +
462        P ("#") * (P("x")*(p_many/fromhex) + (p_many/fromdec))
463
464    xml.parsedentitylpeg = parsedentity
465
466    -- parsing in the xml file
467
468    local predefined_unified = {
469        [38] = "&amp;",
470        [42] = "&quot;",
471        [47] = "&apos;",
472        [74] = "&lt;",
473        [76] = "&gt;",
474    }
475
476    local predefined_simplified = {
477        [38] = "&", amp  = "&",
478        [42] = '"', quot = '"',
479        [47] = "'", apos = "'",
480        [74] = "<", lt   = "<",
481        [76] = ">", gt   = ">",
482    }
483
484    local nofprivates = 0xF0000 -- shared but seldom used
485
486    local privates_u = { -- unescaped
487        [ [[&]] ] = "&amp;",
488        [ [["]] ] = "&quot;",
489        [ [[']] ] = "&apos;",
490        [ [[<]] ] = "&lt;",
491        [ [[>]] ] = "&gt;",
492    }
493
494    local privates_p = { -- needed for roundtrip as well as serialize to tex
495    }
496
497    local privates_s = { -- for tex
498        [ [["]] ] = "&U+22;",
499        [ [[#]] ] = "&U+23;",
500        [ [[$]] ] = "&U+24;",
501        [ [[%]] ] = "&U+25;",
502        [ [[&]] ] = "&U+26;",
503        [ [[']] ] = "&U+27;",
504        [ [[<]] ] = "&U+3C;",
505        [ [[>]] ] = "&U+3E;",
506        [ [[\]] ] = "&U+5C;",
507        [ [[{]] ] = "&U+7B;",
508        [ [[|]] ] = "&U+7C;",
509        [ [[}]] ] = "&U+7D;",
510        [ [[~]] ] = "&U+7E;",
511    }
512
513    local privates_x = { -- for xml
514        [ [["]] ] = "&U+22;",
515        [ [[#]] ] = "&U+23;",
516        [ [[$]] ] = "&U+24;",
517        [ [[%]] ] = "&U+25;",
518        [ [[']] ] = "&U+27;",
519        [ [[\]] ] = "&U+5C;",
520        [ [[{]] ] = "&U+7B;",
521        [ [[|]] ] = "&U+7C;",
522        [ [[}]] ] = "&U+7D;",
523        [ [[~]] ] = "&U+7E;",
524    }
525
526    local privates_n = {
527        -- keeps track of defined ones
528    }
529
530    utilities.storage.mark(privates_u)
531    utilities.storage.mark(privates_p)
532    utilities.storage.mark(privates_s)
533    utilities.storage.mark(privates_x)
534    utilities.storage.mark(privates_n)
535
536    local escaped       = utf.remapper(privates_u,"dynamic")
537    local unprivatized  = utf.remapper(privates_p,"dynamic")
538    local unspecialized = utf.remapper(privates_s,"dynamic")
539    local despecialized = utf.remapper(privates_x,"dynamic")
540
541    xml.unprivatized  = unprivatized
542    xml.unspecialized = unspecialized
543    xml.despecialized = despecialized
544    xml.escaped       = escaped
545
546    local function unescaped(s)
547        local p = privates_n[s]
548        if not p then
549            nofprivates = nofprivates + 1
550            p = utfchar(nofprivates)
551            privates_n[s] = p
552            s = "&" .. s .. ";" -- todo: use char-ent to map to hex
553            privates_u[p] = s
554            privates_p[p] = s
555            privates_s[p] = s
556        end
557        return p
558    end
559
560    xml.privatetoken = unescaped
561    xml.privatecodes = privates_n
562    xml.specialcodes = privates_s
563
564    function xml.addspecialcode(key,value)
565        privates_s[key] = value or "&" .. s .. ";"
566    end
567
568    handle_hex_entity = function(str)
569        local h = hcache[str]
570        if not h then
571            local n = tonumber(str,16)
572            h = unify_predefined and predefined_unified[n]
573            if h then
574                if trace_entities then
575                    report_xml("utfize, converting hex entity &#x%s; into %a",str,h)
576                end
577            elseif utfize then
578                h = (n and utfchar(n)) or xml.unknown_hex_entity(str) or ""
579                if not n then
580                    report_xml("utfize, ignoring hex entity &#x%s;",str)
581                elseif trace_entities then
582                    report_xml("utfize, converting hex entity &#x%s; into %a",str,h)
583                end
584            else
585                if trace_entities then
586                    report_xml("found entity &#x%s;",str)
587                end
588                h = "&#x" .. str .. ";"
589            end
590            hcache[str] = h
591        end
592        return h
593    end
594
595    handle_dec_entity = function(str)
596        local d = dcache[str]
597        if not d then
598            local n = tonumber(str)
599            d = unify_predefined and predefined_unified[n]
600            if d then
601                if trace_entities then
602                    report_xml("utfize, converting dec entity &#%s; into %a",str,d)
603                end
604            elseif utfize then
605                d = (n and utfchar(n)) or placeholders.unknown_dec_entity(str) or ""
606                if not n then
607                    report_xml("utfize, ignoring dec entity &#%s;",str)
608                elseif trace_entities then
609                    report_xml("utfize, converting dec entity &#%s; into %a",str,d)
610                end
611            else
612                if trace_entities then
613                    report_xml("found entity &#%s;",str)
614                end
615                d = "&#" .. str .. ";"
616            end
617            dcache[str] = d
618        end
619        return d
620    end
621
622    handle_any_entity_dtd = function(str)
623        if resolve then
624            local a = resolve_predefined and predefined_simplified[str] -- true by default
625            if a then
626                if trace_entities then
627                    report_xml("resolving entity &%s; to predefined %a",str,a)
628                end
629            else
630                if type(resolve) == "function" then
631                    a = resolve(str,entities) or entities[str]
632                else
633                    a = entities[str]
634                end
635                if a then
636                    if type(a) == "function" then
637                        if trace_entities then
638                            report_xml("expanding entity &%s; to function call",str)
639                        end
640                        a = a(str) or ""
641                    end
642                    a = lpegmatch(parsedentity,a) or a -- for nested
643                    if trace_entities then
644                        report_xml("resolving entity &%s; to internal %a",str,a)
645                    end
646                else
647                    local unknown_any_entity = placeholders.unknown_any_entity
648                    if unknown_any_entity then
649                        a = unknown_any_entity(str) or ""
650                    end
651                    if a then
652                        if trace_entities then
653                            report_xml("resolving entity &%s; to external %s",str,a)
654                        end
655                    else
656                        if trace_entities then
657                            report_xml("keeping entity &%s;",str)
658                        end
659                        if str == "" then
660                            a = badentity
661                        else
662                            a = "&" .. str .. ";"
663                        end
664                    end
665                end
666            end
667            return a
668        else
669            local a = acache[str]
670            if not a then
671                a = resolve_predefined and predefined_simplified[str]
672                if a then
673                    -- one of the predefined
674                    acache[str] = a
675                    if trace_entities then
676                        report_xml("entity &%s; becomes %a",str,a)
677                    end
678                elseif str == "" then
679                    if trace_entities then
680                        report_xml("invalid entity &%s;",str)
681                    end
682                    a = badentity
683                    acache[str] = a
684                else
685                    if trace_entities then
686                        report_xml("entity &%s; is made private",str)
687                    end
688                 -- a = "&" .. str .. ";"
689                    a = unescaped(str)
690                    acache[str] = a
691                end
692            end
693            return a
694        end
695    end
696
697    handle_any_entity_text = function(str)
698        if resolve then
699            local a = resolve_predefined and predefined_simplified[str]
700            if a then
701                if trace_entities then
702                    report_xml("resolving entity &%s; to predefined %a",str,a)
703                end
704            else
705                if type(resolve) == "function" then
706                    a = resolve(str,entities) or entities[str]
707                else
708                    a = entities[str]
709                end
710                if a then
711                    if type(a) == "function" then
712                        if trace_entities then
713                            report_xml("expanding entity &%s; to function call",str)
714                        end
715                        a = a(str) or ""
716                    end
717                    a = lpegmatch(grammar_parsed_text_two,a) or a
718                    if type(a) == "number" then
719                        return ""
720                    else
721                        a = lpegmatch(parsedentity,a) or a -- for nested
722                        if trace_entities then
723                            report_xml("resolving entity &%s; to internal %a",str,a)
724                        end
725                    end
726                    if trace_entities then
727                        report_xml("resolving entity &%s; to internal %a",str,a)
728                    end
729                else
730                    local unknown_any_entity = placeholders.unknown_any_entity
731                    if unknown_any_entity then
732                        a = unknown_any_entity(str) or ""
733                    end
734                    if a then
735                        if trace_entities then
736                            report_xml("resolving entity &%s; to external %s",str,a)
737                        end
738                    else
739                        if trace_entities then
740                            report_xml("keeping entity &%s;",str)
741                        end
742                        if str == "" then
743                            a = badentity
744                        else
745                            a = "&" .. str .. ";"
746                        end
747                    end
748                end
749            end
750            return a
751        else
752            local a = acache[str]
753            if not a then
754                a = resolve_predefined and predefined_simplified[str]
755                if a then
756                    -- one of the predefined
757                    acache[str] = a
758                    if trace_entities then
759                        report_xml("entity &%s; becomes %a",str,a)
760                    end
761                elseif str == "" then
762                    if trace_entities then
763                        report_xml("invalid entity &%s;",str)
764                    end
765                    a = badentity
766                    acache[str] = a
767                else
768                    if trace_entities then
769                        report_xml("entity &%s; is made private",str)
770                    end
771                 -- a = "&" .. str .. ";"
772                    a = unescaped(str)
773                    acache[str] = a
774                end
775            end
776            return a
777        end
778    end
779
780    -- for tex
781
782    local p_rest = (1-P(";"))^1
783
784    local spec = {
785        [0x23] = "\\Ux{23}", -- #
786        [0x24] = "\\Ux{24}", -- $
787        [0x25] = "\\Ux{25}", -- %
788        [0x5C] = "\\Ux{5C}", -- \
789        [0x7B] = "\\Ux{7B}", -- {
790        [0x7C] = "\\Ux{7C}", -- |
791        [0x7D] = "\\Ux{7D}", -- }
792        [0x7E] = "\\Ux{7E}", -- ~
793    }
794
795    local hash = table.setmetatableindex(spec,function(t,k)
796        local v = utfchar(k)
797        t[k] = v
798        return v
799    end)
800
801    local function fromuni(s)
802        local n = tonumber(s,16)
803        if n then
804            return hash[n]
805        else
806            return formatters["u:%s"](s), true
807        end
808    end
809
810    local function fromhex(s)
811        local n = tonumber(s,16)
812        if n then
813            return hash[n]
814        else
815            return formatters["h:%s"](s), true
816        end
817    end
818
819    local function fromdec(s)
820        local n = tonumber(s)
821        if n then
822            return hash[n]
823        else
824            return formatters["d:%s"](s), true
825        end
826    end
827
828    local reparsedentity =
829        P("U+") * (p_rest/fromuni)
830      + P("#")  * (
831            P("x") * (p_rest/fromhex)
832          + p_rest/fromdec
833        )
834
835    local hash = table.setmetatableindex(function(t,k)
836        local v = utfchar(k)
837        t[k] = v
838        return v
839    end)
840
841    local function fromuni(s)
842        local n = tonumber(s,16)
843        if n then
844            return hash[n]
845        else
846            return formatters["u:%s"](s), true
847        end
848    end
849
850    local function fromhex(s)
851        local n = tonumber(s,16)
852        if n then
853            return hash[n]
854        else
855            return formatters["h:%s"](s), true
856        end
857    end
858
859    local function fromdec(s)
860        local n = tonumber(s)
861        if n then
862            return hash[n]
863        else
864            return formatters["d:%s"](s), true
865        end
866    end
867
868    local unescapedentity =
869        P("U+") * (p_rest/fromuni)
870      + P("#")  * (
871            P("x") * (p_rest/fromhex)
872          + p_rest/fromdec
873        )
874
875    xml.reparsedentitylpeg  = reparsedentity   -- with \Ux{...} for special tex entities
876    xml.unescapedentitylpeg = unescapedentity  -- normal characters
877
878end
879
880-- we use these later on
881
-- Aliases for what the entity closure exported.
-- NOTE(review): no assignment to xml.unescaped is visible in this part of the
-- file (the private token function is exported as xml.privatetoken), so unless
-- it is set elsewhere the unescaped alias is nil; to be confirmed.
local escaped      = xml.escaped
local unescaped    = xml.unescaped
local placeholders = xml.placeholders
885
886--
887
-- Reports an entity that lacks its closing semicolon and passes the string on.

local function handle_end_entity(str)
    local expected = ";"
    report_xml("error in entity, %a found without ending %a",str,expected)
    return str
end
892
-- Reports an unexpected character, but still adds it to the tree as text and
-- passes it on.

local function handle_crap_error(chr)
    local unexpected = chr
    report_xml("error in parsing, unexpected %a found ",unexpected)
    add_text(unexpected)
    return unexpected
end
898
-- Bumps the line counter; nothing is returned.

local function handlenewline()
    local nextline = currentline + 1
    currentline = nextline
end
902
903-- first = ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#x00F8-#x02FF] |
904--         [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
905--         [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
906--         [#x10000-#xEFFFF]
907-- rest  = "-" | "." | [0-9] | #xB7 | [#x300-#x36F] | [#x203F-#x2040]
908-- name  = first + (first + rest)^1
909--
910-- We assume utf and do no real checking!
911
-- Basic building blocks for the grammars. The order matters as later patterns
-- are composed from earlier ones.

-- whitespace; the newline pattern also calls handlenewline for each match
local spacetab         = S(' \t')
local space            = S(' \r\n\t')
local newline          = lpegpatterns.newline / handlenewline
local anything         = P(1)
-- single characters that delimit markup
local open             = P('<')
local close            = P('>')
local squote           = S("'")
local dquote           = S('"')
local equal            = P('=')
local slash            = P('/')
local colon            = P(':')
local semicolon        = P(';')
local ampersand        = P('&')
-- names: a letter, underscore or byte above 127, followed by the same plus
-- digits, hyphen and period
----- valid_0          = lpegpatterns.utf8two + lpegpatterns.utf8three + lpegpatterns.utf8four
local valid_0          = R("\128\255") -- basically any encoding without checking (fast)
local valid_1          = R('az', 'AZ') + S('_') + valid_0
local valid_2          = valid_1 + R('09') + S('-.')
local valid            = valid_1 * valid_2^0
-- both name variants produce two captures: the namespace (an empty string when
-- there is no colon) and the tag
local name_yes         = C(valid^1) * colon * C(valid^1)
local name_nop         = C(P(true)) * C(valid^1)
local name             = name_yes + name_nop
local utfbom           = lpegpatterns.utfbom -- no capture
local spacing          = C(space^0)

-- variants built on the line counting newline pattern
local space_nl         = spacetab + newline
local spacing_nl       = Cs((space_nl)^0)
local anything_nl      = newline + P(1)
939
-- Stores an entity that was declared with a leading "%" in the doctype; these
-- end up in the parameters table instead of the regular entities table.
local function weirdentity(k,v)
    if trace_entities then
        local kind = "weird"
        report_xml("registering %s entity %a as %a",kind,k,v)
    end
    parameters[k] = v
end
-- Stores a plain name/value entity declaration in the entities table.
local function normalentity(k,v)
    if trace_entities then
        local kind = "normal"
        report_xml("registering %s entity %a as %a",kind,k,v)
    end
    entities[k] = v
end
-- Stores a SYSTEM entity declaration; the third capture (n, the NDATA name)
-- is accepted but not used.
local function systementity(k,v,n)
    if trace_entities then
        local kind = "system"
        report_xml("registering %s entity %a as %a",kind,k,v)
    end
    entities[k] = v
end
-- Stores a PUBLIC entity declaration; a third argument is accepted for
-- symmetry with systementity but not used.
local function publicentity(k,v,n)
    if trace_entities then
        local kind = "public"
        report_xml("registering %s entity %a as %a",kind,k,v)
    end
    entities[k] = v
end
-- Loads an external entity file and runs the given pattern over its content.
--
-- pattern : lpeg pattern that is matched against the loaded data
-- k, v    : the captured doctype name and public identifier (only reported)
-- n       : file name; when missing or not loadable the set is ignored
--
-- The file is read with resolvers.loadbinfile when available, otherwise with
-- io.loaddata. Nothing is returned.
local function entityfile(pattern,k,v,n)
    local okay, data = false, nil
    if n then
        local loadbinfile = resolvers and resolvers.loadbinfile
        if loadbinfile then
            okay, data = loadbinfile(n)
        else
            data = io.loaddata(n)
            okay = data and data ~= ""
        end
    end
    if not okay then
        report_xml("ignoring public entities %a as %a from %a",k,v,n)
        return
    end
    if trace_entities then
        report_xml("loading public entities %a as %a from %a",k,v,n)
    end
    lpegmatch(pattern,data)
end
984
-- Builds the three parser grammars from a set of whitespace/any-character
-- primitives, so that the same grammar can be instantiated twice: once with
-- plain patterns and once with newline counting variants.
--
-- spacenewline : pattern for one whitespace character (used in tags/doctypes)
-- spacing      : pattern capturing the whitespace that precedes a tag
-- anything     : pattern that matches one arbitrary character
--
-- Returns: grammar_parsed_text_one (preamble only), grammar_parsed_text_two
-- (root element and trailer, entities handled) and grammar_unparsed_text
-- (complete document, text passed as-is).
--
-- The patterns call the add_* and handle_* callbacks that are defined
-- elsewhere in this file, so matching has side effects on the parser state.

local function install(spacenewline,spacing,anything)

    local anyentitycontent = (1-open-semicolon-space-close-ampersand)^0
    local hexentitycontent = R("AF","af","09")^1
    local decentitycontent = R("09")^1
    local parsedentity     = P("#")/"" * (
                                    P("x")/"" * (hexentitycontent/handle_hex_entity) +
                                                (decentitycontent/handle_dec_entity)
                                ) +             (anyentitycontent/handle_any_entity_dtd) -- can be Cc(true)
    local parsedentity_text= P("#")/"" * (
                                    P("x")/"" * (hexentitycontent/handle_hex_entity) +
                                                (decentitycontent/handle_dec_entity)
                                ) +             (anyentitycontent/handle_any_entity_text) -- can be Cc(false)
    -- second alternative: an ampersand that is not followed by a terminated entity
    local entity           = (ampersand/"") * parsedentity   * (semicolon/"")
                           + ampersand * (anyentitycontent / handle_end_entity)
    local entity_text      = (ampersand/"") * parsedentity_text * (semicolon/"")
                           + ampersand * (anyentitycontent / handle_end_entity)

    local text_unparsed    = Cs((anything-open)^1)
    local text_parsed      = (Cs((anything-open-ampersand)^1)/add_text + Cs(entity_text)/add_text)^1
--     local text_parsed      = ((Cs(((anything-open-ampersand)^1) + entity_text))/add_text)^1

    local somespace        = (spacenewline)^1
    local optionalspace    = (spacenewline)^0

    local value            = (squote * Cs((entity + (anything - squote))^0) * squote) + (dquote * Cs((entity + (anything - dquote))^0) * dquote) -- ampersand and < also invalid in value

    local endofattributes  = slash * close + close -- recovery of flaky html
    local whatever         = space * name * optionalspace * equal
    local wrongvalue       = Cs(P(entity + (1-space-endofattributes))^1) / attribute_value_error

    local attributevalue   = value + wrongvalue

    local attribute        = (somespace * name * optionalspace * equal * optionalspace * attributevalue) / add_attribute

 -- local attributes       = (attribute + somespace^-1 * (((1-endofattributes)^1)/attribute_specification_error))^0
    local attributes       = (attribute + somespace^-1 * (((anything-endofattributes)^1)/attribute_specification_error))^0

    local parsedtext       = text_parsed   -- / add_text
    local unparsedtext     = text_unparsed / add_text
    local balanced         = P { "[" * ((anything - S"[]") + V(1))^0 * "]" } -- taken from lpeg manual, () example

    -- todo: combine empty and begin so that we scan attributes only once .. maybe also go for match time captures

    local emptyelement     = (spacing * open         * name * attributes * optionalspace * slash * close) / add_empty
    local beginelement     = (spacing * open         * name * attributes * optionalspace         * close) / add_begin
    local endelement       = (spacing * open * slash * name              * optionalspace         * close) / add_end

--     local commonelement    =  spacing * open         * name * attributes * optionalspace *
--     local cemptyelement    = (slash * close) / add_empty
--     local cbeginelement    = (      * close) / add_begin

    -- todo: combine the opens in:

    local begincomment     = open * P("!--")
    local endcomment       = P("--") * close
    local begininstruction = open * P("?")
    local endinstruction   = P("?") * close
    local begincdata       = open * P("![CDATA[")
    local endcdata         = P("]]") * close

    local someinstruction  = C((anything - endinstruction)^0)
    local somecomment      = C((anything - endcomment    )^0)
    local somecdata        = C((anything - endcdata      )^0)

    -- todo: separate dtd parser

    local begindoctype     = open * P("!DOCTYPE")
    local enddoctype       = close
    local beginset         = P("[")
    local endset           = P("]")
    local wrdtypename      = C((anything-somespace-P(";"))^1)
    local doctypename      = C((anything-somespace-close)^0)
    local elementdoctype   = optionalspace * P("<!ELEMENT") * (anything-close)^0 * close

    local basiccomment     = begincomment * ((anything - endcomment)^0) * endcomment

    local weirdentitytype  = P("%") * (somespace * doctypename * somespace * value) / weirdentity
    local normalentitytype = (doctypename * somespace * value) / normalentity
    local publicentitytype = (doctypename * somespace * P("PUBLIC") * somespace * value) / publicentity

    local systementitytype = (doctypename * somespace * P("SYSTEM") * somespace * value * somespace * P("NDATA") * somespace * doctypename)/systementity
    local entitydoctype    = optionalspace * P("<!ENTITY") * somespace * (systementitytype + publicentitytype + normalentitytype + weirdentitytype) * optionalspace * close

    -- the closures below refer to the entitydoctype variable, so when they
    -- run they see the extended pattern that is assigned further down

    local publicentityfile = (doctypename * somespace * P("PUBLIC") * somespace * value * (somespace * value)^0) / function(...)
        entityfile(entitydoctype,...)
    end

    local function weirdresolve(s)
        lpegmatch(entitydoctype,parameters[s])
    end

    local function normalresolve(s)
        lpegmatch(entitydoctype,entities[s])
    end

    local entityresolve    = P("%") * (wrdtypename/weirdresolve ) * P(";")
                           + P("&") * (wrdtypename/normalresolve) * P(";")

    entitydoctype          = entitydoctype + entityresolve

    -- we accept comments in doctypes

    local doctypeset       = beginset * optionalspace * P(elementdoctype + entitydoctype + entityresolve + basiccomment + space)^0 * optionalspace * endset
    local definitiondoctype= doctypename * somespace * doctypeset
    local publicdoctype    = doctypename * somespace * P("PUBLIC") * somespace * value * somespace * value * somespace * doctypeset
    local systemdoctype    = doctypename * somespace * P("SYSTEM") * somespace * value * somespace * doctypeset
    local simpledoctype    = (anything-close)^1 -- * balanced^0
    local somedoctype      = C((somespace * (publicentityfile + publicdoctype + systemdoctype + definitiondoctype + simpledoctype) * optionalspace)^0)

    local instruction      = (spacing * begininstruction * someinstruction * endinstruction) / function(...) add_special("@pi@",...) end
    local comment          = (spacing * begincomment     * somecomment     * endcomment    ) / function(...) add_special("@cm@",...) end
    local cdata            = (spacing * begincdata       * somecdata       * endcdata      ) / function(...) add_special("@cd@",...) end
    local doctype          = (spacing * begindoctype     * somedoctype     * enddoctype    ) / function(...) add_special("@dt@",...) end

    local crap_parsed     = anything - beginelement - endelement - emptyelement - begininstruction - begincomment - begincdata - ampersand
    local crap_unparsed   = anything - beginelement - endelement - emptyelement - begininstruction - begincomment - begincdata

    -- last resort alternatives: report the unexpected content but keep it as text
    local parsedcrap      = Cs((crap_parsed^1 + entity_text)^1) / handle_crap_error
    local unparsedcrap    = Cs((crap_unparsed              )^1) / handle_crap_error

    --  nicer but slower:
    --
    --  local instruction = (Cc("@pi@") * spacing * begininstruction * someinstruction * endinstruction) / add_special
    --  local comment     = (Cc("@cm@") * spacing * begincomment     * somecomment     * endcomment    ) / add_special
    --  local cdata       = (Cc("@cd@") * spacing * begincdata       * somecdata       * endcdata      ) / add_special
    --  local doctype     = (Cc("@dt@") * spacing * begindoctype     * somedoctype     * enddoctype    ) / add_special

    local trailer = space^0 * (text_unparsed/set_message)^0

    --  comment + emptyelement + text + cdata + instruction + V("parent"), -- 6.5 seconds on 40 MB database file
    --  text + comment + emptyelement + cdata + instruction + V("parent"), -- 5.8
    --  text + V("parent") + emptyelement + comment + cdata + instruction, -- 5.5

    -- local grammar_parsed_text = P { "preamble",
    --     preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer,
    --     parent   = beginelement * V("children")^0 * endelement,
    --     children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction + parsedcrap,
    -- }

    local grammar_parsed_text_one = P { "preamble",
        preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0,
    }

    local grammar_parsed_text_two = P { "followup",
        followup = V("parent") * trailer,
        parent   = beginelement * V("children")^0 * endelement,
        children = parsedtext + V("parent") + emptyelement + comment + cdata + instruction + parsedcrap,
    }

--     local grammar_parsed_text_two = P { "followup",
--         followup = beginelement * V("children")^0 * endelement * trailer,
--         children = parsedtext + beginelement * V("children")^0 * endelement + emptyelement + comment + cdata + instruction + parsedcrap,
--     }

-- local grammar_parsed_text_two = P { "followup",
--     followup = commonelement * cbeginelement * V("children")^0 * endelement * trailer,
--     children = parsedtext + commonelement * (cbeginelement * V("children")^0 * endelement + cemptyelement) + comment + cdata + instruction + parsedcrap,
-- }

    local grammar_unparsed_text = P { "preamble",
        preamble = utfbom^0 * instruction^0 * (doctype + comment + instruction)^0 * V("parent") * trailer,
        parent   = beginelement * V("children")^0 * endelement,
        children = unparsedtext + V("parent") + emptyelement + comment + cdata + instruction + unparsedcrap,
    }

    return grammar_parsed_text_one, grammar_parsed_text_two, grammar_unparsed_text

end
1155
-- Two instances of every grammar are built up front. The "nop" set uses the
-- plain whitespace and any-character patterns; the "yes" set uses the variants
-- that run handlenewline on each newline so that the line counter is kept up
-- to date. The converter picks one set based on settings.linenumbers.

local
    grammar_parsed_text_one_nop ,
    grammar_parsed_text_two_nop ,
    grammar_unparsed_text_nop   = install(space, spacing, anything)

local
    grammar_parsed_text_one_yes ,
    grammar_parsed_text_two_yes ,
    grammar_unparsed_text_yes   = install(space_nl, spacing_nl, anything_nl)
1165
1166-- maybe we will add settings to result as well
1167
-- The unprotected converter: turns a string with xml into a tree.
--
-- data     : the xml string; an empty or missing value gives an error tree and
--            the value true signals that an earlier attempt crashed
-- settings : optional table (no_root, strip_cm_and_dt, given_entities,
--            parent_root, error_handler, linenumbers, preprocessor,
--            currentresource)
-- detail   : optional error message that is used when data is true
--
-- The result is the root of the tree. Unless settings.no_root is set the tree
-- is wrapped in a special "@rt@" node that carries the entities and settings.
-- On failure the tree contains a single "error" element, result.error is set
-- and the message ends up in result.statistics.errormessage.
--
-- Errors are reported with settings.error_handler when that is a function,
-- otherwise with xml.errorhandler; setting error_handler to false silences the
-- report.

local function _xmlconvert_(data,settings,detail)
    settings = settings or { } -- no_root strip_cm_and_dt given_entities parent_root error_handler
    preparexmlstate(settings)
    if settings.linenumbers then
        grammar_parsed_text_one = grammar_parsed_text_one_yes
        grammar_parsed_text_two = grammar_parsed_text_two_yes
        grammar_unparsed_text   = grammar_unparsed_text_yes
    else
        grammar_parsed_text_one = grammar_parsed_text_one_nop
        grammar_parsed_text_two = grammar_parsed_text_two_nop
        grammar_unparsed_text   = grammar_unparsed_text_nop
    end
    local preprocessor = settings.preprocessor
    if data and data ~= "" and type(preprocessor) == "function" then
        data = preprocessor(data,settings) or data -- settings.currentresource
    end
    if settings.parent_root then
        -- share the metatable of the tree that we inherit from
        mt = getmetatable(settings.parent_root)
    else
        initialize_mt(top)
    end
    level = level + 1
    stack[level] = top
    top.dt = { }
    dt = top.dt
    nt = 0
    if not data or data == "" then
        errorstr = "empty xml file"
    elseif data == true then
        errorstr = detail or "problematic xml file"
    elseif utfize or resolve then
        -- two steps: first the preamble, then the root element, starting
        -- at the position where the preamble ended
        local m = lpegmatch(grammar_parsed_text_one,data)
        if m then
            m = lpegmatch(grammar_parsed_text_two,data,m)
        end
     -- local m = lpegmatch(grammar_parsed_text,data)
        if m then
         -- errorstr = "" can be set!
        else
            errorstr = "invalid xml file - parsed text"
        end
    elseif type(data) == "string" then
        if lpegmatch(grammar_unparsed_text,data) then
            errorstr = ""
        else
            errorstr = "invalid xml file - unparsed text"
        end
    else
        errorstr = "invalid xml file - no text at all"
    end
    local result
    if errorstr and errorstr ~= "" then
        result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at = { }, er = true } } }
        setmetatable(result, mt)
        setmetatable(result.dt[1], mt)
        setmetatable(stack, mt)
        local errorhandler = settings.error_handler
        if errorhandler == false then
            -- no error message
        else
            -- only a real function can replace the default reporter
            if type(errorhandler) ~= "function" then
                errorhandler = xml.errorhandler
            end
            if errorhandler then
                local currentresource = settings.currentresource
                if currentresource and currentresource ~= "" then
                    errorhandler(formatters["load error in [%s]: %s"](currentresource,errorstr),currentresource)
                else
                    errorhandler(formatters["load error: %s"](errorstr))
                end
            end
        end
    else
        result = stack[1]
    end
    if not settings.no_root then
        result = { special = true, ns = "", tg = '@rt@', dt = result.dt, at={ }, entities = entities, settings = settings }
        setmetatable(result, mt)
        local rdt = result.dt
        -- locate the first non special child: that one is the root element
        for k=1,#rdt do
            local v = rdt[k]
            if type(v) == "table" and not v.special then -- always table -)
                result.ri = k -- rootindex
                v.__p__ = result  -- new, experiment, else we cannot go back to settings, we need to test this !
                break
            end
        end
    end
    if errorstr and errorstr ~= "" then
        result.error = true
    else
        errorstr = nil
    end
    result.statistics = {
        errormessage = errorstr,
        entities = {
            decimals      = dcache,
            hexadecimals  = hcache,
            names         = acache,
            intermediates = parameters,
        }
    }
    preparexmlstate() -- resets
    return result
end
1271
1272-- Because we can have a crash (stack issues) with faulty xml, we wrap this one
1273-- in a protector:
1274
-- Protected front end of the converter. When the real conversion raises an
-- error, the converter is called a second time with true instead of data so
-- that an error tree is returned; a string error message is passed on as the
-- detail that ends up in that tree.

local function xmlconvert(data,settings)
    local ok, result = pcall(_xmlconvert_,data,settings)
    if ok then
        return result
    end
    local detail = nil
    if type(result) == "string" then
        detail = result
    end
    return _xmlconvert_(true,settings,detail)
end

xml.convert = xmlconvert
1287
-- Converts data using the settings of an already loaded tree (xmldata), which
-- is also registered as parent_root in those settings. With cleanup set, the
-- content string of a leading "@pi@" node that starts with "xml" is removed
-- from that node.

function xml.inheritedconvert(data,xmldata,cleanup) -- xmldata is parent
    local settings = xmldata.settings
    if settings then
        settings.parent_root = xmldata -- to be tested
    end
 -- settings.no_root = true
    local xc = xmlconvert(data,settings) -- hm, we might need to locate settings
    if not cleanup then
        return xc
    end
    local children = xc.dt
    local first = children and children[1]
    if first and first.tg == "@pi@" then
        local content = first.dt
        local pi = content and content[1]
        if type(pi) == "string" and find(pi,"^xml") then
            remove(content,1)
        end
    end
 -- xc.settings = nil
 -- xc.entities = nil
 -- xc.special = nil
 -- xc.ri = nil
 -- print(xc.tg)
    return xc
end
1315
1316-- Packaging data in an xml like table is done with the following function. Maybe it
1317-- will go away (when not used).
1318
-- Checks if the first child of the root is a table that is not flagged as an
-- error element. Beware: xml.is_valid is defined again a few lines further
-- down in this file and that later definition replaces this one.

function xml.is_valid(root)
    local first = root and root.dt and root.dt[1]
    return first and type(first) == "table" and not first.er
end
1322
-- Wraps data in an element table. The tag is split at the last colon into a
-- namespace and a tag name; without attributes an empty table is used and
-- without data the content becomes an empty string. The element gets the
-- current tree metatable.

function xml.package(tag,attributes,data)
    local ns, tg = match(tag,"^(.-):?([^:]+)$")
    return setmetatable({ ns = ns, tg = tg, dt = data or "", at = attributes or {} }, mt)
end
1329
-- A tree is considered valid when it exists and its error flag is not set
-- (the converter sets root.error when loading failed).

function xml.is_valid(root)
    if not root then
        return root
    end
    return not root.error
end
1333
-- default reporter for load errors; the converter falls back on this one
xml.errorhandler = report_xml
1335
1336-- We cannot load an LPEG from a filehandle so we need to load the whole file first.
1337-- The function accepts a string representing a filename or a file handle.
1338
-- Loads and converts a file.
--
-- filename : a file name (string) or an open file handle
-- settings : optional converter settings; currentresource is set for the
--            duration of the conversion and cleared afterwards
--
-- A file that cannot be opened results in empty data and therefore in an
-- error tree from the converter.

function xml.load(filename,settings)
    local data = ""
    if type(filename) == "string" then
     -- local data = io.loaddata(filename) -- todo: check type in io.loaddata
        local f = io.open(filename,'r') -- why not 'rb'
        if f then
            data = f:read("*all") -- io.readall(f) ... only makes sense for large files
            f:close()
        end
    elseif filename then -- filehandle
        data = filename:read("*all") -- io.readall(f) ... only makes sense for large files
    end
    if not settings then
        return xmlconvert(data,{ currentresource = filename })
    end
    settings.currentresource = filename
    local result = xmlconvert(data,settings)
    settings.currentresource = nil
    return result
end
1360
1361-- When we inject new elements, we need to convert strings to valid trees, which is
1362-- what the next function does.
1363
-- shared settings table for conversions that should not get a root wrapper
local no_root = { no_root = true }

-- Converts a string into a tree (without "@rt@" wrapper); anything that is
-- not a string is returned untouched. When the conversion yields more than
-- one value the list of values is returned, otherwise just the single tree.

function xml.toxml(data)
    if type(data) ~= "string" then
        return data
    end
    local root = { xmlconvert(data,no_root) }
    if #root > 1 then
        return root
    end
    return root[1]
end
1374
1375-- For copying a tree we use a dedicated function instead of the generic table
1376-- copier. Since we know what we're dealing with we can speed up things a bit. The
1377-- second argument is not to be used!
1378
1379-- local function copy(old)
1380--     if old then
1381--         local new = { }
1382--         for k,v in next, old do
1383--             if type(v) == "table" then
1384--                 new[k] = table.copy(v)
1385--             else
1386--                 new[k] = v
1387--             end
1388--         end
1389--         local mt = getmetatable(old)
1390--         if mt then
1391--             setmetatable(new,mt)
1392--         end
1393--         return new
1394--     else
1395--         return { }
1396--     end
1397-- end
1398--
1399-- We need to prevent __p__ recursion, so:
1400
-- Copies an element (sub)tree.
--
-- old : the element to copy; when nil or false a fresh empty table is returned
-- p   : used in the recursion only, callers should not pass it
--
-- The attribute table ("at") is copied key by key and the content table ("dt")
-- is copied recursively; all other fields are shared with the original. The
-- metatable of the original is reused for the copy.
--
-- NOTE(review): the __p__ field of the original dt table is cleared and p is
-- stored in the __p__ field of the new dt table (not in the new element); this
-- is kept as found -- confirm that this is what is intended.

local function copy(old,p)
    if old then
        local new = { }
        for k, v in next, old do
            if k == "at" then
                local t = { }
                for k, v in next, v do
                    t[k] = v
                end
                new[k] = t
            elseif k == "dt" then
                v.__p__ = nil
                local t = { }
                for i=1,#v do
                    local vi = v[i]
                    if type(vi) == "table" then
                        t[i] = copy(vi,new)
                    else
                        t[i] = vi
                    end
                end
                new[k] = t
                t.__p__ = p
            else
                new[k] = v -- so we also share entities, etc in root
            end
        end
        local mt = getmetatable(old)
        if mt then
            setmetatable(new,mt)
        end
        return new
    else
        return { }
    end
end
1438
-- public name of the tree copier; pass only the element to copy
xml.copy = copy
1440
-- In ConTeXt serializing the tree or parts of the tree is a major activity which
-- is why the following function is pretty optimized resulting in a few more lines
-- of code than needed. The variant that uses the formatting function for all
-- components is about 15% slower than the concatenating alternative.
1445
1446-- todo: add <?xml version='1.0' standalone='yes'?> when not present
1447
-- Makes sure that a tree with a root index starts with an xml declaration:
-- when none of the top level "@pi@" nodes looks like one, a declaration and a
-- newline are inserted at the start of root.dt. Trees without root.ri are left
-- alone. Processing instructions without string content are skipped instead
-- of triggering an error in find.

function xml.checkbom(root) -- can be made faster
    if root.ri then
        local dt = root.dt
        for k=1,#dt do
            local v = dt[k]
            if type(v) == "table" and v.special and v.tg == "@pi@" then
                local vdt = v.dt
                local content = vdt and vdt[1]
                if type(content) == "string" and find(content,"xml.*version=") then
                    return
                end
            end
        end
        insert(dt, 1, { special = true, ns = "", tg = "@pi@", dt = { "xml version='1.0' standalone='yes'" } } )
        insert(dt, 2, "\n" )
    end
end
1461
1462-- At the cost of some 25% runtime overhead you can first convert the tree to a
1463-- string and then handle the lot.
1464
1465-- new experimental reorganized serialize
1466
-- formats one attribute as key=value with the value quoted by %q
local f_attribute = formatters['%s=%q']

-- we could reuse ats .. for high performance we could also
-- have a multiple handle calls instead of multiple arguments
-- but it's not that critical

-- Serializes a regular element.
--
-- e        : the element (fields ns, tg, at, dt and optionally rn are used)
-- handlers : handler set; handlers.handle receives the output fragments and
--            handlers.serialize is called for child elements
-- escape   : not used in this function
--
-- Attributes are emitted sorted by name with escaped values, string children
-- are escaped, and elements without children become self closing tags. The
-- branches are deliberately written out so that each tag is passed to handle
-- in a single call.

local function verbose_element(e,handlers,escape) -- options
    local handle = handlers.handle
    local serialize = handlers.serialize
    local ens, etg, eat, edt, ern = e.ns, e.tg, e.at, e.dt, e.rn
    -- ats starts as a list of attribute names and ends up as one string
    local ats = eat and next(eat) and { }
    if ats then
        -- we now sort attributes
        local n = 0
        for k in next, eat do
            n = n + 1
            ats[n] = k
        end
        if n == 1 then
            local k = ats[1]
            ats = f_attribute(k,escaped(eat[k]))
        else
            sort(ats)
            for i=1,n do
                local k = ats[i]
                ats[i] = f_attribute(k,escaped(eat[k]))
            end
            ats = concat(ats," ")
        end
    end
    -- NOTE(review): e.rn replaces the namespace only when entity tracing is
    -- enabled -- confirm that trace_entities is the intended switch here
    if ern and trace_entities and ern ~= ens then
        ens = ern
    end
    local n = edt and #edt
    if ens ~= "" then
        -- namespaced element
        if n and n > 0 then
            if ats then
                handle("<",ens,":",etg," ",ats,">")
            else
                handle("<",ens,":",etg,">")
            end
            for i=1,n do
                local e = edt[i]
                if type(e) == "string" then
                    handle(escaped(e))
                else
                    serialize(e,handlers)
                end
            end
            handle("</",ens,":",etg,">")
        else
            if ats then
                handle("<",ens,":",etg," ",ats,"/>")
            else
                handle("<",ens,":",etg,"/>")
            end
        end
    else
        -- element without namespace
        if n and n > 0 then
            if ats then
                handle("<",etg," ",ats,">")
            else
                handle("<",etg,">")
            end
            for i=1,n do
                local e = edt[i]
                if type(e) == "string" then
                    handle(escaped(e)) -- option: hexify escaped entities
                else
                    serialize(e,handlers)
                end
            end
            handle("</",etg,">")
        else
            if ats then
                handle("<",etg," ",ats,"/>")
            else
                handle("<",etg,"/>")
            end
        end
    end
end
1549
-- Serializes a processing instruction node: its first content item is
-- wrapped in <? and ?> and passed to the output handler in one call.
local function verbose_pi(e,handlers)
    local handle = handlers.handle
    handle("<?",e.dt[1],"?>")
end
1553
-- Serializes a comment node: its first content item is wrapped in the
-- comment delimiters and passed to the output handler in one call.
local function verbose_comment(e,handlers)
    local handle = handlers.handle
    handle("<!--",e.dt[1],"-->")
end
1557
-- Serializes a cdata node: its first content item is passed unescaped
-- between the CDATA delimiters.
local function verbose_cdata(e,handlers)
    local handle = handlers.handle
    handle("<![CDATA[", e.dt[1],"]]>")
end
1561
-- Serializes a doctype node; no separator is added after the keyword as
-- the stored content brings its own spacing.
local function verbose_doctype(e,handlers)
    local handle = handlers.handle
    handle("<!DOCTYPE",e.dt[1],">") -- has space at end of string
end
1565
-- Serializes the root wrapper by handing its list of children to the
-- serialize function of the handler set.
local function verbose_root(e,handlers)
    local children = e.dt
    handlers.serialize(children,handlers)
end
1569
-- Serializes a text node (a plain string): the escaped string is passed to
-- the output handler.
local function verbose_text(e,handlers)
    local handle = handlers.handle
    handle(escaped(e))
end
1573
-- Serializes a list of nodes (a table without tg field): strings go to the
-- "@tx@" function of the handler set, everything else to its serialize
-- function.
local function verbose_document(e,handlers)
    local serialize = handlers.serialize
    local functions = handlers.functions
    for index=1,#e do
        local item = e[index]
        if type(item) ~= "string" then
            serialize(item,handlers)
        else
            functions["@tx@"](item,handlers)
        end
    end
end
1586
-- Serializes a node with a handler set that may need setup and teardown.
--
-- e        : element, or a list of nodes (a table without tg field); when
--            nil or false nothing happens
-- handlers : handler set with the fields initialize, finalize and functions
-- ...      : passed on to handlers.initialize
--
-- When initialize returns nil or false the serialization is aborted and that
-- value is returned; any other value (it does not have to be true) means that
-- we can proceed. With a finalize function its result is returned, otherwise
-- there is no return value.

local function serialize(e,handlers,...)
    if e then
        local initialize = handlers.initialize
        local finalize   = handlers.finalize
        local functions  = handlers.functions
        if initialize then
            local state = initialize(...)
            if not state then
                return state
            end
        end
        local etg = e.tg
        if etg then
            (functions[etg] or functions["@el@"])(e,handlers)
     -- elseif type(e) == "string" then
     --     functions["@tx@"](e,handlers)
        else
            functions["@dc@"](e,handlers) -- dc ?
        end
        if finalize then
            return finalize()
        end
    end
end
1611
-- The serializer that is used for nested nodes: no initialize or finalize,
-- just a dispatch on the tag. Nodes with a tg field go to the function that
-- is registered for that tag (or to "@el@"), other tables go to "@dc@".

local function xserialize(e,handlers)
    if not e then
        return
    end
    local functions = handlers.functions
    local etg = e.tg
    local action
    if etg then
        action = functions[etg] or functions["@el@"]
 -- elseif type(e) == "string" then
 --     action = functions["@tx@"]
    else
        action = functions["@dc@"]
    end
    action(e,handlers)
end
1625
-- registry of named handler sets
local handlers = { }

-- Creates a handler set by copying a parent set (settings.parent, or the
-- "verbose" set by default) and merging the given settings into it: table
-- values are merged key by key into the corresponding subtable, other values
-- overwrite. When settings.name is given the set is registered under that
-- name. The new set is returned.

local function newhandlers(settings)
    local parentname = settings and settings.parent or "verbose"
    local t = table.copy(handlers[parentname] or { }) -- merge
    if settings then
        for k,v in next, settings do
            if type(v) ~= "table" then
                t[k] = v
            else
                local tk = t[k]
                if not tk then
                    tk = { }
                    t[k] = tk
                end
                for kk, vv in next, v do
                    tk[kk] = vv
                end
            end
        end
        local name = settings.name
        if name then
            handlers[name] = t
        end
    end
    utilities.storage.mark(t)
    return t
end
1648
-- dummy that is installed when no function is given
local nofunction = function() end

-- Registers fnc as the serializer for nodes tagged name in the given handler
-- set; without fnc a function that does nothing is installed.

function xml.sethandlersfunction(handler,name,fnc)
    local functions = handler.functions
    functions[name] = fnc or nofunction
end
1654
-- Returns the serializer that the given handler set has registered for name
-- (nil when there is none).

function xml.gethandlersfunction(handler,name)
    local functions = handler.functions
    return functions[name]
end
1658
-- Returns the handler set that was registered under name (nil when unknown).

function xml.gethandlers(name)
    local set = handlers[name]
    return set
end
1662
-- The default handler set. It is registered as "verbose" and is the set that
-- newhandlers copies when no parent is given. Output goes to print and nested
-- nodes are handled by xserialize.

newhandlers {
    name       = "verbose",
    initialize = false, -- faster than nil and mt lookup
    finalize   = false, -- faster than nil and mt lookup
    serialize  = xserialize,
    handle     = print,
    functions  = {
        ["@dc@"]   = verbose_document, -- list of nodes
        ["@dt@"]   = verbose_doctype,
        ["@rt@"]   = verbose_root,
        ["@el@"]   = verbose_element,  -- also the fallback for unknown tags
        ["@pi@"]   = verbose_pi,
        ["@cm@"]   = verbose_comment,
        ["@cd@"]   = verbose_cdata,
        ["@tx@"]   = verbose_text,
    }
}
1680
1681
1682-- How you deal with saving data depends on your preferences. For a 40 MB database
1683-- file the timing on a 2.3 Core Duo are as follows (time in seconds):
1684--
1685-- 1.3 : load data from file to string
1686-- 6.1 : convert string into tree
1687-- 5.3 : saving in file using xmlsave
1688-- 6.8 : converting to string using xml.tostring
1689-- 3.6 : saving converted string in file
1690--
1691-- Beware, these were timing with the old routine but measurements will not be that
1692-- much different I guess.
1693
1694-- maybe this will move to lxml-xml
1695
-- the file handle that the file handler writes to
local result

-- Handler set that writes the serialized tree to a file. The file name is the
-- extra argument given to serialize; when the file cannot be opened initialize
-- returns nil, which makes serialize stop before anything is written.

local xmlfilehandler = newhandlers {
    name       = "file",
    initialize = function(name)
        result = io.open(name,"wb")
        return result
    end,
    finalize   = function()
        result:close()
        return true
    end,
    handle     = function(...)
        result:write(...)
    end,
}
1712
1713-- no checking on writeability here but not faster either
1714--
1715-- local xmlfilehandler = newhandlers {
1716--     initialize = function(name)
1717--         io.output(name,"wb")
1718--         return true
1719--     end,
1720--     finalize   = function()
1721--         io.close()
1722--         return true
1723--     end,
1724--     handle     = io.write,
1725-- }
1726
-- Saves a tree in the file with the given name. The outcome of the
-- serialization is passed on: true when the file has been written, nil when
-- the file could not be opened or when there is no root to save.

function xml.save(root,name)
    return serialize(root,xmlfilehandler,name)
end
1730
1731-- local result
1732--
1733-- local xmlstringhandler = newhandlers {
1734--     name       = "string",
1735--     initialize = function()
1736--         result = { }
1737--         return result
1738--     end,
1739--     finalize   = function()
1740--         return concat(result)
1741--     end,
1742--     handle     = function(...)
1743--         result[#result+1] = concat { ... }
1744--     end,
1745-- }
1746
-- result : reusable buffer with the collected fragments
-- r      : number of fragments collected in the current run
-- threshold : buffers that grew beyond this size are not kept for reuse
local result, r, threshold = { }, 0, 512

-- Handler set that collects the serialized tree in a buffer and returns the
-- concatenated string from finalize. The buffer is shared between runs; only
-- the first r entries are valid.

local xmlstringhandler = newhandlers {
    name       = "string",
    initialize = function()
        r = 0
        return result
    end,
    finalize   = function()
        local done = concat(result,"",1,r)
        -- the size has to be checked before the counter is reset, otherwise
        -- a large buffer would never be released
        if r > threshold then
            result = { }
        end
        r = 0
        return done
    end,
    handle     = function(...)
        for i=1,select("#",...) do
            r = r + 1
            result[r] = select(i,...)
        end
    end,
}
1770
-- Converts 'root' to a string: a string is returned as-is, nil/false gives an
-- empty string, anything else is serialized with the string handler (the
-- collecting adds about 25% overhead).

local function xmltostring(root)
    if type(root) == "string" then
        return root
    end
    if not root then
        return ""
    end
    -- if next(root) then -- next is faster than type (and >0 test)
    return serialize(root,xmlstringhandler) or ""
end
1780
-- The __tostring metamethod: always returns a string, an empty one when there
-- is no root or when the conversion gives nothing.

local function __tostring(root) -- inline
    if not root then
        return ""
    end
    local str = xmltostring(root)
    return str or ""
end
1784
-- Redefinition: builds the shared metatable 'mt' so that lookups fall back to
-- 'root' and tostring() uses the serializer defined above. Both names are
-- assigned here, not declared (presumably they are declared earlier in the
-- file -- verify).

initialize_mt = function(root)
    local meta = { }
    meta.__tostring = __tostring
    meta.__index    = root
    mt = meta
end
1788
-- Public names for the serializer machinery defined above.

xml.defaulthandlers = handlers    -- the 'handlers' value as it is at this point
xml.newhandlers     = newhandlers -- constructor used for the file and string handlers
xml.serialize       = serialize   -- generic entry: serialize(root,handlers,...)
xml.tostring        = xmltostring -- tree (or string) to string
1793
-- The next function operates on the content only and needs a handle function that
-- accepts a string.
1796
-- Walks 'e' depth first and calls handle(s) for each non-element entry (the
-- text strings). Special nodes are skipped, except for the "@rt@" root whose
-- children are visited. Without a handle nothing happens.

local function xmlstring(e,handle)
    if not handle then
        return
    end
    if e.special and e.tg ~= "@rt@" then
        return -- comment, cdata, instruction, ...
    end
    if not e.tg then
        handle(e) -- not an element, so content
        return
    end
    local children = e.dt
    if children then
        for i=1,#children do
            xmlstring(children[i],handle)
        end
    end
end

xml.string = xmlstring
1813
1814-- A few helpers:
1815
1816--~ xmlsetproperty(root,"settings",settings)
1817
-- Returns the first 'settings' field found when going from 'e' upwards via
-- the __p__ (parent) links, or nil when there is none.

function xml.settings(e)
    local node = e
    while node do
        local found = node.settings
        if found then
            return found
        end
        node = node.__p__
    end
    return nil
end
1829
-- Follows the __p__ (parent) links upwards and returns the topmost node; a
-- node without parent is its own root and a nil/false argument is returned
-- unchanged.

function xml.root(e)
    local top = e
    local up  = e and e.__p__
    while up do
        top = up
        up  = up.__p__
    end
    return top
end
1840
-- Returns the parent of a node, that is, its __p__ field (nil when unset).

function xml.parent(root)
    local parent = root.__p__
    return parent
end
1844
-- Returns the entry dt[ri] when 'root' has an 'ri' index and that entry
-- exists, else 'root' itself (not ok yet).

function xml.body(root)
    local ri = root.ri
    if ri then
        local body = root.dt[ri]
        if body then
            return body
        end
    end
    return root
end
1848
-- Returns the qualified name of a node: "tg" when there is no namespace and
-- "ns:tg" otherwise. A missing root gives an empty string. A missing 'ns' is
-- treated like an empty one and a missing 'tg' like an empty tag, so that
-- a node without these fields (for instance a text string) gives a string
-- instead of a concatenation error.

function xml.name(root)
    if not root then
        return ""
    end
    local ns = root.ns
    local tg = root.tg or ""
    if not ns or ns == "" then
        return tg
    else
        return ns .. ":" .. tg
    end
end
1861
-- The next helper erases an element but keeps the table as it is, and since empty
-- strings are (effectively) not serialized this does no harm. Copying the table
-- would take more time.
1865
-- Erases entries of the content table 'dt' in place by replacing them with an
-- empty string: only entry 'k' when a key is given, else every entry 1..#dt.
-- The table itself (and its length) is kept. A nil 'dt' is ignored.

function xml.erase(dt,k)
    if dt then
        if k then
            dt[k] = ""
        else
            -- each slot gets the same empty string as the single entry case
            -- (the loop used to overwrite dt[1] with a table over and over)
            for i=1,#dt do
                dt[i] = ""
            end
        end
    end
end
1875
1876-- The next helper assigns a tree (or string). Usage:
1877--
1878--   dt[k] = xml.assign(root) or xml.assign(dt,k,root)
1879
-- Two ways of calling:
--
--   xml.assign(dt,k,root) : stores the body of 'root' (or 'root' itself when
--                           it is not a table) in dt[k] and returns it
--   xml.assign(root)      : only returns the body of 'root'
--
-- In the one argument form the tree arrives in the first parameter, so when
-- no third argument is given we fall back to that one (before this, that call
-- ended up indexing nil).

function xml.assign(dt,k,root)
    if dt and k then
        dt[k] = type(root) == "table" and xml.body(root) or root
        return dt[k]
    else
        local tree = root
        if tree == nil then
            tree = dt
        end
        if type(tree) == "table" then
            return xml.body(tree)
        else
            return tree -- a string (or nothing) has no body to pick
        end
    end
end
1888
-- The next helper replaces the content of an element by a single cdata node that
-- holds the serialized content (optionally wrapped in an element). Usage:
1890--
1891--   xml.tocdata(e)
1892--   xml.tocdata(e,"error")
1893
-- Replaces the content of 'e' by one "@cd@" node that holds the serialized
-- old content. With a 'wrapper' the text is first put between <wrapper> and
-- </wrapper>. The new node gets the metatable of 'e' and 'e' as parent.
-- NOTE(review): a non-table 'e' is accepted when the text is computed, but
-- the final assignment to e.dt still indexes 'e'.

function xml.tocdata(e,wrapper) -- a few more in the aux module
    local content
    if type(e) == "table" then
        content = xmltostring(e.dt) or e
    else
        content = e or ""
    end
    if wrapper then
        content = formatters["<%s>%s</%s>"](wrapper,content,wrapper)
    end
    local cdata = {
        special = true,
        ns      = "",
        tg      = "@cd@",
        at      = { },
        rn      = "",
        dt      = { content },
        __p__   = e,
    }
    setmetatable(cdata,getmetatable(e))
    e.dt = { cdata }
end
1903
-- Marks the xml declaration of a tree as standalone. Only trees with an 'ri'
-- field are looked at. The first "@pi@" node whose text matches
-- "xml.*version=" is changed and then we quit:
--
--   * without a standalone attribute, " standalone='yes'" is appended
--   * with a quoted standalone attribute, its value is set to yes, so calling
--     this twice (or on standalone='no') does not give a duplicate attribute
--
-- The (same) root is returned.

function xml.makestandalone(root)
    if root.ri then
        local dt = root.dt
        for k=1,#dt do
            local v = dt[k]
            if type(v) == "table" and v.special and v.tg == "@pi@" then
                local txt = v.dt[1]
                -- an instruction without string payload cannot be the declaration
                if type(txt) == "string" and find(txt,"xml.*version=") then
                    if find(txt,"standalone%s*=") then
                        -- %2 in the pattern matches the opening quote again
                        v.dt[1] = (gsub(txt,"(standalone%s*=%s*)([\"'])[^\"']*%2","%1%2yes%2"))
                    else
                        v.dt[1] = txt .. " standalone='yes'"
                    end
                    break
                end
            end
        end
    end
    return root
end
1920
-- Classifies the content of an element:
--
--   "empty"   : no element, no content table, or an empty one
--   "mixed"   : more than one entry
--   otherwise : depends on the single entry: "cdata", "comment", "instruction"
--               or "declaration" for the matching special node, "text" for a
--               string and "element" for anything else

do

    local specialkinds = {
        ["@cd@"] = "cdata",
        ["@cm@"] = "comment",
        ["@pi@"] = "instruction",
        ["@dt@"] = "declaration",
    }

    function xml.kind(e)
        local dt = e and e.dt
        if not dt then
            return "empty"
        end
        local n = #dt
        if n == 1 then
            local d = dt[1]
            if d.special then
                -- unknown specials end up as element
                return specialkinds[d.tg] or "element"
            elseif type(d) == "string" then
                return "text"
            else
                return "element"
            end
        elseif n > 0 then
            return "mixed"
        else
            return "empty"
        end
    end

end

1948