s-xml-analyzers.lua /size: 9 Kb    last modification: 2025-02-21 11:03
-- Register this module (version and credits) in the global 'modules' table,
-- creating that table first when it does not exist yet.
modules = modules or { }

modules ['s-xml-analyzers'] = {
    version   = 1.001,
    comment   = "companion to s-xml-analyzers.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}
8
-- Make sure the namespace that holds the public functions of this module
-- exists; tables that are already present are kept.
if not moduledata.xml then
    moduledata.xml = { }
end
if not moduledata.xml.analyzers then
    moduledata.xml.analyzers = { }
end
11
-- Local aliases for the globals and library functions used below.

local next, type = next, type
local utfvalues = string.utfvalues
local formatters = string.formatters
local setmetatableindex = table.setmetatableindex
local context = context
local ctxescaped = context.ctxescaped

-- One alias per line: in the former seven-way assignment the right hand side
-- was shifted, so FL, LL, SL and TB were bound to the wrong commands.
local NC = context.NC
local NR = context.NR
local HL = context.HL
local FL = context.FL
local LL = context.LL
local SL = context.SL
local TB = context.TB

local sortedhash, sortedkeys, concat, sequenced = table.sortedhash, table.sortedkeys, table.concat, table.sequenced

local chardata = characters.data
22
-- Shared state: 'analyze' replaces these tables on each new run and the
-- reporting functions below read them.
local tags, char, attr, ents = { }, { }, { }, { }

-- Cache key of the last analysis (the sorted file names joined by "|").
local name
28
-- Gathers statistics for one file (a string) or several files (a list of
-- strings) and stores them in the file level tables:
--
--   tags : per element name a record { n, attributes, children }
--   char : per character code the number of occurrences in text content
--   attr : per character code the number of occurrences in attribute values
--   ents : per entity name a counter
--
-- The result is cached under the sorted file names joined by "|", so a second
-- call for the same set of files returns immediately. Beware: a list passed
-- by the caller is sorted in place.
local function analyze(filename)

    if type(filename) == "string" then
        filename = { filename }
    end

    table.sort(filename)

    local hash = concat(filename,"|")

    if hash == name then
        return
    end

    -- The cache key is only set after a complete pass (see the end of this
    -- function), so a run that fails halfway is redone on the next call
    -- instead of being taken for a finished analysis.
    name = nil
    tags = { }
    char = { }
    attr = { }
    ents = { }

    -- index handler: a new attribute name gets its own table of value counters
    local function newattribute(t,k)
        local v = setmetatableindex("number")
        t[k] = v
        return v
    end

    -- index handler: a new tag gets a fresh record; its 'children' table
    -- creates records of the same kind on demand
    local function newelement(t,k)
        local v = {
            n          = 0,
            attributes = setmetatableindex(newattribute),
            children   = setmetatableindex(newelement),
        }
        t[k] = v
        return v
    end

    setmetatableindex(tags,newelement)

    setmetatableindex(ents,"number")
    setmetatableindex(char,"number")

    -- A character first seen in an attribute value also gets a key in 'char',
    -- because the characters report loops over 'char' only.
    setmetatableindex(attr,function(t,k)
        char[k] = char[k] or 0
        t[k] = 0
        return 0
    end)

    -- Walks the tree below 'e'; 'parent' is the tag of the enclosing element
    -- or nil when there is none.
    local function collect(e,parent)
        local dt = e.dt
        if e.special then
            -- A special node has no tag that is counted here, so its element
            -- children are reported under the enclosing parent. This used to
            -- pass 'tg', which is not declared in this branch and therefore
            -- was a lookup of an undefined global.
            if dt then
                for i=1,#dt do
                    local d = dt[i]
                    if type(d) == "table" then
                        collect(d,parent)
                    end
                end
            end
        else
            local at = e.at
            local tg = e.tg
            local tag = tags[tg]
            tag.n = tag.n + 1
            -- the same element is also counted as a child of its parent
            local child    = parent and tags[parent].children[tg]
            local childatt = child and child.attributes
            if child then
                child.n = child.n + 1
            end
            if at then
                local attributes = tag.attributes
                for k, v in next, at do
                    local values = attributes[k]
                    values[v] = values[v] + 1
                    if childatt then
                        local childvalues = childatt[k]
                        childvalues[v] = childvalues[v] + 1
                    end
                    for s in utfvalues(v) do
                        attr[s] = attr[s] + 1
                    end
                end
            end
            if dt then
                for i=1,#dt do
                    local d = dt[i]
                    if type(d) == "table" then
                        collect(d,tg)
                    else
                        for s in utfvalues(d) do
                            char[s] = char[s] + 1
                        end
                    end
                end
            end
        end
    end

    for i=1,#filename do
        local file = filename[i]
        local root = xml.load(file)
        --
        logs.report("xml analyze","loaded: %s",file)
        --
        collect(root)
        --
        -- only the keys are used: each name adds one per loaded file
        local names = root.statistics.entities.names
        for n in next, names do
            ents[n] = ents[n] + 1
        end
    end

    -- Reset the index handlers of the four top level tables now that the
    -- data is complete (presumably this stops lookups of unknown keys from
    -- creating entries; confirm against table.setmetatableindex).
    setmetatableindex(tags,nil)
    setmetatableindex(char,nil)
    setmetatableindex(attr,nil)
    setmetatableindex(ents,nil)

    name = hash

end
146
-- Attributes with more distinct values than this are summarized by a count
-- instead of listing every value.
moduledata.xml.analyzers.maxnofattributes = 100

-- Typesets a tabulate with, for every element found in the given file(s), its
-- frequency, its children (with counts) and its attributes (with values).
function moduledata.xml.analyzers.structure(filename)
    analyze(filename)
    local limit = tonumber(moduledata.xml.analyzers.maxnofattributes) or 100
    local first = true
    context.starttabulate { "|l|pA{nothyphenated,flushleft,verytolerant,stretch,broad}|" }
    for element, info in sortedhash(tags) do
        -- a separator between elements, but not before the first one
        if first then
            first = false
        else
            context.TB()
        end
        local kids = info.children
        local atts = info.attributes
        NC() context.bold("element")
        NC() context.darkred(element)
        NC() NR()
        NC() context.bold("frequency")
        NC() context(info.n)
        NC() NR()
        if next(kids) then
            -- reduce each child record to its counter
            local counts = { }
            for child, entry in next, kids do
                counts[child] = entry.n
            end
            NC() context.bold("children")
            NC() context.puretext(sequenced(counts))
            NC() NR()
        end
        if next(atts) then
            NC() context.bold("attributes")
            NC() context.puretext.darkgreen(concat(sortedkeys(atts)," "))
            NC() NR()
            for key, values in sortedhash(atts) do
                local nofvalues = table.count(values)
                -- identifiers and attributes with many values only get a count
                local summarize = key == "id" or key == "xml:id" or nofvalues > limit
                NC() context("@%s",key)
                NC()
                if summarize then
                    context("%s different values",nofvalues)
                else
                    context.puretext(sequenced(values))
                end
                NC() NR()
            end
        end
    end
    context.stoptabulate()
end
189
-- Typesets a tabulate with one row per character found in the given file(s):
-- the count in text, the count in attribute values (empty when there is
-- none), the code point, the character itself and its description.
function moduledata.xml.analyzers.characters(filename)
    analyze(filename)
    context.starttabulate { "|r|r|l|c|l|" }
    for code, intext in sortedhash(char) do
        local inattributes = attr[code] or ""
        NC() context.color({ "darkred" }, intext)
        NC() context.color({ "darkgreen" }, inattributes)
        NC() context("%U",code)
        NC() context.char(code)
        NC() context("%s",chardata[code].description)
        NC() NR()
    end
    context.stoptabulate()
end
203
-- Typesets a two column tabulate with the entity names collected from the
-- given file(s) and the counter that goes with each name.
function moduledata.xml.analyzers.entities(filename)
    analyze(filename)
    context.starttabulate { "|l|r|" }
    for entity, count in sortedhash(ents) do
        NC() context(entity)
        NC() context(count)
        NC() NR()
    end
    context.stoptabulate()
end
214
-- Formatters used by 'allsetups'. Suffixes: _s is the setup name that goes in
-- the preset list, _n a setup body for an element without attributes and _a a
-- setup body that starts with a comment line listing the attribute names
-- (the "% t" directive receives the list of names; presumably it joins them
-- with spaces, confirm against the string.formatters documentation).

local f_parent_s = formatters["xml:%s"]
local f_parent_n = formatters["\\startxmlsetups xml:%s\n  \\xmlflush{#1}\n\\stopxmlsetups"]
local f_parent_a = formatters["\\startxmlsetups xml:%s\n  %% @ % t\n  \\xmlflush{#1}\n\\stopxmlsetups"]
local f_child_s  = formatters["xml:%s:%s"]
local f_child_n  = formatters["\\startxmlsetups xml:%s:%s\n  \\xmlflush{#1}\n\\stopxmlsetups"]
local f_child_a  = formatters["\\startxmlsetups xml:%s:%s\n  %% @ % t\n  \\xmlflush{#1}\n\\stopxmlsetups"]

-- The saved file; arguments: file name(s), the preset list, the setup bodies.
-- This is a long bracket string, so a backslash is taken literally and the
-- commands are written with a single one (\xmlfilter used to have two, which
-- ended up as a double backslash in the generated file).

local f_template = formatters [ [[
%% file: %s

%% Beware, these are all (first level) setups. If you have a complex document
%% it often makes sense to use \xmlfilter or similar local filter options.

%% presets

\startxmlsetups xml:presets:all
  \xmlsetsetup {#1} {
    %s
  }
\stopxmlsetups

%% setups

\xmlregistersetup{xml:presets:all}

\starttext
    \xmlprocessfile{main}{somefile.xml}{}
\stoptext

%s
]] ]
246
-- Analyzes the given file(s) and saves a template with one xml setup per
-- element and one per parent/child combination, plus a preset list that
-- names them all. The result goes to 'usedname', which defaults to
-- "xml-analyze-template.tex".
function moduledata.xml.analyzers.allsetups(filename,usedname)
    analyze(filename)
    local bodies = { } -- the \startxmlsetups ... \stopxmlsetups blocks
    local names  = { } -- the setup names for the preset list
    for parent, info in sortedhash(tags) do
        local kids = info.children
        local atts = info.attributes
        if next(atts) then
            bodies[#bodies+1] = f_parent_a(parent,sortedkeys(atts))
        else
            bodies[#bodies+1] = f_parent_n(parent)
        end
        names[#names+1] = f_parent_s(parent)
        if next(kids) then
            for child, entry in sortedhash(kids) do
                local childatts = entry.attributes
                if next(childatts) then
                    bodies[#bodies+1] = f_child_a(parent,child,sortedkeys(childatts))
                else
                    bodies[#bodies+1] = f_child_n(parent,child)
                end
                names[#names+1] = f_child_s(parent,child)
            end
        end
    end
    table.sort(names)
    -- a list of files is shown as one line in the header comment
    local label = filename
    if type(label) == "table" then
        label = concat(label," | ")
    end
    local target = usedname or "xml-analyze-template.tex"
    io.savedata(target,f_template(label,concat(names,"|\n    "),concat(bodies,"\n\n")))
    logs.report("xml analyze","presets saved in: %s",target)
end
283
284-- example:
285
286-- local t = { }
287-- local x = xml.load("music-collection.xml")
288-- for c in xml.collected(x,"//*") do
289--     if not c.special and not t[c.tg] then
290--         t[c.tg] = true
291--     end
292-- end
293-- inspect(table.sortedkeys(t))
294
295-- xml.finalizers.taglist = function(collected)
296--     local t = { }
297--     for i=1,#collected do
298--         local c = collected[i]
299--         if not c.special then
300--             local tg = c.tg
301--             if tg and not t[tg] then
302--                 t[tg] = true
303--             end
304--         end
305--     end
306--     return t
307-- end
308-- local x = xml.load("music-collection.xml")
309-- inspect(table.sortedkeys(xml.applylpath(x,"//*/taglist()")))
310
311-- xml.finalizers.taglist = function(collected,parenttoo)
312--     local t = { }
313--     for i=1,#collected do
314--         local c = collected[i]
315--         if not c.special then
316--             local tg = c.tg
317--             if tg and not t[tg] then
318--                 t[tg] = true
319--             end
320--             if parenttoo then
321--                 local p = c.__p__
322--                 if p and not p.special then
323--                     local tg = p.tg .. ":" .. tg
324--                     if tg and not t[tg] then
325--                         t[tg] = true
326--                     end
327--                 end
328--             end
329--         end
330--     end
331--     return t
332-- end
333
334-- local x = xml.load("music-collection.xml")
335-- inspect(table.sortedkeys(xml.applylpath(x,"//*/taglist()")))
336
337-- local x = xml.load("music-collection.xml")
338-- inspect(table.sortedkeys(xml.applylpath(x,"//*/taglist(true)")))
339