-- s-xml-analyzers.lua /size: 9909 b    last modification: 2020-07-01 14:35
if not modules then modules = { } end modules ['s-xml-analyzers'] = {
    version   = 1.001,
    comment   = "companion to s-xml-analyzers.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}

-- The public functions of this module live in moduledata.xml.analyzers; the
-- namespace is created on demand so that already registered data is kept.

moduledata.xml           = moduledata.xml           or { }
moduledata.xml.analyzers = moduledata.xml.analyzers or { }

-- Localized helpers. The names on the left of the tabulate shortcuts now line
-- up with the values on the right (previously FL, LL, SL and TB were bound to
-- context.TB, context.FL, context.LL and context.SL respectively).

local next, type = next, type
local utfvalues = string.utfvalues
local formatters = string.formatters
local setmetatableindex = table.setmetatableindex
local context = context
local NC, NR, HL, FL, LL, SL, TB = context.NC, context.NR, context.HL, context.FL, context.LL, context.SL, context.TB
local sortedhash, sortedkeys, concat, sequenced = table.sortedhash, table.sortedkeys, table.concat, table.sequenced

local chardata = characters.data

-- Shared state, (re)filled by analyze() and read by the reporting functions:

local tags = { }  -- per element tag: counter, attributes and direct children
local char = { }  -- per code point: number of occurrences in text content
local attr = { }  -- per code point: number of occurrences in attribute values
local ents = { }  -- per entity name: counter, incremented per loaded file
local name = nil  -- key ("|" joined file names) of the last analyzed set
27
-- Loads the given file(s) and gathers statistics in the file level tables
-- tags, char, attr and ents. The argument is either a single file name or a
-- table with file names. Beware: a table argument is sorted in place. When
-- the (sorted) set of names equals the previously analyzed one nothing is
-- done, so the reporting functions can call this repeatedly at low cost.
--
-- After a run:
--
--   tags[tg]                 = { n = count, attributes = ..., children = ... }
--   tags[tg].attributes[k]   = { [value] = count }
--   tags[tg].children[child] = same layout, counted per parent/child pair
--   char[codepoint]          = occurrences in text content
--   attr[codepoint]          = occurrences in attribute values
--   ents[entityname]         = incremented once per file that lists the name

local function analyze(filename)

    if type(filename) == "string" then
        filename = { filename }
    end

    table.sort(filename)

    local hash = concat(filename,"|")

    if hash == name then
        -- same set of files as last time: keep the collected data
        return
    end

    name = hash
    tags = { }
    char = { }
    attr = { }
    ents = { }

    -- auto-create a value counter for an attribute name

    local function att(t,k)
        local v = setmetatableindex("number")
        t[k] = v
        return v
    end

    -- auto-create the record for an element tag

    local function add(t,k)
        local v = {
            n          = 0,
            attributes = setmetatableindex(att),
            children   = setmetatableindex(add),
        }
        t[k] = v
        return v
    end

    setmetatableindex(tags,add)

    setmetatableindex(ents,"number")
    setmetatableindex(char,"number")

    -- accessing char[k] here makes sure that a code point that only shows up
    -- in attribute values also gets an entry in char

    setmetatableindex(attr,function(t,k)
        char[k] = char[k] or 0
        t[k] = 0
        return 0
    end)

    -- Walks the tree below e; parent is the tag of the enclosing regular
    -- element (nil at the top).

    local function collect(e,parent)
        local dt = e.dt
        if e.special then
            if dt then
                for i=1,#dt do
                    local d = dt[i]
                    if type(d) == "table" then
                        -- A special node is not counted itself, so the
                        -- current parent is handed down unchanged. This used
                        -- to pass 'tg', which is not defined in this branch
                        -- and therefore was an accidental (nil) global.
                        collect(d,parent)
                    end
                end
            end
        else
            local at = e.at
            local tg = e.tg
            local tag = tags[tg]
            tag.n = tag.n + 1
            local children = parent and tags[parent].children[tg]
            local childatt = children and children.attributes
            if children then
                children.n = children.n + 1
            end
            if at then
                local attributes = tag.attributes
                for k, v in next, at do
                    local a = attributes[k]
                    a[v] = a[v] + 1
                    if childatt then
                        local a = childatt[k]
                        a[v] = a[v] + 1
                    end
                    for s in utfvalues(v) do
                        attr[s] = attr[s] + 1
                    end
                end
            end
            if dt then
                for i=1,#dt do
                    local d = dt[i]
                    if type(d) == "table" then
                        collect(d,tg)
                    else
                        for s in utfvalues(d) do
                            char[s] = char[s] + 1
                        end
                    end
                end
            end
        end
    end

    for i=1,#filename do
        local name = filename[i] -- shadows the file level cache key on purpose
        local root = xml.load(name)
        --
        logs.report("xml analyze","loaded: %s",name)
        --
        collect(root)
        --
        local names = root.statistics.entities.names
        for n in next, names  do
            ents[n] = ents[n] + 1
        end
    end

    -- from now on lookups in the four main tables should no longer create
    -- entries

    setmetatableindex(tags,nil)
    setmetatableindex(char,nil)
    setmetatableindex(attr,nil)
    setmetatableindex(ents,nil)

end
145
moduledata.xml.analyzers.maxnofattributes = 100

-- Typesets a tabulate that lists, for each element tag found in the given
-- file(s): its frequency, its direct children (with counts), its attribute
-- names and the values seen per attribute. For "id" and "xml:id", and for
-- attributes with more than maxnofattributes different values, only the
-- number of different values is shown.

function moduledata.xml.analyzers.structure(filename)
    analyze(filename)
    local first = true
    local limit = tonumber(moduledata.xml.analyzers.maxnofattributes) or 100
    context.starttabulate { "|l|pA{nothyphenated,flushleft,verytolerant,stretch,broad}|" }
    for element, entry in sortedhash(tags) do
        -- separate the elements, but don't start with a separator
        if first then
            first = false
        else
            context.TB()
        end
        local nested = entry.children
        local found  = entry.attributes
        NC() context.bold("element") NC() context.darkred(element) NC() NR()
        NC() context.bold("frequency") NC() context(entry.n) NC() NR()
        if next(nested) then
            local counts = { }
            for child, record in next, nested do
                counts[child] = record.n
            end
            NC() context.bold("children") NC() context.puretext(sequenced(counts)) NC() NR()
        end
        if next(found) then
            NC() context.bold("attributes") NC() context.puretext.darkgreen(concat(sortedkeys(found)," ")) NC() NR()
            for key, values in sortedhash(found) do
                local total     = table.count(values)
                local summarize = key == "id" or key == "xml:id" or total > limit
                NC() context("@%s",key) NC()
                if summarize then
                    context("%s different values",total)
                else
                    context.puretext(sequenced(values))
                end
                NC() NR()
            end
        end
    end
    context.stoptabulate()
end
184
-- Typesets a tabulate with one row per code point found in the given file(s):
-- the count in text content, the count in attribute values, the code point
-- (formatted with %U), the character itself and its description as found in
-- characters.data.

function moduledata.xml.analyzers.characters(filename)
    analyze(filename)
    context.starttabulate { "|r|r|l|c|l|" }
    for c, n in table.sortedhash(char) do
        -- guard against code points that have no entry in the character
        -- database, in which case indexing .description would raise an error
        local data        = chardata[c]
        local description = data and data.description or ""
        NC() context.darkred("%s",n)
        NC() context.darkgreen("%s",attr[c])
        NC() context("%U",c)
        NC() context.char(c)
        NC() context("%s",description)
        NC() NR()
    end
    context.stoptabulate()
end
198
-- Typesets a two column tabulate with the entity names collected from the
-- given file(s) and the counter that analyze() keeps for each of them.

function moduledata.xml.analyzers.entities(filename)
    analyze(filename)
    context.starttabulate { "|l|r|" }
    for entity, count in sortedhash(ents) do
        NC()
        context(entity)
        NC()
        context(count)
        NC()
        NR()
    end
    context.stoptabulate()
end
209
-- Formatters used by allsetups. The *_s variants produce a setup name, the
-- *_n variants a setup without attribute comment and the *_a variants a setup
-- with a comment line that lists the attribute names ("% t" concatenates the
-- passed table). The parent variants take a tag, the child variants take a
-- parent tag and a child tag.

local f_parent_s = formatters["xml:%s"]
local f_parent_n = formatters["\\startxmlsetups xml:%s\n  \\xmlflush{#1}\n\\stopxmlsetups"]
local f_parent_a = formatters["\\startxmlsetups xml:%s\n  %% @ % t\n  \\xmlflush{#1}\n\\stopxmlsetups"]
local f_child_s  = formatters["xml:%s:%s"]
local f_child_n  = formatters["\\startxmlsetups xml:%s:%s\n  \\xmlflush{#1}\n\\stopxmlsetups"]
local f_child_a  = formatters["\\startxmlsetups xml:%s:%s\n  %% @ % t\n  \\xmlflush{#1}\n\\stopxmlsetups"]

-- The skeleton of the generated file. It takes three strings: the analyzed
-- file name(s), the list of setup names and the setup definitions. The
-- presets block is now opened with \startxmlsetups so that it matches the
-- closing \stopxmlsetups (it used to say \startxmlsetup).

local f_template = formatters [ [[
%% file: %s

%% Beware, these are all (first level) setups. If you have a complex document
%% it often makes sense to use \\xmlfilter or similar local filter options.

%% presets

\startxmlsetups xml:presets:all
  \xmlsetsetups {#1} {
    %s
  }
\stopxmlsetups

%% setups

\xmlregistersetup{xml:presets:all}

\starttext
    \xmlprocessfile{main}{somefile.xml}{}
\stoptext

%s
]] ]
241
-- Analyzes the given file(s) and writes a template file with an (empty) xml
-- setup for each element tag and for each parent/child combination, plus a
-- presets block that lists all those setups.
--
--   filename : a file name or a table with file names (see analyze)
--   usedname : name of the file to write; defaults to
--              "xml-analyze-template.tex"

function moduledata.xml.analyzers.allsetups(filename,usedname)
    analyze(filename)
    local result = { }
    local setups = { }
    for name, data in table.sortedhash(tags) do
        local children   = data.children
        local attributes = data.attributes
        if next(attributes) then
            result[#result+1] = f_parent_a(name,sortedkeys(attributes))
        else
            result[#result+1] = f_parent_n(name)
        end
        setups[#setups+1] = f_parent_s(name)
        if next(children) then
            for k, v in sortedhash(children) do
                -- the attributes seen for this specific parent/child pair
                local attributes = v.attributes
                if next(attributes) then
                    result[#result+1] = f_child_a(name,k,sortedkeys(attributes))
                else
                    result[#result+1] = f_child_n(name,k)
                end
                setups[#setups+1] = f_child_s(name,k)
            end
        end
    end
    table.sort(setups)
    --
    if type(filename) == "table" then
        filename = concat(filename," | ")
    end
    --
    usedname = usedname or "xml-analyze-template.tex"
    --
    local saved = io.savedata(usedname,f_template(filename,concat(setups,"|\n    "),concat(result,"\n\n")))
    -- Only an explicit false counts as a failure, so that a saver that returns
    -- nothing keeps being reported as before.
    if saved == false then
        logs.report("xml analyze","unable to save presets in: %s",usedname)
    else
        logs.report("xml analyze","presets saved in: %s",usedname)
    end
end
278
279-- example:
280
281-- local t = { }
282-- local x = xml.load("music-collection.xml")
283-- for c in xml.collected(x,"//*") do
284--     if not c.special and not t[c.tg] then
285--         t[c.tg] = true
286--     end
287-- end
288-- inspect(table.sortedkeys(t))
289
290-- xml.finalizers.taglist = function(collected)
291--     local t = { }
292--     for i=1,#collected do
293--         local c = collected[i]
294--         if not c.special then
295--             local tg = c.tg
296--             if tg and not t[tg] then
297--                 t[tg] = true
298--             end
299--         end
300--     end
301--     return t
302-- end
303-- local x = xml.load("music-collection.xml")
304-- inspect(table.sortedkeys(xml.applylpath(x,"//*/taglist()")))
305
306-- xml.finalizers.taglist = function(collected,parenttoo)
307--     local t = { }
308--     for i=1,#collected do
309--         local c = collected[i]
310--         if not c.special then
311--             local tg = c.tg
312--             if tg and not t[tg] then
313--                 t[tg] = true
314--             end
315--             if parenttoo then
316--                 local p = c.__p__
317--                 if p and not p.special then
318--                     local tg = p.tg .. ":" .. tg
319--                     if tg and not t[tg] then
320--                         t[tg] = true
321--                     end
322--                 end
323--             end
324--         end
325--     end
326--     return t
327-- end
328
329-- local x = xml.load("music-collection.xml")
330-- inspect(table.sortedkeys(xml.applylpath(x,"//*/taglist()")))
331
332-- local x = xml.load("music-collection.xml")
333-- inspect(table.sortedkeys(xml.applylpath(x,"//*/taglist(true)")))
334