strc-tag.lua /size: 20 Kb    last modification: 2021-10-28 13:50
1if not modules then modules = { } end modules ['strc-tag'] = {
2    version   = 1.001,
3    comment   = "companion to strc-tag.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9-- This is rather experimental code. Tagging happens on the fly and there are two analysers
10-- involved: the pdf backend tagger and the exporter. They share data but there are subtle
11-- differences. Each tag carries a specification and these can be accessed by attribute (the
12-- end of the chain tag) or by so called fullname which is a tagname combined with a number.
13
14local type, next = type, next
15local insert, remove, unpack, concat, merge = table.insert, table.remove, table.unpack, table.concat, table.merge
16local find, topattern, format = string.find, string.topattern, string.format
17local lpegmatch, P, S, C, Cc = lpeg.match, lpeg.P, lpeg.S, lpeg.C, lpeg.Cc
18local allocate = utilities.storage.allocate
19local settings_to_hash = utilities.parsers.settings_to_hash
20local setmetatableindex = table.setmetatableindex
21
22local trace_tags = false  trackers.register("structures.tags", function(v) trace_tags = v end)
23
24local report_tags = logs.reporter("structure","tags")
25
26local attributes      = attributes
27local structures      = structures
28local implement       = interfaces.implement
29
30local a_tagged        = attributes.private('tagged')
31
32local unsetvalue      = attributes.unsetvalue
33local codeinjections  = backends.codeinjections
34
35local texgetattribute = tex.getattribute
36local texsetattribute = tex.setattribute
37
-- Module state. Everything below is shared by the functions in this file; the
-- four tables assigned to the tags namespace at the end are also visible to
-- other modules.

local taglist         = allocate() -- access by attribute: attribute number -> specification
local specifications  = allocate() -- access by fulltag: "tag>number" -> specification
local labels          = allocate() -- tag -> tag, filled by tags.start
local stack           = { }        -- per open level: the attribute number
local chain           = { }        -- per open level: the full tag ("tag>number")
local ids             = { }        -- tag -> number of times that tag has been started
local enabled         = false      -- set once codeinjections.enabletags() has been called
local tagcontext      = { }        -- tag -> most recently started full tag with that name
local tagpatterns     = { }        -- exported as tags.patterns
local lasttags        = { }        -- tag -> index of the most recently started instance
local stacksize       = 0          -- number of currently open levels in stack/chain
local metadata        = nil -- applied to the next element
local documentdata    = { }        -- metadata collected for the document element
local extradata       = false      -- false until a serializer is registered, then name -> function

local tags            = structures.tags
tags.taglist          = taglist -- can best be hidden
tags.labels           = labels
tags.patterns         = tagpatterns
tags.specifications   = specifications
58
-- Returns the attribute number of the innermost open element, or nothing
-- when no element is open.
function tags.current()
    if stacksize <= 0 then
        return
    end
    return stack[stacksize] -- maybe copy or proxy
end
64
65-- Tags are internally stored as:
66--
67-- tag>number tag>number tag>number
68
-- Splits a full tag at its first ">" into two captures: the part before it
-- (the tag name) and everything after it (the number).
local p_splitter     = C((1 - P(">"))^1) * P(">") * C(P(1)^1)
tagpatterns.splitter = p_splitter
71
-- Default properties per element, keyed by plain tag name (not by full tag).
-- Every entry carries a 'pdf' name and a 'nature' that is one of "display",
-- "mixed" or "inline"; sectioncaption additionally sets 'record'. Entries can
-- be changed or added at runtime with tags.setproperty and tags.setaspect.
-- NOTE(review): how 'pdf', 'nature' and 'record' are consumed is decided by
-- the backend tagger and the exporter, which are not part of this file.
local properties     = allocate { -- todo: more "record = true" to improve formatting

    document              = { pdf = "Div",        nature = "display" },

    division              = { pdf = "Div",        nature = "display" },
    paragraph             = { pdf = "P",          nature = "mixed"   },
    p                     = { pdf = "P",          nature = "mixed"   },
    construct             = { pdf = "Span",       nature = "inline"  },
    highlight             = { pdf = "Span",       nature = "inline"  },

    section               = { pdf = "Sect",       nature = "display" },
    sectioncaption        = { pdf = "Div",        nature = "display", record = true },
    sectiontitle          = { pdf = "H",          nature = "mixed"   },
    sectionnumber         = { pdf = "H",          nature = "mixed"   },
    sectioncontent        = { pdf = "Div",        nature = "display" },

    itemgroup             = { pdf = "L",          nature = "display" },
    item                  = { pdf = "LI",         nature = "display" },
    itemtag               = { pdf = "Lbl",        nature = "mixed"   },
    itemcontent           = { pdf = "LBody",      nature = "mixed"   },
    itemhead              = { pdf = "Div",        nature = "display" },
    itembody              = { pdf = "Div",        nature = "display" },

    description           = { pdf = "Div",        nature = "display" },
    descriptiontag        = { pdf = "Div",        nature = "mixed"   },
    descriptioncontent    = { pdf = "Div",        nature = "mixed"   },
    descriptionsymbol     = { pdf = "Span",       nature = "inline"  }, -- note reference

    verbatimblock         = { pdf = "Code",       nature = "display" },
    verbatimlines         = { pdf = "Code",       nature = "display" },
    verbatimline          = { pdf = "Code",       nature = "mixed"   },
    verbatim              = { pdf = "Code",       nature = "inline"  },

    lines                 = { pdf = "Code",       nature = "display" },
    line                  = { pdf = "Code",       nature = "mixed"   },
    linenumber            = { pdf = "Span",       nature = "inline"   },

    synonym               = { pdf = "Span",       nature = "inline"  },
    sorting               = { pdf = "Span",       nature = "inline"  },

    register              = { pdf = "Div",        nature = "display" },
    registerlocation      = { pdf = "Span",       nature = "inline"  },
    registersection       = { pdf = "Div",        nature = "display" },
    registertag           = { pdf = "Span",       nature = "mixed"   },
    registerentries       = { pdf = "Div",        nature = "display" },
    registerentry         = { pdf = "Div",        nature = "display" },
    registercontent       = { pdf = "Span",       nature = "mixed"   },
    registersee           = { pdf = "Span",       nature = "mixed"   },
    registerpages         = { pdf = "Span",       nature = "mixed"   },
    registerpage          = { pdf = "Span",       nature = "mixed"   },
    registerseparator     = { pdf = "Span",       nature = "inline"  },
    registerpagerange     = { pdf = "Span",       nature = "mixed"   },

    table                 = { pdf = "Table",      nature = "display" },
    tablerow              = { pdf = "TR",         nature = "display" },
    tablecell             = { pdf = "TD",         nature = "mixed"   },
    tableheadcell         = { pdf = "TH",         nature = "mixed"   },
    tablehead             = { pdf = "THEAD",      nature = "display" },
    tablebody             = { pdf = "TBODY",      nature = "display" },
    tablefoot             = { pdf = "TFOOT",      nature = "display" },

    tabulate              = { pdf = "Table",      nature = "display" },
    tabulaterow           = { pdf = "TR",         nature = "display" },
    tabulatecell          = { pdf = "TD",         nature = "mixed"   },
    tabulateheadcell      = { pdf = "TH",         nature = "mixed"   },
    tabulatehead          = { pdf = "THEAD",      nature = "display" },
    tabulatebody          = { pdf = "TBODY",      nature = "display" },
    tabulatefoot          = { pdf = "TFOOT",      nature = "display" },

    list                  = { pdf = "TOC",        nature = "display" },
    listitem              = { pdf = "TOCI",       nature = "display" },
    listtag               = { pdf = "Lbl",        nature = "mixed"   },
    listcontent           = { pdf = "P",          nature = "mixed"   },
    listdata              = { pdf = "P",          nature = "mixed"   },
    listpage              = { pdf = "Reference",  nature = "mixed"   },
    listtext              = { pdf = "Span",       nature = "inline"  },

    delimitedblock        = { pdf = "BlockQuote", nature = "display" },
    delimited             = { pdf = "Quote",      nature = "inline"  },
    delimitedcontent      = { pdf = "Span",       nature = "inline"  },
    delimitedsymbol       = { pdf = "Span",       nature = "inline"  },
    subsentence           = { pdf = "Span",       nature = "inline"  },
    subsentencecontent    = { pdf = "Span",       nature = "inline"  },
    subsentencesymbol     = { pdf = "Span",       nature = "inline"  },

    label                 = { pdf = "Span",       nature = "mixed"   },
    number                = { pdf = "Span",       nature = "mixed"   },

    float                 = { pdf = "Div",        nature = "display" }, -- Figure
    floatcaption          = { pdf = "Caption",    nature = "mixed"   },
    floatlabel            = { pdf = "Span",       nature = "inline"  },
    floatnumber           = { pdf = "Span",       nature = "inline"  },
    floattext             = { pdf = "Span",       nature = "mixed"   },
    floatcontent          = { pdf = "P",          nature = "mixed"   },

    image                 = { pdf = "P",          nature = "mixed"   },
    mpgraphic             = { pdf = "P",          nature = "mixed"   },

    formulaset            = { pdf = "Div",        nature = "display" },
    formula               = { pdf = "Div",        nature = "display" }, -- Formula
    formulacaption        = { pdf = "Span",       nature = "mixed"   },
    formulalabel          = { pdf = "Span",       nature = "mixed"   },
    formulanumber         = { pdf = "Span",       nature = "mixed"   },
    formulacontent        = { pdf = "P",          nature = "display" },
    subformula            = { pdf = "Div",        nature = "display" },

    link                  = { pdf = "Link",       nature = "inline"  },
    reference             = { pdf = "Span",       nature = "inline"  },

    margintextblock       = { pdf = "Span",       nature = "inline"  },
    margintext            = { pdf = "Span",       nature = "inline"  },
    marginanchor          = { pdf = "Span",       nature = "inline"  },

    math                  = { pdf = "Div",        nature = "inline"  }, -- no display
    mn                    = { pdf = "Span",       nature = "mixed"   },
    mi                    = { pdf = "Span",       nature = "mixed"   },
    mo                    = { pdf = "Span",       nature = "mixed"   },
    ms                    = { pdf = "Span",       nature = "mixed"   },
    mrow                  = { pdf = "Span",       nature = "display" },
    msubsup               = { pdf = "Span",       nature = "display" },
    msub                  = { pdf = "Span",       nature = "display" },
    msup                  = { pdf = "Span",       nature = "display" },
    merror                = { pdf = "Span",       nature = "mixed"   },
    munderover            = { pdf = "Span",       nature = "display" },
    munder                = { pdf = "Span",       nature = "display" },
    mover                 = { pdf = "Span",       nature = "display" },
    mtext                 = { pdf = "Span",       nature = "mixed"   },
    mfrac                 = { pdf = "Span",       nature = "display" },
    mroot                 = { pdf = "Span",       nature = "display" },
    msqrt                 = { pdf = "Span",       nature = "display" },
    mfenced               = { pdf = "Span",       nature = "display" },
    maction               = { pdf = "Span",       nature = "display" },

    mstacker              = { pdf = "Span",       nature = "display" }, -- these are only internally used
    mstackertop           = { pdf = "Span",       nature = "display" }, -- these are only internally used
    mstackerbot           = { pdf = "Span",       nature = "display" }, -- these are only internally used
    mstackermid           = { pdf = "Span",       nature = "display" }, -- these are only internally used

    mtable                = { pdf = "Table",      nature = "display" }, -- might change
    mtr                   = { pdf = "TR",         nature = "display" }, -- might change
    mtd                   = { pdf = "TD",         nature = "display" }, -- might change

    ignore                = { pdf = "Span",       nature = "mixed"   }, -- used internally
    private               = { pdf = "Span",       nature = "mixed"   }, -- for users (like LS) when they need it
    metadata              = { pdf = "Div",        nature = "display" },
    metavariable          = { pdf = "Span",       nature = "mixed"   },

    mid                   = { pdf = "Span",       nature = "inline"  },
    sub                   = { pdf = "Span",       nature = "inline"  },
    sup                   = { pdf = "Span",       nature = "inline"  },
    subsup                = { pdf = "Span",       nature = "inline"  },

    combination           = { pdf = "Span",       nature = "display" },
    combinationpair       = { pdf = "Span",       nature = "display" },
    combinationcontent    = { pdf = "Span",       nature = "mixed"   },
    combinationcaption    = { pdf = "Span",       nature = "mixed"   },

    publications          = { pdf = "Div",        nature = "display" },
    publication           = { pdf = "Div",        nature = "mixed"   },
    pubfld                = { pdf = "Span",       nature = "inline"  },

    block                 = { pdf = "Div",        nature = "display"  },
    userdata              = { pdf = "Div",        nature = "display"  },

}

tags.properties = properties
239
-- Lazily filled cache: indexing with a tag name builds the search pattern for
-- "^name>" once, stores it under that name and returns it.
local patterns = setmetatableindex(function(cache,name)
    local pattern = topattern("^" .. name .. ">")
    cache[name] = pattern
    return pattern
end)
245
-- Looks up the innermost enclosing element with the given tag name, starting
-- from the chain that belongs to the current value of the 'tagged' attribute.
--
-- tag : plain tag name (without ">number")
--
-- Returns the matching full tag ("tag>number") or false when the attribute is
-- negative, unknown, or no entry in the chain matches.
function tags.locatedtag(tag)
    local current = texgetattribute(a_tagged)
    if current < 0 then
        -- enabled but not auto
        return false -- handy as bogus index
    end
    local specification = taglist[current]
    if not specification then
        return false
    end
    local nesting = specification.taglist
    local pattern = patterns[tag]
    -- walk from the innermost element outwards
    for level=#nesting,1,-1 do
        local fulltag = nesting[level]
        if find(fulltag,pattern) then
            return fulltag
        end
    end
    return false
end
265
-- Tests whether the current position (as given by the 'tagged' attribute) is
-- nested inside an element with the given tag name.
--
-- str : plain tag name
--
-- Returns true on a match and nothing otherwise.
function structures.atlocation(str)
    local specification = taglist[texgetattribute(a_tagged)]
    local nesting = specification and specification.taglist
    if not nesting then
        return
    end
    local pattern = patterns[str]
    local level = #nesting
    while level >= 1 do
        if find(nesting[level],pattern) then
            return true
        end
        level = level - 1
    end
end
280
-- Sets one property of a tag, creating the property entry for that tag when
-- it does not exist yet.
--
-- tag   : key into the properties table
-- key   : property name
-- value : new value
function tags.setproperty(tag,key,value)
    local entry = properties[tag]
    if not entry then
        properties[tag] = { [key] = value }
        return
    end
    entry[key] = value
end
289
-- Sets one property for the innermost open element. The properties table is
-- indexed here with the entry taken from the chain, which is the full tag
-- ("tag>number"), so the value ends up on that specific instance. Nothing
-- happens when no element is open.
--
-- key   : property name
-- value : new value
function tags.setaspect(key,value)
    local fulltag = chain[stacksize]
    if not fulltag then
        return
    end
    local entry = properties[fulltag]
    if not entry then
        properties[fulltag] = { [key] = value }
        return
    end
    entry[key] = value
end
301
-- Registers metadata given as a "key=value,..." settings string.
--
-- When we are nested deeper than the outermost element the data is kept
-- pending and attached to the next element that gets started (several calls
-- are merged). Otherwise the data is merged into the document level metadata.
--
-- The nesting depth is taken from stacksize and not from #chain: tags.stop
-- only lowers stacksize and leaves old entries in chain, so #chain reflects
-- the deepest nesting reached so far instead of the current depth.
--
-- data : settings string, parsed with settings_to_hash
function tags.registermetadata(data)
    local d = settings_to_hash(data)
    if stacksize > 1 then
        if metadata then
            merge(metadata,d)
        else
            metadata = d
        end
    else
        merge(documentdata,d)
    end
end
314
-- Returns the table with document level metadata (a fresh empty table when
-- there is none).
function tags.getmetadata()
    if documentdata then
        return documentdata
    end
    return { }
end
318
-- Registers a serializer under a name. Anything that is not a function is
-- silently ignored; the registry table is created on first use.
--
-- name       : key under which the serializer is stored
-- serializer : function
function tags.registerextradata(name,serializer)
    if type(serializer) ~= "function" then
        return
    end
    if extradata then
        extradata[name] = serializer
    else
        extradata = { [name] = serializer }
    end
end
328
-- Returns the table of registered serializers (name -> function), or false
-- when nothing has been registered yet.
function tags.getextradata()
    local registered = extradata
    return registered
end
332
-- Opens a new element.
--
-- tag           : plain tag name
-- specification : optional table; it is extended in place with the fields
--                 attribute, tagindex, taglist and tagname, and empty string
--                 values for userdata, detail and parents are removed. A
--                 string userdata is converted with settings_to_hash.
--
-- The element gets the next free attribute number and the full tag
-- "tag>index", where index counts how often this tag has been started. The
-- specification is stored under both, the 'tagged' attribute is set, and the
-- attribute number is returned. Pending metadata (see tags.registermetadata)
-- is handed over to this element; the element "document>1" always gets the
-- document level metadata table instead.
function tags.start(tag,specification)
    if not enabled then
        codeinjections.enabletags()
        enabled = true
    end
    --
    labels[tag] = tag -- can go away
    --
    local attribute = #taglist + 1
    local index     = (ids[tag] or 0) + 1
    local fulltag   = tag .. ">" .. index
    --
    ids[tag]        = index
    lasttags[tag]   = index
    tagcontext[tag] = fulltag
    --
    stacksize        = stacksize + 1
    chain[stacksize] = fulltag
    stack[stacksize] = attribute
    --
    local nesting = { unpack(chain,1,stacksize) } -- a copy so we can add actualtext
    --
    if not specification then
        specification = { }
    end
    specification.attribute = attribute
    specification.tagindex  = index
    specification.taglist   = nesting
    specification.tagname   = tag
    if metadata then
        -- pending metadata is consumed by the first element that follows
        specification.metadata = metadata
        metadata = nil
    end
    local userdata = specification.userdata
    if userdata == "" then
        specification.userdata = nil
    elseif type(userdata) == "string" then
        specification.userdata = settings_to_hash(userdata)
    end
    if specification.detail == "" then
        specification.detail = nil
    end
    if specification.parents == "" then
        specification.parents = nil
    end
    --
    taglist[attribute]      = specification
    specifications[fulltag] = specification
    --
    if fulltag == "document>1" then
        specification.metadata = documentdata
    end
    --
    texsetattribute(a_tagged,attribute)
    return attribute
end
400
-- Reopens an element on top of the current stack.
--
-- attribute : either a known attribute number, in which case the last entry
--             of its stored chain becomes the current full tag, or another
--             value (a string) that is pushed as-is on the chain and gets a
--             new attribute number with a copy of the current chain.
--
-- Sets the 'tagged' attribute and returns the attribute number in use.
function tags.restart(attribute)
    stacksize = stacksize + 1
    if type(attribute) ~= "number" then
        chain[stacksize] = attribute -- a string
        attribute = #taglist + 1
        taglist[attribute] = { taglist = { unpack(chain,1,stacksize) } }
    else
        local nesting = taglist[attribute].taglist
        chain[stacksize] = nesting[#nesting]
    end
    stack[stacksize] = attribute
    texsetattribute(a_tagged,attribute)
    return attribute
end
415
-- Closes the innermost open element and makes its parent current again.
--
-- The 'tagged' attribute is set to the attribute number of the parent, or to
-- unsetvalue when nothing remains open; that value is also returned.
--
-- An "ignoring end tag" trace message is only issued when there was nothing
-- to close (or when an open level unexpectedly has no attribute). Closing the
-- outermost element is a balanced stop and is no longer reported.
function tags.stop()
    local unbalanced = not (stacksize > 0)
    if not unbalanced then
        stacksize = stacksize - 1
    end
    local t = stack[stacksize]
    if not t then
        if trace_tags and (unbalanced or stacksize > 0) then
            report_tags("ignoring end tag, previous chain: %s",stacksize > 0 and concat(chain," ",1,stacksize) or "none")
        end
        t = unsetvalue
    end
    texsetattribute(a_tagged,t)
    return t
end
430
-- Returns how often the given tag has been started so far, or the string "?"
-- when it has never been started. The second argument is not used.
function tags.getid(tag,detail)
    local id = ids[tag]
    if id then
        return id
    end
    return "?"
end
434
-- Returns the index of the most recently started element with this tag name
-- (nil when there is none).
function tags.last(tag)
    local index = lasttags[tag] -- or false
    return index
end
438
-- Returns a full tag ("tag>number"): for a non-empty tag name the most
-- recently started element with that name, otherwise the innermost open
-- element.
function tags.lastinchain(tag)
    if not tag or tag == "" then
        return chain[stacksize]
    end
    return tagcontext[tag]
end
446
-- Captures everything in front of the first ">" of a full tag.
local strip = C((1 - P(">"))^1)

-- Returns the plain tag name of the innermost open element, or nothing when
-- no element is open.
function tags.elementtag()
    local fulltag = chain[stacksize]
    if not fulltag then
        return
    end
    return lpegmatch(strip,fulltag)
end

-- Returns the part of the given full tag that precedes the first ">".
function tags.strip(fulltag)
    return lpegmatch(strip,fulltag)
end
459
-- Adds user properties to an element.
--
-- tag  : plain tag name of the element to change
-- list : "key=value,..." settings string
--
-- When list is missing or empty, the first argument is taken to be the list
-- and the innermost open element is the target. Existing userdata is updated
-- key by key, otherwise the parsed list becomes the userdata. Nothing happens
-- when the target cannot be resolved.
function tags.setuserproperties(tag,list)
    local fulltag
    if not list or list == "" then
        fulltag, list = chain[stacksize], tag
    else
        fulltag = tagcontext[tag]
    end
    if not fulltag then
        return
    end
    local settings      = settings_to_hash(list)
    local specification = specifications[fulltag]
    if not specification then
        return -- error
    end
    local userdata = specification.userdata
    if not userdata then
        specification.userdata = settings
        return
    end
    for key, value in next, settings do
        userdata[key] = value
    end
end
483
-- Placeholder handler (we need a dummy): hands back the list untouched
-- together with false.
function tags.handler(head)
    local done = false
    return head, done
end
487
-- Statistics line: the number of registered chains and, when elements are
-- still open, the open chain. Nothing is reported when tagging was never
-- enabled.
statistics.register("structure elements", function()
    if not enabled then
        return
    end
    local total = #taglist
    if stacksize > 0 then
        return format("%s element chains identified, open chain: %s ",total,concat(chain," => ",1,stacksize))
    end
    return format("%s element chains identified",total)
end)
497
-- Directive "backend.addtags": switches tagging on in the backend. The value
-- passed by the directive is honoured, so a false value does not enable
-- tagging (before, any invocation enabled it). Tagging that is already on is
-- left alone, as there is no way to switch it off here.
-- NOTE(review): assumes the directive mechanism passes false when the
-- directive is disabled - confirm against the setter implementation.
directives.register("backend.addtags", function(v)
    if v and not enabled then
        codeinjections.enabletags()
        enabled = true
    end
end)
504
505-- interface
506
-- Basic interface: open an element by tag name and close the current one.
local starttag = tags.start

implement { name = "starttag", actions = starttag,  arguments = "string" }
implement { name = "stoptag",  actions = tags.stop }
519
-- Private variants of starttag that pass extra fields in the specification:
-- userdata (_u), detail (_d), or detail plus parents (_c).
implement {
    name      = "starttag_u",
    scope     = "private",
    arguments = "2 strings",
    actions   = function(tag,userdata)
        starttag(tag,{ userdata = userdata })
    end,
}

implement {
    name      = "starttag_d",
    scope     = "private",
    arguments = "2 strings",
    actions   = function(tag,detail)
        starttag(tag,{ detail = detail })
    end,
}

implement {
    name      = "starttag_c",
    scope     = "private",
    arguments = "3 strings",
    actions   = function(tag,detail,parents)
        starttag(tag,{ detail = detail, parents = parents })
    end,
}
540
-- Property setters. The _b and _n variants call tags.setproperty with the
-- key fixed to 'backend' and 'nature' respectively.
implement {
    name      = "settagaspect",
    actions   = tags.setaspect,
    arguments = "2 strings",
}

implement {
    name      = "settagproperty",
    actions   = tags.setproperty,
    arguments = "3 strings",
}

implement {
    name      = "settagproperty_b",
    scope     = "private",
    actions   = tags.setproperty,
    arguments = { "string", "'backend'", "string" },
}

implement {
    name      = "settagproperty_n",
    scope     = "private",
    actions   = tags.setproperty,
    arguments = { "string", "'nature'",  "string" },
}

-- The result of tags.elementtag is passed on to context.
implement {
    name    = "getelementtag",
    actions = { tags.elementtag, context },
}
547
-- Remaining interface: user properties, the "inside element" test (its result
-- is fed to commands.testcase) and metadata registration.
implement { name = "setelementuserproperties", actions = tags.setuserproperties, arguments = "2 strings", scope = "private" }

implement { name = "doifelseinelement", actions = { structures.atlocation, commands.testcase }, arguments = "string" }

implement { name = "settaggedmetadata", actions = tags.registermetadata, arguments = "string" }
566