strc-tag.lmt /size: 22 Kb    last modification: 2024-01-16 09:03
-- Register this file in the global module registry, creating the registry when
-- it does not exist yet.
if not modules then
    modules = { }
end

modules['strc-tag'] = {
    version   = 1.001,
    comment   = "companion to strc-tag.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
-- This is rather experimental code. Tagging happens on the fly and two analysers are
-- involved: the pdf backend tagger and the exporter. They share data but there are subtle
-- differences. Each tag carries a specification that can be accessed either by attribute
-- (the tag at the end of the chain) or by its so-called fulltag, which is the tag name
-- combined with a number (tag>number).
13
14local type, next = type, next
15local insert, remove, unpack, concat, merge = table.insert, table.remove, table.unpack, table.concat, table.merge
16local find, topattern, format = string.find, string.topattern, string.format
17local lpegmatch, P, S, C, Cc = lpeg.match, lpeg.P, lpeg.S, lpeg.C, lpeg.Cc
18local allocate = utilities.storage.allocate
19local settings_to_hash = utilities.parsers.settings_to_hash
20local setmetatableindex = table.setmetatableindex
21
-- Tracing is off by default; the "structures.tags" tracker toggles it.
local trace_tags = false

trackers.register("structures.tags", function(state)
    trace_tags = state
end)

local report_tags = logs.reporter("structure","tags")

-- Namespaces and helpers cached as locals.
local attributes      = attributes
local structures      = structures
local implement       = interfaces.implement

-- The attribute that carries the tag state.
local a_tagged        = attributes.private('tagged')

local unsetvalue      = attributes.unsetvalue
local codeinjections  = backends.codeinjections

local texgetattribute = tex.getattribute
local texsetattribute = tex.setattribute

-- Module state. Only the first stacksize entries of stack and chain are
-- meaningful: closing an element lowers stacksize but leaves the slots as is.
local taglist         = allocate() -- attribute number -> specification
local specifications  = allocate() -- fulltag (tag>number) -> specification
local labels          = allocate() -- tag -> tag
local stack           = { }        -- nesting level -> attribute number
local chain           = { }        -- nesting level -> fulltag
local ids             = { }        -- tag -> last index handed out
local enabled         = false      -- true once the backend got told to tag
local tagcontext      = { }        -- tag -> most recently started fulltag
local tagpatterns     = { }        -- exported lpeg patterns
local lasttags        = { }        -- tag -> last index
local stacksize       = 0          -- current nesting depth
local metadata        = nil        -- pending, applied to the next element
local documentdata    = { }        -- metadata of the document as a whole
local extradata       = false      -- name -> serializer, false while empty

-- Export the shared tables on the tags namespace.
local tags            = structures.tags
tags.taglist          = taglist -- can best be hidden
tags.labels           = labels
tags.patterns         = tagpatterns
tags.specifications   = specifications
58
-- Returns the attribute number of the innermost open element. Nothing is
-- returned when no element is open.
function tags.current()
    if stacksize <= 0 then
        return
    end
    return stack[stacksize] -- maybe copy or proxy
end
64
65-- Tags are internally stored as:
66--
67-- tag>number tag>number tag>number
68
-- Splits a fulltag into two captures: everything before the first ">" and
-- everything after it.
local p_splitter     = C((P(1) - P(">"))^1) * P(">") * C(P(1)^1)

tagpatterns.splitter = p_splitter
71
-- Per tag properties: the pdf structure type (pdf), the layout nature of the
-- element (display, mixed or inline) and optionally a namespace or a record
-- flag. The helpers below are only shorthands for building the entries; each
-- call produces a fresh table.

local properties

do

    local function display(pdf) return { pdf = pdf, nature = "display" } end
    local function mixed  (pdf) return { pdf = pdf, nature = "mixed"   } end
    local function inline (pdf) return { pdf = pdf, nature = "inline"  } end

    local function mathml(pdf,nature)
        return { pdf = pdf, nature = nature, namespace = "mathml" }
    end

    properties = allocate { -- todo: more "record = true" to improve formatting

        document              = display("Div"),

        division              = display("Div"),
        paragraph             = mixed  ("P"),
        p                     = mixed  ("P"),
        construct             = inline ("Span"),
        highlight             = inline ("Span"),

        section               = display("Sect"),
        sectioncaption        = { pdf = "Div", nature = "display", record = true },
        sectiontitle          = mixed  ("H"),
        sectionnumber         = mixed  ("H"),
        sectioncontent        = display("Div"),

        itemgroup             = display("L"),
        item                  = display("LI"),
        itemtag               = mixed  ("Lbl"),
        itemcontent           = mixed  ("LBody"),
        itemhead              = display("Div"),
        itembody              = display("Div"),

        description           = display("Div"),
        descriptiontag        = mixed  ("Div"),
        descriptioncontent    = mixed  ("Div"),
        descriptionsymbol     = inline ("Span"), -- note reference

        verbatimblock         = display("Code"),
        verbatimlines         = display("Code"),
        verbatimline          = mixed  ("Code"),
        verbatim              = inline ("Code"),

        lines                 = display("Code"),
        line                  = mixed  ("Code"),
        linenumber            = inline ("Span"),

        synonym               = inline ("Span"),
        sorting               = inline ("Span"),

        register              = display("Div"),
        registerlocation      = inline ("Span"),
        registersection       = display("Div"),
        registertag           = mixed  ("Span"),
        registerentries       = display("Div"),
        registerentry         = display("Div"),
        registercontent       = mixed  ("Span"),
        registersee           = mixed  ("Span"),
        registerpages         = mixed  ("Span"),
        registerpage          = mixed  ("Span"),
        registerseparator     = inline ("Span"),
        registerpagerange     = mixed  ("Span"),

        table                 = display("Table"),
        tablerow              = display("TR"),
        tablecell             = mixed  ("TD"),
        tableheadcell         = mixed  ("TH"),
        tablehead             = display("THEAD"),
        tablebody             = display("TBODY"),
        tablefoot             = display("TFOOT"),

        tabulate              = display("Table"),
        tabulaterow           = display("TR"),
        tabulatecell          = mixed  ("TD"),
        tabulateheadcell      = mixed  ("TH"),
        tabulatehead          = display("THEAD"),
        tabulatebody          = display("TBODY"),
        tabulatefoot          = display("TFOOT"),

        list                  = display("TOC"),
        listitem              = display("TOCI"),
        listtag               = mixed  ("Lbl"),
        listcontent           = mixed  ("P"),
        listdata              = mixed  ("P"),
        listpage              = mixed  ("Reference"),
        listtext              = inline ("Span"),

        delimitedblock        = display("BlockQuote"),
        delimited             = inline ("Quote"),
        delimitedcontent      = inline ("Span"),
        delimitedsymbol       = inline ("Span"),
        subsentence           = inline ("Span"),
        subsentencecontent    = inline ("Span"),
        subsentencesymbol     = inline ("Span"),

        label                 = mixed  ("Span"),
        number                = mixed  ("Span"),

        float                 = display("Div"), -- Figure
        floatcaption          = mixed  ("Caption"),
        floatlabel            = inline ("Span"),
        floatnumber           = inline ("Span"),
        floattext             = mixed  ("Span"),
        floatcontent          = mixed  ("P"),

        image                 = mixed  ("P"),
        mpgraphic             = mixed  ("P"),

        formulaset            = display("Div"),
        formula               = display("Div"), -- Formula
        formulacaption        = mixed  ("Span"),
        formulalabel          = mixed  ("Span"),
        formulanumber         = mixed  ("Span"),
        formulacontent        = display("P"),
        subformula            = display("Div"),

        link                  = inline ("Link"),
        reference             = inline ("Span"),

        margintextblock       = inline ("Span"),
        margintext            = inline ("Span"),
        marginanchor          = inline ("Span"),

        math                  = mathml("Div", "inline"), -- no display
        mn                    = mathml("Span","mixed"),
        mi                    = mathml("Span","mixed"),
        mo                    = mathml("Span","mixed"),
        ms                    = mathml("Span","mixed"),
        mrow                  = mathml("Span","display"),
        msubsup               = mathml("Span","display"),
        msub                  = mathml("Span","display"),
        msup                  = mathml("Span","display"),
        merror                = mathml("Span","mixed"),
        munderover            = mathml("Span","display"),
        munder                = mathml("Span","display"),
        mover                 = mathml("Span","display"),
        mtext                 = mathml("Span","mixed"),
        mfrac                 = mathml("Span","display"),
        mroot                 = mathml("Span","display"),
        msqrt                 = mathml("Span","display"),
        mfenced               = mathml("Span","display"),
        maction               = mathml("Span","display"),

        mstacker              = display("Span"), -- these are only internally used
        mstackertop           = display("Span"), -- these are only internally used
        mstackerbot           = display("Span"), -- these are only internally used
        mstackermid           = display("Span"), -- these are only internally used
        mextensible           = display("Span"), -- these are only internally used

        mtable                = mathml("Table","display"), -- might change
        mtr                   = mathml("TR",   "display"), -- might change
        mtd                   = mathml("TD",   "display"), -- might change

        ignore                = mixed  ("Span"), -- used internally
        private               = mixed  ("Span"), -- for users (like LS) when they need it
        metadata              = display("Div"),
        metavariable          = mixed  ("Span"),

        mid                   = inline ("Span"),
        sub                   = inline ("Span"),
        sup                   = inline ("Span"),
        subsup                = inline ("Span"),

        combination           = display("Span"),
        combinationpair       = display("Span"),
        combinationcontent    = mixed  ("Span"),
        combinationcaption    = mixed  ("Span"),

        publications          = display("Div"),
        publication           = mixed  ("Div"),
        pubfld                = inline ("Span"),

        block                 = display("Div"),
        userdata              = display("Div"),

    }

end

tags.properties = properties
240
-- Cache of search patterns, filled on first access: indexing with a tag name
-- builds (and stores) a pattern that is anchored at the start and ends at ">".
local patterns = setmetatableindex(function(cache,name)
    local pattern = topattern("^" .. name .. ">")
    cache[name] = pattern
    return pattern
end)
246
-- Looks for the given tag name in the nesting that belongs to the current
-- value of the tagged attribute, innermost element first. Returns the matching
-- fulltag, or false when the attribute is negative, unknown, or no element in
-- the nesting matches.
function tags.locatedtag(tag)
    local attribute = texgetattribute(a_tagged)
    if attribute < 0 then
        -- enabled but not auto
        return false
    end
    local specification = taglist[attribute]
    if not specification then
        return false
    end
    local nesting = specification.taglist
    local pattern = patterns[tag]
    for index=#nesting,1,-1 do
        local fulltag = nesting[index]
        if find(fulltag,pattern) then
            return fulltag
        end
    end
    return false -- handy as bogus index
end
266
-- Reports whether an element with the given tag name is part of the nesting
-- that belongs to the current value of the tagged attribute. Returns true on a
-- match and nothing otherwise.
function structures.atlocation(str) -- not used
    local specification = taglist[texgetattribute(a_tagged)]
    local nesting = specification and specification.taglist
    if not nesting then
        return
    end
    local pattern = patterns[str]
    for index=#nesting,1,-1 do
        if find(nesting[index],pattern) then
            return true
        end
    end
end
281
-- Sets one property of a tag; an entry is created for tags that have none yet.
function tags.setproperty(tag,key,value)
    local entry = properties[tag]
    if not entry then
        properties[tag] = { [key] = value }
        return
    end
    entry[key] = value
end
290
-- Sets one property for the innermost open element. The properties table is
-- indexed with the chain entry of that element (its fulltag). Nothing happens
-- when no element is open.
function tags.setaspect(key,value)
    local current = chain[stacksize]
    if not current then
        return
    end
    local entry = properties[current]
    if not entry then
        properties[current] = { [key] = value }
        return
    end
    entry[key] = value
end
302
-- Registers metadata given as a key/value settings string.
--
-- When we are nested deeper than the outermost element the data is kept pending
-- and attached to the next element that gets started (pending data accumulates
-- until then). Otherwise the data is merged into the document metadata.
--
-- The nesting depth is taken from stacksize and not from #chain: closing an
-- element only lowers stacksize and leaves the chain slots untouched, so #chain
-- reflects the deepest level reached so far instead of the current level.
function tags.registermetadata(data)
    local d = settings_to_hash(data)
    if stacksize > 1 then
        if metadata then
            merge(metadata,d)
        else
            metadata = d
        end
    else
        merge(documentdata,d)
    end
end
315
-- Returns the document metadata table, or a new empty table when there is none.
function tags.getmetadata()
    if documentdata then
        return documentdata
    end
    return { }
end
319
-- Registers a serializer function under the given name. Anything that is not a
-- function is silently ignored. The registry is created on first use.
function tags.registerextradata(name,serializer)
    if type(serializer) ~= "function" then
        return
    end
    if not extradata then
        extradata = { [name] = serializer }
        return
    end
    extradata[name] = serializer
end
329
-- Returns the table of registered serializers, or false when nothing has been
-- registered so far.
function tags.getextradata()
    local registered = extradata
    return registered
end
333
-- Tells whether tagging has been switched on (by starting a tag or by the
-- backend.addtags directive).
function tags.enabled()
    local state = enabled
    return state
end
337
-- Opens a new element.
--
-- tag           : the element name
-- specification : optional table; it is extended in place with the fields
--                 attribute, tagindex, taglist and tagname, and its userdata,
--                 detail and parents fields are normalized
--
-- The tagged attribute is set to the new attribute number, which is also the
-- return value. Pending metadata, when present, is handed to this element.
function tags.start(tag,specification)
    -- the first tag switches on tagging in the backend
    if not enabled then
        codeinjections.enabletags()
        enabled = true
    end

    labels[tag] = tag -- can go away

    local attribute   = #taglist + 1
    local tagindex    = (ids[tag] or 0) + 1
    local completetag = tag .. ">" .. tagindex

    -- bookkeeping per tag name
    ids[tag]      = tagindex
    lasttags[tag] = tagindex

    -- push onto the nesting
    stacksize        = stacksize + 1
    chain[stacksize] = completetag
    stack[stacksize] = attribute
    tagcontext[tag]  = completetag

    -- a copy so we can add actualtext
    local tagnesting = { unpack(chain,1,stacksize) }

    if not specification then
        specification = {
            attribute = attribute,
            tagindex  = tagindex,
            taglist   = tagnesting,
            tagname   = tag,
            metadata  = metadata,
        }
        metadata = nil
    else
        specification.attribute = attribute
        specification.tagindex  = tagindex
        specification.taglist   = tagnesting
        specification.tagname   = tag
        if metadata then
            specification.metadata = metadata
            metadata = nil
        end
        -- empty strings count as absent, userdata strings become a hash
        local userdata = specification.userdata
        if userdata == "" then
            specification.userdata = nil
        elseif type(userdata) == "string"  then
            specification.userdata = settings_to_hash(userdata)
        end
        if specification.detail == "" then
            specification.detail = nil
        end
        if specification.parents == "" then
            specification.parents = nil
        end
    end

    -- make the specification accessible by attribute and by fulltag
    taglist[attribute]          = specification
    specifications[completetag] = specification

    -- the first document element always carries the document metadata
    if completetag == "document>1" then
        specification.metadata = documentdata
    end

    texsetattribute(a_tagged,attribute)
    return attribute
end
405
-- Pushes an element onto the nesting again.
--
-- With a number the argument is taken as an attribute number: the innermost
-- fulltag of its registered nesting becomes the new chain entry. Anything else
-- is used as chain entry as is (a string) and a fresh attribute with a copy of
-- the current chain is registered for it.
--
-- The tagged attribute is set to the resulting attribute number, which is also
-- returned.
function tags.restart(attribute)
    stacksize = stacksize + 1
    if type(attribute) == "number" then
        local nesting = taglist[attribute].taglist
        chain[stacksize] = nesting[#nesting]
    else
        chain[stacksize] = attribute -- a string
        local nesting = { unpack(chain,1,stacksize) }
        attribute = #taglist + 1
        taglist[attribute] = { taglist = nesting }
    end
    stack[stacksize] = attribute
    texsetattribute(a_tagged,attribute)
    return attribute
end
420
-- Closes the innermost element: the nesting depth is lowered (never below zero)
-- and the tagged attribute is set back to the attribute of the enclosing
-- element. When there is no enclosing element the attribute is unset; with
-- tracing enabled this is reported. The value assigned to the attribute is
-- returned.
function tags.stop()
    if stacksize > 0 then
        stacksize = stacksize - 1
    end
    local attribute = stack[stacksize]
    if attribute then
        texsetattribute(a_tagged,attribute)
        return attribute
    end
    if trace_tags then
        local previous = "none"
        if stacksize > 0 then
            previous = concat(chain," ",1,stacksize)
        end
        report_tags("ignoring end tag, previous chain: %s",previous)
    end
    texsetattribute(a_tagged,unsetvalue)
    return unsetvalue
end
435
-- Returns the last index handed out for the given tag name, or "?" when the
-- tag has not been used yet. The detail argument is not used.
function tags.getid(tag,detail)
    local index = ids[tag]
    if index then
        return index
    end
    return "?"
end
439
-- Returns the index of the most recently started element with the given tag
-- name (nil when there is none).
function tags.last(tag)
    local index = lasttags[tag] -- or false
    return index
end
443
-- Returns the fulltag of the most recently started element with the given tag
-- name. Without a tag name (nil or empty) the fulltag of the innermost open
-- element is returned instead.
function tags.lastinchain(tag)
    if not tag or tag == "" then
        return chain[stacksize]
    end
    return tagcontext[tag]
end
451
-- Captures everything that comes before the first ">" of a fulltag.
local strip = C((P(1) - P(">"))^1)

-- Returns the tag name (the fulltag without its ">number" part) of the
-- innermost open element; nothing is returned when no element is open.
function tags.elementtag()
    local fulltag = chain[stacksize]
    if not fulltag then
        return
    end
    return lpegmatch(strip,fulltag)
end
460
-- Returns the part of the given fulltag that comes before the first ">".
function tags.strip(fulltag)
    local tagname = lpegmatch(strip,fulltag)
    return tagname
end
464
-- Adds user properties (a key/value settings string) to an element.
--
-- With two arguments the first one is a tag name and the properties go to the
-- most recently started element of that name. With only one (non empty)
-- argument that argument is the settings string and the innermost open element
-- is the target. Existing userdata is extended key by key, otherwise the parsed
-- settings become the userdata. Nothing happens when the element cannot be
-- resolved or has no specification.
function tags.setuserproperties(tag,list)
    if list and list ~= "" then
        tag = tagcontext[tag]
    else
        tag, list = chain[stacksize], tag
    end
    if not tag then
        return
    end
    -- from here on tag is a fulltag
    local settings      = settings_to_hash(list)
    local specification = specifications[tag]
    if not specification then
        -- error
        return
    end
    local userdata = specification.userdata
    if not userdata then
        specification.userdata = settings
        return
    end
    for key, value in next, settings do
        userdata[key] = value
    end
end
488
-- Placeholder handler (we need a dummy): hands back the given head unchanged
-- together with false.
function tags.handler(head)
    local done = false
    return head, done
end
492
-- Reports the number of registered element chains at the end of the run and,
-- when elements are still open, the open chain. Nothing is reported when
-- tagging was never enabled.
statistics.register("structure elements", function()
    if not enabled then
        return
    end
    local count = #taglist
    if stacksize > 0 then
        local open = concat(chain," => ",1,stacksize)
        return format("%s element chains identified, open chain: %s ",count,open)
    end
    return format("%s element chains identified",count)
end)
502
-- Switches on tagging in the backend when the backend.addtags directive fires.
-- NOTE(review): the value passed to the directive is not consulted, so any
-- invocation enables tagging -- confirm that this is intended.
directives.register("backend.addtags", function(v)
    if enabled then
        return
    end
    codeinjections.enabletags()
    enabled = true
end)
509
510-- interface
511
-- The commands that open and close elements. All variants end up in tags.start
-- and only differ in the specification that they pass along.

local starttag = tags.start

implement {
    name      = "strc_tags_start",
    arguments = "argument",
    public    = true,
    protected = true,
    actions   = starttag,
}

implement {
    name      = "strc_tags_stop",
    public    = true,
    protected = true,
    actions   = tags.stop,
}

implement {
    name      = "strc_tags_start_userdata",
    arguments = { "optional", "optional" },
    public    = true,
    protected = true,
    actions   = function(tag,userdata)
        starttag(tag,{ userdata = userdata })
    end,
}

implement {
    name      = "strc_tags_start_detail",
    arguments = "2 arguments",
    public    = true,
    protected = true,
    actions   = function(tag,detail)
        starttag(tag,{ detail = detail })
    end,
}

implement {
    name      = "strc_tags_start_ignore",
    arguments = { "argument" },
    public    = true,
    protected = true,
    actions   = function(detail)
        starttag("ignore",{ detail = detail })
    end,
}

implement {
    name      = "strc_tags_start_chained",
    arguments = "3 arguments",
    public    = true,
    protected = true,
    actions   = function(tag,detail,parents)
        starttag(tag,{ detail = detail, parents = parents })
    end,
}
560
-- The commands that change tag properties. The last two pass a fixed key ("pdf"
-- and "nature") in between the two user supplied arguments.

implement {
    name      = "strc_tags_set_aspect",
    arguments = "2 arguments",
    public    = true,
    protected = true,
    actions   = tags.setaspect,
}

implement {
    name      = "settagproperty",
    arguments = "3 arguments",
    actions   = tags.setproperty,
}

implement {
    name      = "setelementbackendtag",
 -- arguments = { "optional", "'backend'", "optional" },
    arguments = { "optional", "'pdf'", "optional" },
    public    = true,
    protected = true,
    actions   = tags.setproperty,
}

implement {
    name      = "setelementnature",
    arguments = { "optional", "'nature'",  "optional" },
    public    = true,
    protected = true,
    actions   = tags.setproperty,
}
591
-- The commands that query the current element, attach user properties or
-- metadata, and reduce the tagging level.

implement {
    name      = "strc_tags_get_element_tag",
    public    = true,
    protected = true,
    -- the tag name of the innermost element is passed on to context
    actions   = { tags.elementtag, context },
}

implement {
    name      = "strc_tags_set_element_user_properties",
    arguments = { "optional", "optional" },
    public    = true,
    protected = true,
    actions   = tags.setuserproperties,
}

implement {
    name      = "doifelseinelement",
    arguments = "argument",
    public    = true,
    protected = true,
    -- the outcome of the lookup is passed on to commands.doifelse
    actions   = { structures.atlocation, commands.doifelse },
}

implement {
    name      = "settaggedmetadata",
    arguments = "optional",
    public    = true,
    protected = true,
    actions   = tags.registermetadata,
}

implement {
    name      = "settagginglevel",
    arguments = "argument",
    protected = true,
    actions   = function(level)
        -- only the level "none" has an effect
        if level ~= interfaces.variables.none then
            return
        end
        codeinjections.reducetags()
    end,
}
633