-- lpdf-tag.lmt / size: 26 Kb / last modification: 2024-01-16 09:02
1if not modules then modules = { } end modules ['lpdf-tag'] = {
2    version   = 1.001,
3    comment   = "companion to lpdf-tag.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9local next, type = next, type
10local format, match, gmatch = string.format, string.match, string.gmatch
11local concat, sortedhash = table.concat, table.sortedhash
12local lpegmatch, P, S, C = lpeg.match, lpeg.P, lpeg.S, lpeg.C
13local settings_to_hash = utilities.parsers.settings_to_hash
14local formatters = string.formatters
15
16local trace_tags = false  trackers.register("structures.tags",      function(v) trace_tags = v end)
17local trace_info = false  trackers.register("structures.tags.info", function(v) trace_info = v end)
18
19local report_tags = logs.reporter("backend","tags")
20
21local pdfbackend          = backends.registered.pdf
22local nodeinjections      = pdfbackend.nodeinjections
23local codeinjections      = pdfbackend.codeinjections
24
25local enableaction        = nodes.tasks.enableaction
26local disableaction       = nodes.tasks.disableaction
27
28local lpdf                = lpdf
29local pdfdictionary       = lpdf.dictionary
30local pdfarray            = lpdf.array
31local pdfboolean          = lpdf.boolean
32local pdfconstant         = lpdf.constant
33local pdfreference        = lpdf.reference
34local pdfunicode          = lpdf.unicode
35local pdfmakenametree     = lpdf.makenametree
36
37local addtocatalog        = lpdf.addtocatalog
38local addtopageattributes = lpdf.addtopageattributes
39
40local pdfflushobject      = lpdf.flushobject
41local pdfreserveobject    = lpdf.reserveobject
42local pdfpagereference    = lpdf.pagereference
43
44local texgetcount         = tex.getcount
45
46local nodes               = nodes
47local nodecodes           = nodes.nodecodes
48
49local hlist_code          = nodecodes.hlist
50local vlist_code          = nodecodes.vlist
51local glyph_code          = nodecodes.glyph
52
53local a_tagged            = attributes.private('tagged')
54local a_image             = attributes.private('image')
55
56local nuts                = nodes.nuts
57
58local nodepool            = nuts.pool
59local setstate            = nodepool.setstate
60local register            = nodepool.register
61
62local getid               = nuts.getid
63local getattr             = nuts.getattr
64local getprev             = nuts.getprev
65local getnext             = nuts.getnext
66local getlist             = nuts.getlist
67local getchar             = nuts.getchar
68
69local tailoflist          = nuts.tail
70local setlink             = nuts.setlink
71local setlist             = nuts.setlist
72
73local copy_node           = nuts.copy
74local tosequence          = nuts.tosequence
75
76local nextnode            = nuts.traversers.node
77
-- Structure tree state. The first four are only set up when the tagging
-- handler runs for the first time.

local structure_kids, structure_ref, parent_ref, root -- delayed

-- names       : element id -> reference to the structure element
-- tree        : page number -> page related list of element references
-- elements    : full tag -> element record (tag, pref, kids, knum, pnum)
-- firstintree : first page number that got a list (false when none yet)
-- lastintree  : highest page number that got a list (false when none yet)

local names, tree, elements   = { }, { }, { }
local firstintree, lastintree = false, false

local structurestags = structures.tags
local taglist        = structurestags.taglist
local specifications = structurestags.specifications
local usedlabels     = structurestags.labels
local properties     = structurestags.properties
local usewithcare    = structurestags.usewithcare

-- tag names that ended up in an element, collected for the role map

local usedmapping = { }

----- tagsplitter         = structurestags.patterns.splitter

-- embeddedtags: false (no ids), true (id all elements, for tracing) or a
-- table with the tag names that get an id

local embeddedtags     = false
local f_tagid          = formatters["%s-%04i"]
local embeddedfilelist = pdfarray() -- /AF crap
102
103-- for testing, not that it was ever used:
104
-- A string value is taken as a list of tag names (separated by commas and/or
-- spaces) that get an id; any other true value enables ids for all tags
-- unless something has been set already.

directives.register("structures.tags.embed",function(v)
    if type(v) ~= "string" then
        if v and not embeddedtags then
            embeddedtags = true
        end
        return
    end
    -- a previous false or true gets replaced by a fresh table
    local wanted = type(embeddedtags) == "table" and embeddedtags or { }
    embeddedtags = wanted
    for name in gmatch(v,"([^, ]+)") do
        wanted[name] = true
    end
end)
117
-- for old times' sake, not that it was ever used:
119
-- Adds "math" to the set of tags that get an id. This can only enable, and
-- it is a no-op when all tags get an id already.

directives.register("structures.tags.embedmath",function(v)
    if not v or embeddedtags == true then
        return
    end
    if embeddedtags then
        embeddedtags.math = true
    else
        embeddedtags = { math = true }
    end
end)
131
-- Registry of explicitly mapped tags: original -> { target, kind }. It has to
-- be a declared local: without the declaration the assignment below indexes
-- an undefined global and raises an error on the first call.
-- NOTE(review): nothing visible in this file reads the registry; confirm
-- whether a consumer is expected elsewhere.

local mapping = { }

-- Remembers that tag 'original' maps onto 'target'.
--
-- original : tag name that gets mapped
-- target   : tag name it maps onto
-- kind     : optional kind of the mapping, defaults to "inline"

function codeinjections.maptag(original,target,kind)
    mapping[original] = { target, kind or "inline" }
end
135
136-- mostly the same as the annotations tree
137
138local usenamespace = false  experiments.register("structures.tags.namespaces", function(v) usenamespace = v end)
139
140local namespaceurls = {
141    mathml = "http://www.w3.org/1998/Math/MathML",
142}
143
-- Document finalizer. Writes the parent tree (one entry per tagged page), the
-- id tree, the role map(s) and the structure tree root, registers the root
-- (and for pdf 1.x the mark info) in the catalog and finally flushes the kids
-- arrays of all collected elements. Nothing happens when no element was
-- added to the root.

local function finishstructure()
    if root and #structure_kids > 0 then
        -- pairs of (zero based page index, reference to the page related list)
        local nums   = pdfarray()
        local n      = 0
        for i=firstintree,lastintree do
            local ti = tree[i]
            if ti then
                n = n + 1 ; nums[n] = i - 1
                n = n + 1 ; nums[n] = pdfreference(pdfflushobject(ti))
            else
                report_tags("beware: missing page %i in tree", i)
            end
        end
        local parenttree = pdfdictionary {
            Nums = nums
        }
        local idtree = pdfmakenametree(names)
        --
        -- Every used tag ends up either in the main role map or, when
        -- namespaces are enabled and the tag has one, in the role map of
        -- that namespace.
        --
        local rolemaps = usenamespace and { }
        local rolemap  = pdfdictionary() -- main one
        for tag in next, usedmapping do
            -- a separate local: the loop control variable is left untouched
            local k = usedlabels[tag] or tag
            local p = properties[k]
            if not p then
                -- goes through the reporter instead of a bare print
                report_tags("beware: undefined tag %s, mapped onto Span", k)
            end
            local n = p and p.namespace
            if rolemaps and n then
                local r = rolemaps[n]
                if not r then
                    r = pdfdictionary()
                    rolemaps[n] = r
                end
                r[k] = pdfconstant(k) -- maybe other tag
            else
                rolemap[k] = pdfconstant(p and p.pdf or "Span") -- or "Div"
            end
        end
        local namespaces = rolemaps and next(rolemaps) and pdfarray { } or nil
        if namespaces then
            for k, v in sortedhash(rolemaps) do
                namespaces[#namespaces+1] = pdfdictionary {
                    Type      = pdfconstant("Namespace"),
                    NS        = pdfunicode(namespaceurls[k] or k),
                    RoleMapNS = v,
                }
            end
        end
        local structuretree = pdfdictionary {
            Type       = pdfconstant("StructTreeRoot"),
            K          = pdfreference(pdfflushobject(structure_kids)),
            ParentTree = pdfreference(pdfflushobject(parent_ref,parenttree)),
            IDTree     = idtree,
            RoleMap    = rolemap, -- sorted ?
            Namespaces = namespaces,
        }
        pdfflushobject(structure_ref,structuretree)
        addtocatalog("StructTreeRoot",pdfreference(structure_ref))
        --
        if lpdf.majorversion() == 1 then
            local markinfo = pdfdictionary {
                Marked         = pdfboolean(true) or nil,
             -- UserProperties = pdfboolean(true), -- maybe some day
             -- Suspects       = pdfboolean(true) or nil,
             -- AF             = #embeddedfilelist > 0 and pdfreference(pdfflushobject(embeddedfilelist)) or nil,
            }
            addtocatalog("MarkInfo",pdfreference(pdfflushobject(markinfo)))
        end
        --
        -- the kids arrays were reserved when the elements got made and can
        -- only be flushed now that all content has been seen
        --
        for fulltag, element in sortedhash(elements) do -- sorting is easier on comparing pdf
            local kids = element.kids
         -- if element.tag == "link" then
         --     local d = kids[2]
         --     if type(d) == "table" then
         --         local refatt = element.refatt
         --         if refatt then
         --             local refobj = codeinjections.getrefobj(refatt)
         --             if refobj then
         --                 d.Obj = pdfreference(refobj)
         --             end
         --         end
         --     end
         -- end
            pdfflushobject(element.knum,kids)
        end
    end
end
231
232lpdf.registerdocumentfinalizer(finishstructure,"document structure")
233
234local index, pageref, pagenum, list = 0, nil, 0, nil
235
236local pdf_mcr            = pdfconstant("MCR")
237local pdf_struct_element = pdfconstant("StructElem")
238local pdf_s              = pdfconstant("S")
239local pdf_objr           = pdfconstant("OBJR")
240
-- Resets the page related state: the marked content counter, the page number
-- and reference, and a fresh page related list that gets registered in the
-- tree. It also keeps track of the first and last page that got a list.

local function initializepage()
    local page     = texgetcount("realpageno")
    local ref      = pdfreference(pdfpagereference(page))
    local pagelist = pdfarray()
    --
    index   = 0
    pagenum = page
    pageref = ref
    list    = pagelist
    --
    if not firstintree then
        -- hm, can be later than 1
        if page > 1 then
            report_tags("beware: first page in tree is %i", page)
        end
        firstintree = page
        lastintree  = page
    elseif page > lastintree then
        lastintree = page
 -- else
 --     report_tags("beware: page order problem in tree at page %i", page)
    end
    --
    tree[page] = pagelist -- we can flush after done, todo
end
261
-- Adds the structure related entries to the attributes of the current page:
-- the (zero based) key of the page in the parent tree and the tab order.

local function finishpage()
    -- flush what can be flushed
    addtopageattributes("StructParents",pagenum-1)
    -- there might be more; the value has to be the /S constant, not the
    -- undefined global 's' (nil) that was passed here before
    addtopageattributes("Tabs",pdf_s)
end
268
269-- here we can flush and free elements that are finished
270
271local pdf_userproperties = pdfconstant("UserProperties")
272
273-- /O /Table
274-- /Headers [ ]
275
-- Turns a hash of user properties into an attribute dictionary with owner
-- UserProperties. Returns nothing when there is no (or an empty) hash.
--
-- t : hash with name/value pairs

local function makeattribute(t)
    if not t or not next(t) then
        return
    end
    local entries = pdfarray()
    local count   = 0
    for name, value in sortedhash(t) do -- easier on comparing pdf
        count = count + 1
        entries[count] = pdfdictionary {
            N = pdfunicode(name),
            V = pdfunicode(value),
        }
    end
    return pdfdictionary {
        O = pdf_userproperties,
        P = entries,
    }
end
291
-- Creates the structure element for a full tag, flushes its dictionary,
-- adds it to the kids of the parent and registers it in 'elements'.
--
-- fulltag : key into the tag specifications
-- parent  : element record (or the root) that gets the new element as kid
--
-- Returns the new element record, false when the tag is to be ignored, or
-- true when the tag is to be skipped (no element of its own).

local function makeelement(fulltag,parent)
    local specification = specifications[fulltag]
    local tagname       = specification.tagname
    --
    if tagname == "ignore" then
        return false
    end
    if tagname == "mstackertop" or tagname == "mstackerbot" or tagname == "mstackermid" then
        -- TODO
        return true
    end
    --
    -- head cells get their own tag and spanning cells a table attribute
    --
    local tagnameused = tagname
    local attributes  = nil
    if tagname == "tabulatecell" then
        local cell = structurestags.gettabulatecell(fulltag)
        if cell and cell.kind == 1 then
            tagnameused = "tabulateheadcell"
        end
    elseif tagname == "tablecell" then
        -- will become a plugin model
        local cell = structurestags.gettablecell(fulltag)
        if cell then
            if cell.kind == 1 then
                tagnameused = "tableheadcell"
            end
            local nofrows = cell.rows    or 1
            local nofcols = cell.columns or 1
            if nofrows > 1 or nofcols > 1 then
                attributes = pdfdictionary {
                    O       = pdfconstant("Table"),
                    RowSpan = nofrows > 1 and nofrows or nil,
                    ColSpan = nofcols > 1 and nofcols or nil,
                }
            end
        end
    end
    --
    local detail   = specification.detail
    local userdata = specification.userdata -- not used here
    --
    usedmapping[tagname] = true
    --
    -- specification.attribute is unique
    --
    -- an id (and optionally an associated file) only when embedding is
    -- enabled for all tags or for this one
    --
    local id, af = nil, nil
    if embeddedtags and (embeddedtags == true or embeddedtags[tagname]) then
        id = f_tagid(tagname,specification.tagindex)
        af = job.fileobjreferences.collected[id]
        if af then
            af = pdfarray { pdfreference(af) }
         -- embeddedfilelist[#embeddedfilelist+1] = pdfreference(af)
        end
    end
    --
    -- the kids array gets an object number now but is flushed at the end
    -- of the document
    --
    local kidsarray = pdfarray()
    local kidsnum   = pdfreserveobject()
    local label     = usedlabels[tagnameused] or tagnameused
    local element   = pdfdictionary {
        Type       = pdf_struct_element,
        S          = pdfconstant(label),
        ID         = id,
        T          = detail or nil,
        P          = parent.pref,
        Pg         = pageref,
        K          = pdfreference(kidsnum),
     -- A          = a and makeattribute(a) or nil,
        A          = attributes,
     -- Alt        = " Who cares ",
     -- ActualText = " Hi Hans ",
        AF         = af,
    }
    local reference = pdfreference(pdfflushobject(element))
    if id and names then
        names[id] = reference
    end
    local siblings = parent.kids
    siblings[#siblings+1] = reference
    --
    local record = {
        tag  = label,
        pref = reference,
        kids = kidsarray,
        knum = kidsnum,
        pnum = pagenum
    }
    elements[fulltag] = record
    return record
end
383
384local f_BDC = formatters["/%s <</MCID %s>> BDC"]
385
386local a_destination = attributes.private('destination')
387local a_reference   = attributes.private('reference')
388
-- Registers a piece of marked content with its parent element and returns
-- the BDC operator that opens it.
--
-- start         : first node of the range (not consulted here)
-- parent        : element record that gets the content as kid
-- id            : kind of range, "image" gets an Alt entry
-- specification : tag specification, only its taglist is used (for images)

local function makecontent(start,parent,id,specification)
    local tag  = parent.tag
    local kids = parent.kids
    local mcid = index
    if id == "image" then
        local tags  = specification.taglist
        local image = usewithcare.images[tags[#tags]]
        local label = image and image.label
        kids[#kids+1] = pdfdictionary {
            Type = pdf_mcr,
            Pg   = pageref,
            MCID = mcid,
            Alt  = pdfunicode(label ~= "" and label or "image"),
        }
    elseif pagenum ~= parent.pnum then
        -- content on another page than the element: an explicit reference
     -- kids[#kids+1] = pdfreference(pdfflushobject(d))
        kids[#kids+1] = pdfdictionary {
            Type = pdf_mcr,
            Pg   = pageref,
            MCID = mcid,
        }
    else
        -- same page as the element: the bare id suffices
        kids[#kids+1] = mcid
     -- if tag == "link" then
     --     local ra = getattr(start,a_reference)
     --     if ra then
     --         parent.refatt = ra
     --         kids[#kids+1] = pdfdictionary {
     --             Type = pdf_objr,
     --             Obj  = pdfreference(0),
     --         }
     --     end
     -- end
    end
    --
    index = mcid + 1
    list[index] = parent.pref -- page related list
    --
    return f_BDC(tag,mcid)
end
431
-- Returns the operator that opens an artifact; the specification is accepted
-- but not consulted.

local function makeignore(specification)
    local operator = "/Artifact BMC"
    return operator
end
435
436-- no need to adapt head, as we always operate on lists
437
438local EMCliteral = nil
439local visualize  = nil
440
441local enabled = true
442local reduced = false
443
444updaters.register("tagging.state.disable", function() enabled = false end)
445updaters.register("tagging.state.enable",  function() enabled = true  end)
446
447directives.register("tagging.state.reduced", function(v) reduced = v end)
448
-- Switches to reduced tagging, where the whole page content ends up under
-- one document tag, and reports that.

function codeinjections.reducetags()
    reduced = true
    report_tags("only outer level document tag used")
end
453
-- Handler that wraps the content of a page in marked content operators and
-- hooks that content into the structure tree.
--
-- The list is scanned for ranges: runs of glyphs that share the same value
-- of the 'tagged' attribute, and boxes that carry the 'image' attribute. For
-- each range the elements in its tag list that do not exist yet are created,
-- and the range gets a literal in front (BDC, or BMC for artifacts) and a
-- copy of the EMC literal after it.
--
-- head : the box that is processed (accessed with the nuts helpers)
--
-- Returns head, or nothing when tagging has been disabled with the
-- "tagging.state.disable" updater.

function nodeinjections.addtags(head)

    if not enabled then
        return
    end

    if not EMCliteral then
        EMCliteral = register(setstate("EMC"))
    end

    local last      = nil
    local ranges    = { }
    local range     = nil
    local nofranges = 0

    -- the root of the structure tree is set up on first use

    if not root then
        structure_kids = pdfarray()
        structure_ref  = pdfreserveobject()
        parent_ref     = pdfreserveobject()
        root           = { pref = pdfreference(structure_ref), kids = structure_kids }
        names          = pdfarray()
    end

    initializepage()

    if reduced then

        -- one range for the whole content, under one document tag; mind that
        -- this replaces the (file level) taglist

        local list = getlist(head)

        if list then

            ranges = {
                { 1, "glyph", list, tailoflist(list), head }
            }

            nofranges = 1

            taglist = {
                {
                    attribute = 1,
                    metadata  = { },
                    tagindex  = 1,
                    taglist   = { "document>1" },
                    tagname   = "document",
                },
            }

        end

    else

        -- a range is { attr, id, start, stop, list } where attr is false for
        -- content without tag (it becomes an artifact)

        local function collectranges(head,list)
            for n, id in nextnode, head do
                if id == glyph_code then
                    -- maybe also disc
                    if getchar(n) ~= 0 then
                        local at = getattr(n,a_tagged) or false -- false: pagebody or so, so artifact
                        if last ~= at then
                            range = { at, "glyph", n, n, list } -- attr id start stop list
                            nofranges = nofranges + 1
                            ranges[nofranges] = range
                            last = at
                        elseif range then
                            range[4] = n -- stop
                        end
                    end
                elseif id == hlist_code or id == vlist_code then
                    local at = getattr(n,a_image)
                    if at then
                        local at = getattr(n,a_tagged) or false -- false: pagebody or so, so artifact
                        nofranges = nofranges + 1
                        ranges[nofranges] = { at, "image", n, n, list } -- attr id start stop list
                        last = nil
                    else
                        local list = getlist(n)
                        if list then
                            collectranges(list,n)
                        end
                    end
                end
            end
        end

        collectranges(head)

    end

-- inspect(taglist)
-- inspect(ranges)

    if trace_tags then
        for i=1,nofranges do
            local range = ranges[i]
            local attr  = range[1]
            local id    = range[2]
            local start = range[3]
            local stop  = range[4]
            local tags  = taglist[attr]
            if tags then -- not ok ... only first lines
                report_tags("%s => %s : %05i % t",tosequence(start,start),tosequence(stop,stop),attr,tags.taglist)
            end
        end
    end

    -- the tag list of the previous tagged range and its length

    local top    = nil
    local noftop = 0

    -- Links the given literal (and the optional left visualizer) in front of
    -- start, and a copy of the EMC literal (preceded by the optional right
    -- visualizer) after stop. When start has no predecessor the literal
    -- becomes the new start of the list of the enclosing box.

    local function inject(start,stop,list,literal,left,right)
        local prev = getprev(start)
        if prev then
            setlink(prev,literal)
        end
        if left then
            setlink(literal,left,start)
        else
            setlink(literal,start)
        end
        if list and not prev then
            setlist(list,literal)
        end
        local literal = copy_node(EMCliteral)
        -- use insert instead:
        local next = getnext(stop)
        if next then
            setlink(literal,next)
        end
        if right then
            setlink(stop,right,literal)
        else
            setlink(stop,literal)
        end
    end

--     local function inject(start,stop,list,literal,left,right)
--         setlink(getprev(start) or list or true,literal,left or true,start)
--         setlink(stop,right or true,copy_node(EMCliteral),getnext(stop))
--     end

    for i=1,nofranges do

        local range = ranges[i]
        local attr  = range[1]
        local id    = range[2]
        local start = range[3]
        local stop  = range[4]
        local list  = range[5]

        if attr then

            local specification = taglist[attr]
            local taglist       = specification.taglist
            local noftags       = #taglist
            local common        = 0
            local literal       = nil
            local ignore        = false

            -- the number of leading tags shared with the previous range

            if top then
                for i=1,noftags >= noftop and noftop or noftags do
                    if top[i] == taglist[i] then
                        common = i
                    else
                        break
                    end
                end
            end

            local prev = common > 0 and elements[taglist[common]] or root

            -- create the elements that are not there yet, starting below the
            -- deepest shared one

            for j=common+1,noftags do
                local tag = taglist[j]
                local prv = elements[tag] or makeelement(tag,prev)
                if prv == false then
                    -- ignore this one
                    prev   = false
                    ignore = true
                    break
                elseif prv == true then
                    -- skip this one
                else
                    prev = prv
                end
            end
            if prev then
                literal = setstate(makecontent(start,prev,id,specification))
            elseif ignore then
                literal = setstate(makeignore(specification))
            else
                -- maybe also ignore or maybe better: comment or so
            end

            if literal then
                local left,right
                if trace_info then
                    local name = specification.tagname
                    if name then
                        if not visualize then
                            visualize = nodes.visualizers.register("tags")
                        end
                        left  = visualize(name)
                        right = visualize()
                    end
                end
                inject(start,stop,list,literal,left,right)
            end

            top    = taglist
            noftop = noftags

        else

            -- Untagged content becomes an artifact. There is no specification
            -- in this branch (the local of that name lives in the other one),
            -- so nothing is passed; makeignore does not look at its argument.

            local literal = setstate(makeignore())

            inject(start,stop,list,literal)

        end

    end

    finishpage()

    return head

end
677
678-- variant: more structure but funny collapsing in viewer
679
680-- function nodeinjections.addtags(head)
681--
682--     local last, ranges, range = nil, { }, nil
683--
684--     local function collectranges(head,list)
685--         for n, id in nextnode, head do
686--             if id == glyph_code then
687--                 local at = getattr(n,a_tagged)
688--                 if not at then
689--                     range = nil
690--                 elseif last ~= at then
691--                     range = { at, "glyph", n, n, list } -- attr id start stop list
692--                     ranges[#ranges+1] = range
693--                     last = at
694--                 elseif range then
695--                     range[4] = n -- stop
696--                 end
697--             elseif id == hlist_code or id == vlist_code then
698--                 local at = getattr(n,a_image)
699--                 if at then
700--                     local at = getattr(n,a_tagged)
701--                     if not at then
702--                         range = nil
703--                     else
704--                         ranges[#ranges+1] = { at, "image", n, n, list } -- attr id start stop list
705--                     end
706--                     last = nil
707--                 else
708--                     local nl = getlist(n)
709--                     collectranges(nl,n)
710--                 end
711--             end
712--         end
713--     end
714--
715--     initializepage()
716--
717--     collectranges(head)
718--
719--     if trace_tags then
720--         for i=1,#ranges do
721--             local range = ranges[i]
722--             local attr  = range[1]
723--             local id    = range[2]
724--             local start = range[3]
725--             local stop  = range[4]
726--             local tags  = taglist[attr]
727--             if tags then -- not ok ... only first lines
728--                 report_tags("%s => %s : %05i % t",tosequence(start,start),tosequence(stop,stop),attr,tags.taglist)
729--             end
730--         end
731--     end
732--
733--     local top    = nil
734--     local noftop = 0
735--     local last   = nil
736--
737--     for i=1,#ranges do
738--         local range         = ranges[i]
739--         local attr          = range[1]
740--         local id            = range[2]
741--         local start         = range[3]
742--         local stop          = range[4]
743--         local list          = range[5]
744--         local specification = taglist[attr]
745--         local taglist       = specification.taglist
746--         local noftags       = #taglist
747--         local tag           = nil
748--         local common        = 0
749--      -- local prev          = root
750--
751--         if top then
752--             for i=1,noftags >= noftop and noftop or noftags do
753--                 if top[i] == taglist[i] then
754--                     common = i
755--                 else
756--                     break
757--                 end
758--             end
759--         end
760--
761--         local result        = { }
762--         local r             = noftop - common
763--         if r > 0 then
764--             for i=1,r do
765--                 result[i] = "EMC"
766--             end
767--         end
768--
769--         local prev   = common > 0 and elements[taglist[common]] or root
770--
771--         for j=common+1,noftags do
772--             local tag = taglist[j]
773--             local prv = elements[tag] or makeelement(tag,prev)
774--          -- if prv == false then
775--          --     -- ignore this one
776--          --     prev = false
777--          --     break
778--          -- elseif prv == true then
779--          --     -- skip this one
780--          -- else
781--                 prev = prv
782--                 r = r + 1
783--                 result[r] = makecontent(start,prev,id)
784--          -- end
785--         end
786--
787--         if r > 0 then
788--             local literal = setstate(concat(result,"\n"))
789--             -- use insert instead:
790--             local literal = setstate(result)
791--             local prev = getprev(start)
792--             if prev then
793--                 setlink(prev,literal)
794--             end
795--             setlink(literal,start)
796--             if list and getlist(list) == start then
797--                 setlist(list,literal)
798--             end
799--         end
800--
801--         top    = taglist
802--         noftop = noftags
803--         last   = stop
804--
805--     end
806--
807--     if last and noftop > 0 then
808--         local result = { }
809--         for i=1,noftop do
810--             result[i] = "EMC"
811--         end
812--         local literal = setstate(concat(result,"\n"))
813--         -- use insert instead:
814--         local next = getnext(last)
815--         if next then
816--             setlink(literal,next)
817--         end
818--         setlink(last,literal)
819--     end
820--
821--     finishpage()
822--
823--     return head
824--
825-- end
826
827-- this belongs elsewhere (export is not pdf related)
828
829local permitted = true
830local enabled   = false
831
-- Blocks tagging when called with false: the handlers get disabled when they
-- were enabled, and enabling is no longer permitted afterwards. Any other
-- value does nothing.

function codeinjections.settaggingsupport(option)
    if option ~= false then
        return
    end
    if enabled then
        disableaction("shipouts","structures.tags.handler")
     -- disableaction("shipouts","nodes.handlers.accessibility") -- maybe not this one
        disableaction("math","noads.handlers.tags")
        enabled = false
    end
    if not permitted then
        return
    end
    if trace_tags then
        report_tags("blocking structure tags")
    end
    permitted = false
end
848
-- Installs the tagging handler and enables the related actions, unless
-- tagging has been blocked or is enabled already.

function codeinjections.enabletags()
    if enabled or not permitted then
        return
    end
    structures.tags.handler = nodeinjections.addtags
    enableaction("shipouts","structures.tags.handler")
 -- enableaction("shipouts","nodes.handlers.accessibility")
    enableaction("math","noads.handlers.tags")
    -- maybe also textblock
    if trace_tags then
        report_tags("enabling structure tags")
    end
    enabled = true
end
862