lpdf-tag.lua /size: 22 Kb    last modification: 2020-07-01 14:35
1if not modules then modules = { } end modules ['lpdf-tag'] = {
2    version   = 1.001,
3    comment   = "companion to lpdf-tag.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9local next, type = next, type
10local format, match, gmatch = string.format, string.match, string.gmatch
11local concat, sortedhash = table.concat, table.sortedhash
12local lpegmatch, P, S, C = lpeg.match, lpeg.P, lpeg.S, lpeg.C
13local settings_to_hash = utilities.parsers.settings_to_hash
14local formatters = string.formatters
15
16local trace_tags = false  trackers.register("structures.tags",      function(v) trace_tags = v end)
17local trace_info = false  trackers.register("structures.tags.info", function(v) trace_info = v end)
18
19local report_tags = logs.reporter("backend","tags")
20
21local backends            = backends
22local lpdf                = lpdf
23local nodes               = nodes
24
25local nodeinjections      = backends.pdf.nodeinjections
26local codeinjections      = backends.pdf.codeinjections
27
28local enableaction        = nodes.tasks.enableaction
29local disableaction       = nodes.tasks.disableaction
30
31local pdfdictionary       = lpdf.dictionary
32local pdfarray            = lpdf.array
33local pdfboolean          = lpdf.boolean
34local pdfconstant         = lpdf.constant
35local pdfreference        = lpdf.reference
36local pdfunicode          = lpdf.unicode
37local pdfflushobject      = lpdf.flushobject
38local pdfreserveobject    = lpdf.reserveobject
39local pdfpagereference    = lpdf.pagereference
40local pdfmakenametree     = lpdf.makenametree
41
42local addtocatalog        = lpdf.addtocatalog
43local addtopageattributes = lpdf.addtopageattributes
44
45local texgetcount         = tex.getcount
46
47local nodecodes           = nodes.nodecodes
48
49local hlist_code          = nodecodes.hlist
50local vlist_code          = nodecodes.vlist
51local glyph_code          = nodecodes.glyph
52
53local a_tagged            = attributes.private('tagged')
54local a_image             = attributes.private('image')
55
56local nuts                = nodes.nuts
57
58local nodepool            = nuts.pool
59local pageliteral         = nodepool.pageliteral
60local register            = nodepool.register
61
62local getid               = nuts.getid
63local getattr             = nuts.getattr
64local getprev             = nuts.getprev
65local getnext             = nuts.getnext
66local getlist             = nuts.getlist
67local getchar             = nuts.getchar
68
69local setlink             = nuts.setlink
70local setlist             = nuts.setlist
71
72local copy_node           = nuts.copy
73local tosequence          = nuts.tosequence
74
75local nextnode            = nuts.traversers.node
76
77local structure_kids   -- delayed
78local structure_ref    -- delayed
79local parent_ref       -- delayed
80local root             -- delayed
81local names               = { }
82local tree                = { }
83local elements            = { }
84
85local structurestags      = structures.tags
86local taglist             = structurestags.taglist
87local specifications      = structurestags.specifications
88local usedlabels          = structurestags.labels
89local properties          = structurestags.properties
90local usewithcare         = structurestags.usewithcare
91
92local usedmapping         = { }
93
94----- tagsplitter         = structurestags.patterns.splitter
95
96local embeddedtags        = false -- true will id all, for tracing, otherwise table
97local f_tagid             = formatters["%s-%04i"]
98local embeddedfilelist    = pdfarray() -- /AF crap
99
100-- for testing, not that it was ever used:
101
-- Directive "structures.tags.embed": a string value is read as a list of tag
-- names (separated by commas and/or spaces) that get an ID; any other true
-- value switches on IDs for all tags, but only when nothing was set before.

directives.register("structures.tags.embed",function(v)
    if type(v) ~= "string" then
        if v and not embeddedtags then
            embeddedtags = true
        end
        return
    end
    -- a string always ends up in a table, also when 'all' (true) was set before
    local wanted = embeddedtags
    if type(wanted) ~= "table" then
        wanted       = { }
        embeddedtags = wanted
    end
    for name in gmatch(v,"([^, ]+)") do
        wanted[name] = true
    end
end)
114
115-- for old times sake, not that it was ever used:
116
-- Directive "structures.tags.embedmath": adds "math" to the set of tags that
-- get an ID. It can only enable; a false value is ignored, and nothing needs
-- to happen when all tags are already selected.

directives.register("structures.tags.embedmath",function(v)
    if not v or embeddedtags == true then
        return
    end
    if embeddedtags then
        embeddedtags.math = true
    else
        embeddedtags = { math = true }
    end
end)
128
-- Registered tag mappings, keyed by the original tag name; each entry is the
-- pair { target, kind }. The table is only filled here: nothing else in this
-- file reads it.

local mapping = { }

-- Remembers that tag 'original' maps onto 'target'. The optional 'kind'
-- defaults to "inline".
--
-- This used to assign into an undeclared 'mapping', which fails with an
-- "attempt to index a nil value" error on the first call unless a global of
-- that name happens to exist.

function codeinjections.maptag(original,target,kind)
    mapping[original] = { target, kind or "inline" }
end
132
133-- mostly the same as the annotations tree
134
-- Document finalizer: writes the structure tree root, the parent tree, the id
-- tree, the role map and the kids arrays of all collected elements. Nothing is
-- written when no structure was collected.

local function finishstructure()
    if not root or #structure_kids == 0 then
        return
    end
    -- parent tree: pairs of a zero based key and a reference to the list that
    -- was collected for that page (see initializepage)
    local nums = pdfarray()
    for page=1,#tree do
        local slot = 2 * page
        nums[slot-1] = page - 1
        nums[slot]   = pdfreference(pdfflushobject(tree[page]))
    end
    local parenttree = pdfdictionary {
        Nums = nums
    }
    local idtree = pdfmakenametree(names)
    -- role map: every tag that was used maps onto the pdf role found in its
    -- properties, with "Span" as fallback (or "Div")
    local rolemap = pdfdictionary()
    for tag in next, usedmapping do
        local label = usedlabels[tag] or tag
        local found = properties[label]
        rolemap[label] = pdfconstant(found and found.pdf or "Span")
    end
    -- the kids are flushed before the parent tree, which keeps the object
    -- order the same as before
    local kidsreference   = pdfreference(pdfflushobject(structure_kids))
    local parentreference = pdfreference(pdfflushobject(parent_ref,parenttree))
    local structuretree   = pdfdictionary {
        Type       = pdfconstant("StructTreeRoot"),
        K          = kidsreference,
        ParentTree = parentreference,
        IDTree     = idtree,
        RoleMap    = rolemap, -- sorted ?
    }
    pdfflushobject(structure_ref,structuretree)
    addtocatalog("StructTreeRoot",pdfreference(structure_ref))
    --
    if lpdf.majorversion() == 1 then
        -- candidates for later: UserProperties, Suspects, AF
        local markinfo = pdfdictionary {
            Marked = pdfboolean(true) or nil,
        }
        addtocatalog("MarkInfo",pdfreference(pdfflushobject(markinfo)))
    end
    -- the kids arrays were reserved in makeelement; sorting is easier on
    -- comparing pdf
    for _, element in sortedhash(elements) do
        pdfflushobject(element.knum,element.kids)
    end
end

lpdf.registerdocumentfinalizer(finishstructure,"document structure")
181
-- state of the page that is being tagged

local index   = 0   -- next marked content id on this page
local pageref = nil -- reference to the page object
local pagenum = 0   -- value of the realpageno counter
local list    = nil -- per page array: the owning element for each content id

local pdf_mcr            = pdfconstant("MCR")
local pdf_struct_element = pdfconstant("StructElem")
local pdf_s              = pdfconstant("S")

-- Resets the page state and registers a fresh per page list in the tree.

local function initializepage()
    local realpage = texgetcount("realpageno")
    local perpage
    index   = 0
    pagenum = realpage
    pageref = pdfreference(pdfpagereference(realpage))
    perpage = pdfarray()
    list    = perpage
    tree[realpage] = perpage -- we can flush after done, todo
end
195
-- Adds the structure related entries to the attributes of the current page.

local function finishpage()
    -- flush what can be flushed: the zero based key of this page in the parent
    -- tree (the same numbering as used in finishstructure)
    addtopageattributes("StructParents",pagenum-1)
    -- there might be more: the tab order follows the structure (/S); this
    -- used to pass an undefined variable 's', so effectively nil
    addtopageattributes("Tabs",pdf_s)
end
202
203-- here we can flush and free elements that are finished
204
local pdf_userproperties = pdfconstant("UserProperties")

-- /O /Table
-- /Headers [ ]

-- Wraps a key/value table in a user properties attribute dictionary: an array
-- of << /N key /V value >> entries. Returns nothing when there is no table or
-- when the table has no entries.

local function makeattribute(t)
    if not (t and next(t)) then
        return
    end
    local entries = pdfarray()
    for name, value in sortedhash(t) do -- easier on comparing pdf
        local entry = pdfdictionary {
            N = pdfunicode(name),
            V = pdfunicode(value),
        }
        entries[#entries+1] = entry
    end
    return pdfdictionary {
        O = pdf_userproperties,
        P = entries,
    }
end
225
-- Creates the structure element for 'fulltag' as a kid of 'parent' (a table
-- with at least 'pref' and 'kids'), flushes its dictionary and registers the
-- result in 'elements'.
--
-- Returns false for an "ignore" tag, true for the (not yet handled) math
-- stacker tags, and otherwise a table with the fields tag, pref, kids, knum
-- and pnum.

local function makeelement(fulltag,parent)
    local specification = specifications[fulltag]
    local tagname       = specification.tagname
    --
    if tagname == "ignore" then
        return false
    end
    if tagname == "mstackertop" or tagname == "mstackerbot" or tagname == "mstackermid" then
        -- TODO
        return true
    end
    -- cells can end up as head cells and table cells can carry span attributes
    local usedname   = tagname
    local attributes = nil
    if tagname == "tabulatecell" then
        local cell = structurestags.gettabulatecell(fulltag)
        if cell and cell.kind == 1 then
            usedname = "tabulateheadcell"
        end
    elseif tagname == "tablecell" then
        -- will become a plugin model
        local cell = structurestags.gettablecell(fulltag)
        if cell then
            if cell.kind == 1 then
                usedname = "tableheadcell"
            end
            local nofrows    = cell.rows    or 1
            local nofcolumns = cell.columns or 1
            if nofrows > 1 or nofcolumns > 1 then
                attributes = pdfdictionary {
                    O       = pdfconstant("Table"),
                    RowSpan = nofrows    > 1 and nofrows    or nil,
                    ColSpan = nofcolumns > 1 and nofcolumns or nil,
                }
            end
        end
    end
    -- NOTE(review): the role map is fed with the original tag name, not with
    -- the head cell variant chosen above; confirm that this is intended
    usedmapping[tagname] = true
    --
    -- specification.attribute is unique
    --
    local id = nil
    local af = nil
    if embeddedtags == true or (embeddedtags and embeddedtags[tagname]) then
        id = f_tagid(tagname,specification.tagindex)
        af = job.fileobjreferences.collected[id]
        if af then
            af = pdfarray { pdfreference(af) }
        end
    end
    -- the kids array is only reserved here; finishstructure flushes it
    local kidsarray  = pdfarray()
    local kidsobject = pdfreserveobject()
    local label      = usedlabels[usedname] or usedname
    local detail     = specification.detail
    local dictionary = pdfdictionary {
        Type = pdf_struct_element,
        S    = pdfconstant(label),
        ID   = id,
        T    = detail or nil,
        P    = parent.pref,
        Pg   = pageref,
        K    = pdfreference(kidsobject),
        A    = attributes,
        AF   = af,
    }
    local reference = pdfreference(pdfflushobject(dictionary))
    if id and names then
        names[id] = reference
    end
    local siblings = parent.kids
    siblings[#siblings+1] = reference
    local element = {
        tag  = label,
        pref = reference,
        kids = kidsarray,
        knum = kidsobject,
        pnum = pagenum
    }
    elements[fulltag] = element
    return element
end
317
local f_BDC = formatters["/%s <</MCID %s>> BDC"]

-- Registers the next marked content id of the current page as a kid of
-- 'parent' and returns the BDC operator string that opens that content.
--
-- An image always gets a marked content reference dictionary with an Alt
-- entry; other content is stored as the bare id when the parent element was
-- created on the current page and as a reference dictionary otherwise.

local function makecontent(parent,id,specification)
    local kids = parent.kids
    local mcid = index
    local kid
    if id == "image" then
        local tags  = specification.taglist
        local data  = usewithcare.images[tags[#tags]]
        local label = data and data.label
        local alt   = "image"
        if label and label ~= "" then
            alt = label
        end
        kid = pdfdictionary {
            Type = pdf_mcr,
            Pg   = pageref,
            MCID = mcid,
            Alt  = pdfunicode(alt),
        }
    elseif pagenum == parent.pnum then
        kid = mcid
    else
        kid = pdfdictionary {
            Type = pdf_mcr,
            Pg   = pageref,
            MCID = mcid,
        }
    end
    kids[#kids+1] = kid
    -- the page related list is one based, the ids are zero based
    index = mcid + 1
    list[index] = parent.pref
    --
    return f_BDC(parent.tag,mcid)
end
352
-- Returns the operator that opens an artifact; the specification is accepted
-- but not consulted.

local function makeignore(specification)
    local operator = "/Artifact BMC"
    return operator
end
356
357-- no need to adapt head, as we always operate on lists
358
local EMCliteral = nil -- template for the closing literal, copied for each range
local visualize  = nil -- tracing helper, registered on first use

-- Tags the content of one page. This function is installed as
-- structures.tags.handler by codeinjections.enabletags.
--
-- The node list is scanned (recursively) for runs of glyphs that share the
-- same 'tagged' attribute and for boxes that carry the 'image' attribute.
-- Each range is wrapped in a pair of page literals: "/<tag> <</MCID n>> BDC"
-- and "EMC" for tagged content, "/Artifact BMC" and "EMC" for untagged or
-- ignored content. Structure elements that are needed but do not exist yet
-- are created on the fly.
--
-- head : the first node of the page list
--
-- Returns the head it was called with.

function nodeinjections.addtags(head)

    if not EMCliteral then
        EMCliteral = register(pageliteral("EMC"))
    end

    local last   = nil
    local ranges = { }
    local range  = nil

    if not root then
        structure_kids = pdfarray()
        structure_ref  = pdfreserveobject()
        parent_ref     = pdfreserveobject()
        root           = { pref = pdfreference(structure_ref), kids = structure_kids }
        names          = pdfarray()
    end

    -- a range is { attribute, kind, start, stop, list } where list is the box
    -- that holds the nodes (nil at the outer level)

    local function collectranges(head,list)
        for n, id in nextnode, head do
            if id == glyph_code then
                -- maybe also disc
                if getchar(n) ~= 0 then
                    local at = getattr(n,a_tagged) or false -- false: pagebody or so, so artifact
                    if last ~= at then
                        range = { at, "glyph", n, n, list }
                        ranges[#ranges+1] = range
                        last = at
                    elseif range then
                        range[4] = n -- stop
                    end
                end
            elseif id == hlist_code or id == vlist_code then
                if getattr(n,a_image) then
                    local at = getattr(n,a_tagged) or false -- false: pagebody or so, so artifact
                    ranges[#ranges+1] = { at, "image", n, n, list }
                    -- the next glyph always starts a new range
                    last = nil
                else
                    local content = getlist(n)
                    if content then
                        collectranges(content,n)
                    end
                end
            end
        end
    end

    initializepage()

    collectranges(head)

    if trace_tags then
        for i=1,#ranges do
            local range = ranges[i]
            local attr  = range[1]
            local start = range[3]
            local stop  = range[4]
            local tags  = taglist[attr]
            if tags then -- not ok ... only first lines
                report_tags("%s => %s : %05i % t",tosequence(start,start),tosequence(stop,stop),attr,tags.taglist)
            end
        end
    end

    local top    = nil -- tag list of the previous tagged range
    local noftop = 0

    -- puts 'opener' (optionally followed by 'left') before start, and a copy
    -- of the EMC literal (optionally preceded by 'right') after stop

    local function inject(start,stop,list,opener,left,right)
        local before = getprev(start)
        if before then
            setlink(before,opener)
        end
        if left then
            setlink(opener,left,start)
        else
            setlink(opener,start)
        end
        if list and not before then
            -- start was the first node of the box, so the box gets a new head
            setlist(list,opener)
        end
        local closer = copy_node(EMCliteral)
        -- use insert instead:
        local after = getnext(stop)
        if after then
            setlink(closer,after)
        end
        if right then
            setlink(stop,right,closer)
        else
            setlink(stop,closer)
        end
    end

    for i=1,#ranges do

        local range = ranges[i]
        local attr  = range[1]
        local id    = range[2]
        local start = range[3]
        local stop  = range[4]
        local list  = range[5]

        if attr then

            local specification = taglist[attr]
            local tags          = specification.taglist
            local noftags       = #tags
            local common        = 0
            local literal       = nil
            local ignore        = false

            -- the number of leading tags shared with the previous range

            if top then
                for i=1,noftags >= noftop and noftop or noftags do
                    if top[i] == tags[i] then
                        common = i
                    else
                        break
                    end
                end
            end

            local prev = common > 0 and elements[tags[common]] or root

            -- make sure that the remaining tags have an element

            for j=common+1,noftags do
                local tag = tags[j]
                local prv = elements[tag] or makeelement(tag,prev)
                if prv == false then
                    -- ignore this one
                    prev   = false
                    ignore = true
                    break
                elseif prv == true then
                    -- skip this one
                else
                    prev = prv
                end
            end

            if prev then
                literal = pageliteral(makecontent(prev,id,specification))
            elseif ignore then
                literal = pageliteral(makeignore(specification))
            else
                -- maybe also ignore or maybe better: comment or so
            end

            if literal then
                local left, right
                if trace_info then
                    local name = specification.tagname
                    if name then
                        if not visualize then
                            visualize = nodes.visualizers.register("tags")
                        end
                        left  = visualize(name)
                        right = visualize()
                    end
                end
                inject(start,stop,list,literal,left,right)
            end

            top    = tags
            noftop = noftags

        else

            -- untagged content becomes an artifact; there is no specification
            -- here (this used to pass an undefined global of that name)

            local literal = pageliteral(makeignore())

            inject(start,stop,list,literal)

        end

    end

    finishpage()

    return head

end
549
550-- variant: more structure but funny collapsing in viewer
551
552-- function nodeinjections.addtags(head)
553--
554--     local last, ranges, range = nil, { }, nil
555--
556--     local function collectranges(head,list)
557--         for n, id in nextnode, head do
558--             if id == glyph_code then
559--                 local at = getattr(n,a_tagged)
560--                 if not at then
561--                     range = nil
562--                 elseif last ~= at then
563--                     range = { at, "glyph", n, n, list } -- attr id start stop list
564--                     ranges[#ranges+1] = range
565--                     last = at
566--                 elseif range then
567--                     range[4] = n -- stop
568--                 end
569--             elseif id == hlist_code or id == vlist_code then
570--                 local at = getattr(n,a_image)
571--                 if at then
572--                     local at = getattr(n,a_tagged)
573--                     if not at then
574--                         range = nil
575--                     else
576--                         ranges[#ranges+1] = { at, "image", n, n, list } -- attr id start stop list
577--                     end
578--                     last = nil
579--                 else
580--                     local nl = getlist(n)
581--                     collectranges(nl,n)
582--                 end
583--             end
584--         end
585--     end
586--
587--     initializepage()
588--
589--     collectranges(head)
590--
591--     if trace_tags then
592--         for i=1,#ranges do
593--             local range = ranges[i]
594--             local attr  = range[1]
595--             local id    = range[2]
596--             local start = range[3]
597--             local stop  = range[4]
598--             local tags  = taglist[attr]
599--             if tags then -- not ok ... only first lines
600--                 report_tags("%s => %s : %05i % t",tosequence(start,start),tosequence(stop,stop),attr,tags.taglist)
601--             end
602--         end
603--     end
604--
605--     local top    = nil
606--     local noftop = 0
607--     local last   = nil
608--
609--     for i=1,#ranges do
610--         local range         = ranges[i]
611--         local attr          = range[1]
612--         local id            = range[2]
613--         local start         = range[3]
614--         local stop          = range[4]
615--         local list          = range[5]
616--         local specification = taglist[attr]
617--         local taglist       = specification.taglist
618--         local noftags       = #taglist
619--         local tag           = nil
620--         local common        = 0
621--      -- local prev          = root
622--
623--         if top then
624--             for i=1,noftags >= noftop and noftop or noftags do
625--                 if top[i] == taglist[i] then
626--                     common = i
627--                 else
628--                     break
629--                 end
630--             end
631--         end
632--
633--         local result        = { }
634--         local r             = noftop - common
635--         if r > 0 then
636--             for i=1,r do
637--                 result[i] = "EMC"
638--             end
639--         end
640--
641--         local prev   = common > 0 and elements[taglist[common]] or root
642--
643--         for j=common+1,noftags do
644--             local tag = taglist[j]
645--             local prv = elements[tag] or makeelement(tag,prev)
646--          -- if prv == false then
647--          --     -- ignore this one
648--          --     prev = false
649--          --     break
650--          -- elseif prv == true then
651--          --     -- skip this one
652--          -- else
653--                 prev = prv
654--                 r = r + 1
655--                 result[r] = makecontent(prev,id)
656--          -- end
657--         end
658--
659--         if r > 0 then
660--             local literal = pageliteral(concat(result,"\n"))
661--             -- use insert instead:
662--             local literal = pageliteral(result)
663--             local prev = getprev(start)
664--             if prev then
665--                 setlink(prev,literal)
666--             end
667--             setlink(literal,start)
668--             if list and getlist(list) == start then
669--                 setlist(list,literal)
670--             end
671--         end
672--
673--         top    = taglist
674--         noftop = noftags
675--         last   = stop
676--
677--     end
678--
679--     if last and noftop > 0 then
680--         local result = { }
681--         for i=1,noftop do
682--             result[i] = "EMC"
683--         end
684--         local literal = pageliteral(concat(result,"\n"))
685--         -- use insert instead:
686--         local next = getnext(last)
687--         if next then
688--             setlink(literal,next)
689--         end
690--         setlink(last,literal)
691--     end
692--
693--     finishpage()
694--
695--     return head
696--
697-- end
698
699-- this belongs elsewhere (export is not pdf related)
700
local permitted = true  -- false once tagging has been blocked
local enabled   = false -- true while the tagging actions are active

-- Blocks tagging when called with the value false: active actions are
-- disabled and later calls to codeinjections.enabletags do nothing. Any other
-- value is ignored.

function codeinjections.settaggingsupport(option)
    if option ~= false then
        return
    end
    if enabled then
        disableaction("shipouts","structures.tags.handler")
        disableaction("shipouts","nodes.handlers.accessibility") -- maybe not this one
        disableaction("math","noads.handlers.tags")
        enabled = false
    end
    if not permitted then
        return
    end
    if trace_tags then
        report_tags("blocking structure tags")
    end
    permitted = false
end
720
-- Installs nodeinjections.addtags as the structure tag handler and enables
-- the related actions. This happens at most once and not at all when tagging
-- has been blocked.

function codeinjections.enabletags()
    if enabled or not permitted then
        return
    end
    structures.tags.handler = nodeinjections.addtags
    enableaction("shipouts","structures.tags.handler")
    enableaction("shipouts","nodes.handlers.accessibility")
    enableaction("math","noads.handlers.tags")
    -- maybe also textblock
    if trace_tags then
        report_tags("enabling structure tags")
    end
    enabled = true
end
734