lpdf-pde.lua /size: 40 Kb    last modification: 2020-07-01 14:35
-- Make sure the global module registry exists and describe this file in it.
modules = modules or { }

modules['lpdf-epd'] = {
    version   = 1.001,
    comment   = "companion to lpdf-epa.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
    history   = "this one replaces the poppler/pdfe binding",
}
9
10-- \enabledirectives[graphics.pdf.uselua]
11-- \enabledirectives[graphics.pdf.recompress]
12-- \enabledirectives[graphics.pdf.stripmarked]
13
14-- maximum integer : +2^32
15-- maximum real    : +2^15
16-- minimum real    : 1/(2^16)
17
18-- get_flagged : does that still work
19
20-- ppdoc_permissions (ppdoc *pdf);
21
22-- PPSTRING_ENCODED        1 <<  0
23-- PPSTRING_DECODED        1 <<  1
24-- PPSTRING_EXEC           1 <<  2   postscript only
25-- PPSTRING_PLAIN                0
26-- PPSTRING_BASE16         1 <<  3
27-- PPSTRING_BASE85         1 <<  4
28-- PPSTRING_UTF16BE        1 <<  5
29-- PPSTRING_UTF16LE        1 <<  6
30
31-- PPDOC_ALLOW_PRINT       1 <<  2   printing
32-- PPDOC_ALLOW_MODIFY      1 <<  3   filling form fields, signing, creating template pages
33-- PPDOC_ALLOW_COPY        1 <<  4   copying, copying for accessibility
34-- PPDOC_ALLOW_ANNOTS      1 <<  5   filling form fields, copying, signing
35-- PPDOC_ALLOW_EXTRACT     1 <<  9   contents copying for accessibility
36-- PPDOC_ALLOW_ASSEMBLY    1 << 10   no effect
37-- PPDOC_ALLOW_PRINT_HIRES 1 << 11   no effect
38
39-- PPCRYPT_NONE                  0   no encryption, go ahead
40-- PPCRYPT_DONE                  1   encryption present but password succeeded, go ahead
41-- PPCRYPT_PASS                 -1   encryption present, need non-empty password
42-- PPCRYPT_FAIL                 -2   invalid or unsupported encryption (eg. undocumented in pdf spec)
43
44local setmetatable, type, next = setmetatable, type, next
45local tostring, tonumber, unpack = tostring, tonumber, unpack
46local char, byte, find = string.char, string.byte, string.find
47local abs = math.abs
48local concat, swapped, sortedhash, sortedkeys = table.concat, table.swapped, table.sortedhash, table.sortedkeys
-- Code point to string converter. It is fed Unicode code points (up to the
-- supplementary planes when surrogate pairs are combined further down), but
-- string.char raises an error for anything above 255. So we prefer a real
-- UTF-8 encoder and only fall back to string.char when none is available.
local utfchar = (utf8 and utf8.char) or (utf and utf.char) or string.char
50local setmetatableindex = table.setmetatableindex
51local ioopen = io.open
52
53local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
54local P, C, S, R, Ct, Cc, V, Carg, Cs, Cf, Cg = lpeg.P, lpeg.C, lpeg.S, lpeg.R, lpeg.Ct, lpeg.Cc, lpeg.V, lpeg.Carg, lpeg.Cs, lpeg.Cf, lpeg.Cg
55
56if not lpdf then
57    require("lpdf-aux")
58end
59
60if not (number and number.dimenfactors) then
61    require("util-dim")
62end
63
64local pdfe              = pdfe
65      lpdf              = lpdf or { }
66local lpdf              = lpdf
67local lpdf_epdf         = { }
68      lpdf.epdf         = lpdf_epdf
69
70local pdfopen           = pdfe.open
71local pdfopenfile       = pdfe.openfile
72local pdfnew            = pdfe.new
73local pdfclose          = pdfe.close
74
75local getcatalog        = pdfe.getcatalog
76local getinfo           = pdfe.getinfo
77local gettrailer        = pdfe.gettrailer
78local getnofpages       = pdfe.getnofpages
79local getversion        = pdfe.getversion
80local getbox            = pdfe.getbox
81local getstatus         = pdfe.getstatus
82local unencrypt         = pdfe.unencrypt
83
84local dictionarytotable = pdfe.dictionarytotable
85local arraytotable      = pdfe.arraytotable
86local pagestotable      = pdfe.pagestotable
87local readwholestream   = pdfe.readwholestream
88
89local getfromreference  = pdfe.getfromreference
90
91local report_epdf       = logs.reporter("epdf")
92
93local allocate          = utilities.storage.allocate
94
95local bpfactor          = number.dimenfactors.bp
96
-- Object type codes, indexed from zero. Below the table is looked up both by
-- name (objectcodes.null) and by code (objectcodes[kind]), which is why it is
-- passed through 'swapped' with itself as second argument.

local objectcodes = { [0] =
    "none",
    "null",
    "bool",
    "integer",
    "number",
    "name",
    "string",
    "array",
    "dictionary",
    "stream",
    "reference",
}

-- Encryption states; the meaning of the values is listed in the PPCRYPT_*
-- comment at the top of this file.

local encryptioncodes = {
     [0] = "notencrypted",
     [1] = "unencrypted",
    [-1] = "protected",
    [-2] = "failure",
}

objectcodes                  = allocate(swapped(objectcodes,objectcodes))
encryptioncodes              = allocate(swapped(encryptioncodes,encryptioncodes))

pdfe.objectcodes             = objectcodes
pdfe.encryptioncodes         = encryptioncodes

-- One local per object code. Each one is declared exactly once: the earlier
-- duplicate declarations of null_object_code and reference_object_code only
-- shadowed themselves and used up local variable slots.

local none_object_code       = objectcodes.none
local null_object_code       = objectcodes.null
local bool_object_code       = objectcodes.bool
local integer_object_code    = objectcodes.integer
local number_object_code     = objectcodes.number
local name_object_code       = objectcodes.name
local string_object_code     = objectcodes.string
local array_object_code      = objectcodes.array
local dictionary_object_code = objectcodes.dictionary
local stream_object_code     = objectcodes.stream
local reference_object_code  = objectcodes.reference
138
139local checked_access
140local get_flagged     -- from pdfe -> lpdf
141
if lpdf.dictionary then

    -- we're in context

    local pdfdictionary = lpdf.dictionary
    local pdfarray      = lpdf.array
    local pdfconstant   = lpdf.constant
    local pdfstring     = lpdf.string
    local pdfunicode    = lpdf.unicode

    -- Fetch entry k from the wrapper t (which triggers a resolve) and, when
    -- the raw table f carries a flag for that key, wrap the value in the
    -- matching lpdf object. Unflagged and unknown flags pass the value as-is.

    get_flagged = function(t,f,k)
        local tk = t[k] -- triggers resolve
        local fk = f[k]
        if not fk then
            return tk
        elseif fk == "name" then
            return pdfconstant(tk)
        elseif fk == "array" then
            return pdfarray(tk)
        elseif fk == "dictionary" then
            -- this used to call pdfarray, which left pdfdictionary unused
            return pdfdictionary(tk)
        elseif fk == "rawtext" then
            return pdfstring(tk)
        elseif fk == "unicode" then
            return pdfunicode(tk)
        else
            return tk
        end
    end

else

    -- no lpdf object constructors available: just return the resolved value

    get_flagged = function(t,f,k)
        return t[k]
    end

end
179
-- We need to convert the string from utf16 although there is no way to
-- check if we have a regular string starting with a bom. So, we have
-- a dilemma here: a pdf doc encoded string can be invalid utf.
183
184-- <hex encoded>   : implicit 0 appended if odd
185-- (byte encoded)  : \( \) \\ escaped
186--
187-- <FE><FF> : utf16be
188--
-- \n \r \t \b \f \( \) \\ \NNN and \<newline> : append next line
190--
191-- the getString function gives back bytes so we don't need to worry about
192-- the hex aspect.
193
194local some_dictionary
195local some_array
196local some_stream
197local some_reference
198
199local some_string = lpdf.frombytes
200
-- Resolve entry 'key' of the raw table 't' (as produced by dictionarytotable
-- or arraytotable) into something usable from Lua:
--
--   * a missing key or a missing/false entry gives nothing
--   * non table entries are returned as they are
--   * table entries are dispatched on their object code in slot 1: names give
--     their value, strings are converted, containers and references are
--     wrapped by the some_* helpers
--   * any other table entry is returned untouched

local function get_value(document,t,key)
    if not key then
        return
    end
    local value = t[key]
    if not value then
        return
    end
    if type(value) ~= "table" then
        return value
    end
    -- we can assume names to be simple and strings to be tables
    local kind = value[1]
    if kind == name_object_code then
        return value[2]
    elseif kind == string_object_code then
        return some_string(value[2],value[3])
    elseif kind == array_object_code then
        return some_array(value[2],document)
    elseif kind == dictionary_object_code then
        return some_dictionary(value[2],document)
    elseif kind == stream_object_code then
        -- some_stream expects (stream, dictionary, document). The entry is
        -- assumed to have the same { kind, object, detail } layout as the
        -- values that some_reference gets from getfromreference, where the
        -- stream is passed as (object, b, document). Passing the whole entry
        -- plus the document (as was done here) handed the document table to
        -- dictionarytotable and lost the document itself.
        return some_stream(value[2],value[3],document)
    elseif kind == reference_object_code then
        return some_reference(value,document)
    end
    return value
end
229
-- Wrap a pdfe dictionary in a lazy Lua table: indexing resolves the entry via
-- get_value, calling the wrapper with a key goes through get_flagged. The raw
-- converted table stays reachable as __raw__.
some_dictionary = function (d,document)
    local raw  = dictionarytotable(d,true)
    local meta = { }
    function meta.__index(t,k)
        return get_value(document,raw,k)
    end
    function meta.__call(t,k)
        return get_flagged(t,raw,k)
    end
    local wrapper = {
        __raw__  = raw,
        __type__ = dictionary_object_code,
    }
    return setmetatable(wrapper,meta), "dictionary"
end
242
-- Wrap a pdfe array in a lazy Lua table. Entries are resolved on access, the
-- length of the raw table is stored in field n and also reported by the
-- length operator.
some_array = function (a,document)
    local raw   = arraytotable(a,true)
    local count = #raw
    local meta  = { }
    function meta.__index(t,k)
        return get_value(document,raw,k)
    end
    function meta.__call(t,k)
        return get_flagged(t,raw,k)
    end
    function meta.__len(t,k)
        return count
    end
    local wrapper = {
        __raw__  = raw,
        __type__ = array_object_code,
        n        = count,
    }
    return setmetatable(wrapper,meta), "array"
end
259
-- Wrap a pdfe stream 's' with its dictionary 'd'. Indexing the wrapper looks
-- into the stream dictionary, calling it reads the stream: pass false to get
-- the original data, anything else (or nothing) to get it uncompressed.
some_stream = function(s,d,document)
    local entries = dictionarytotable(d,true)
    local meta    = { }
    function meta.__index(t,k)
        return get_value(document,entries,k)
    end
    function meta.__call(t,raw)
        -- only an explicit false asks for the original data
        return readwholestream(s,raw ~= false)
    end
    local wrapper = {
        __raw__  = entries,
        __type__ = stream_object_code,
    }
    return setmetatable(wrapper,meta), "stream"
end
276
-- Resolve an indirect reference entry 'r' (pdfe reference in slot 2, object
-- number in slot 3). Results are cached per object number in
-- document.__cache__ and the reverse mapping from the resolved value to the
-- object number is kept in document.__xrefs__.
some_reference = function(r,document)
    local objnum = r[3]
    local known  = document.__cache__[objnum]
    if known then
        return known
    end
    local kind, object, b, c = getfromreference(r[2])
    local resolved
    if kind == dictionary_object_code then
        resolved = some_dictionary(object,document)
    elseif kind == array_object_code then
        resolved = some_array(object,document)
    elseif kind == stream_object_code then
        resolved = some_stream(object,b,document)
    else
        -- simple values are stored in their raw form (really cache this?)
        resolved = { kind, object, b, c }
    end
    document.__cache__[objnum]   = resolved
    document.__xrefs__[resolved] = objnum
    return resolved
end
297
298local resolvers     = { }
299lpdf_epdf.resolvers = resolvers
300
-- Index fallback for documents: when a resolver is registered under the
-- requested key it is run once and its result is stored in the document, so
-- later accesses find the field directly.
local function resolve(document,k)
    local resolver = resolvers[k]
    if not resolver then
        return
    end
    local entry = resolver(document)
    document[k] = entry
    return entry
end
309
-- Flatten a name tree node into target[name] = value. A node with a Names
-- array contributes its name/value pairs, otherwise its Kids are visited
-- recursively. Returns the (possibly freshly created) target, or nothing when
-- there is no node.
local function getnames(document,n,target) -- direct
    if not n then
        return
    end
    local names = n.Names
    if names then
        target = target or { }
        for i=1,#names,2 do
            local name = names[i]
            target[name] = names[i+1]
        end
        return target
    end
    local kids = n.Kids
    if kids then
        for i=1,#kids do
            target = getnames(document,kids[i],target)
        end
    end
    return target
end
331
-- Collect the leaves of a Kids tree (nodes without a Kids entry) in order
-- into the array 'target', which is created when the first leaf is found.
-- Returns nothing when there is no node.
local function getkids(document,n,target) -- direct
    if not n then
        return
    end
    local kids = n.Kids
    if not kids then
        -- a leaf
        if target then
            target[#target+1] = n
        else
            target = { n }
        end
        return target
    end
    for i=1,#kids do
        target = getkids(document,kids[i],target)
    end
    return target
end
347
-- The flattened Dests name tree of the catalog (nothing when absent).
function resolvers.destinations(document)
    local names = document.Catalog.Names
    local dests = names and names.Dests
    return getnames(document,dests)
end
352
-- The flattened JavaScript name tree of the catalog (nothing when absent).
function resolvers.javascripts(document)
    local names   = document.Catalog.Names
    local scripts = names and names.JavaScript
    return getnames(document,scripts)
end
357
-- The Fields entry of the catalog's AcroForm, when there is one.
function resolvers.widgets(document)
    local acroform = document.Catalog.AcroForm
    return acroform and acroform.Fields
end
362
-- The flattened EmbeddedFiles name tree of the catalog (nothing when absent).
function resolvers.embeddedfiles(document)
    local names = document.Catalog.Names
    local files = names and names.EmbeddedFiles
    return getnames(document,files)
end
367
368-- /OCProperties <<
369--     /OCGs [ 15 0 R 17 0 R 19 0 R 21 0 R 23 0 R 25 0 R 27 0 R ]
370--     /D <<
371--         /Order [ 15 0 R 17 0 R 19 0 R 21 0 R 23 0 R 25 0 R 27 0 R ]
372--         /ON    [ 15 0 R 17 0 R 19 0 R 21 0 R 23 0 R 25 0 R 27 0 R ]
373--         /OFF   [ ]
374--     >>
375-- >>
376
-- List the Name of every optional content group found in the OCGs array of
-- the catalog's OCProperties. Gives nothing when there are no such
-- properties or no OCGs entry.
function resolvers.layers(document)
    local properties = document.Catalog.OCProperties
    local layers     = properties and properties.OCGs
    if not layers then
        return
    end
    local names = { }
    for i=1,#layers do
        names[i] = layers[i].Name
    end
 -- names.n = n
    return names
end
392
-- The structure tree root as found in the catalog (this might become a tree).
function resolvers.structure(document)
    local catalog = document.Catalog
    return catalog.StructTreeRoot
end
397
-- Build the page list of a document. Each page becomes a wrapped dictionary
-- that gets its page number and the five page boxes (as given by getbox)
-- added. Pages are also registered in the reference cache and in the reverse
-- xref table. Missing page data is reported and leaves a gap in the list.
function resolvers.pages(document)
    local data     = document.__data__
    local xrefs    = document.__xrefs__
    local cache    = document.__cache__
    local nofpages = document.nofpages
    local pages    = { }
    local rawpages = pagestotable(data)
    local boxnames = { "MediaBox", "CropBox", "BleedBox", "ArtBox", "TrimBox" }
    -- already set here, so the document has the field while pages are built
    document.pages = pages
    for pagenumber=1,nofpages do
        local rawpagedata = rawpages[pagenumber]
        if not rawpagedata then
            report_epdf("missing pagedata for page %i, case %i",pagenumber,2)
        else
            local pagereference = rawpagedata[3]
            local pageobject    = rawpagedata[1]
            local pagedata      = some_dictionary(pageobject,document)
            if pagedata and pageobject then
                pagedata.number = pagenumber
                for i=1,#boxnames do
                    local boxname = boxnames[i]
                    pagedata[boxname] = getbox(pageobject,boxname)
                end
                pages[pagenumber]    = pagedata
                xrefs[pagedata]      = pagereference
                cache[pagereference] = pagedata
            else
                report_epdf("missing pagedata for page %i, case %i",pagenumber,1)
            end
        end
    end
 -- pages.n = nofpages
    return pages
end
436
437local loaded    = { }
438local nofloaded = 0
439
-- Open a pdf document and wrap it in a document table.
--
--   filename      : name of the file, or the pdf data itself when 'fromstring'
--                   is set
--   userpassword  : optional, tried when the document reports a negative
--                   (encrypted) status
--   ownerpassword : optional, tried when the status is still negative
--   fromstring    : when set, 'filename' holds the data and pdfnew is used
--
-- Returns the document table, or nil when the document cannot be opened or
-- cannot be decrypted. Successfully loaded documents are remembered by
-- filename, so a second request returns the same table. The nofloaded counter
-- is incremented on every successful call, also for remembered documents.

function lpdf_epdf.load(filename,userpassword,ownerpassword,fromstring)
    local document = loaded[filename]
    if not document then
        statistics.starttiming(lpdf_epdf)
        local __data__
        if fromstring then
            __data__ = pdfnew(filename,#filename)
        elseif pdfopenfile then
            -- io.open returns nil when the file cannot be opened; we then
            -- treat the document as not loadable instead of passing nil on
            local handle = ioopen(filename,"rb")
            if handle then
                __data__ = pdfopenfile(handle)
            end
        else
            __data__ = pdfopen(filename)
        end
        if __data__ then
            if userpassword and getstatus(__data__) < 0 then
                unencrypt(__data__,userpassword,nil)
            end
            if ownerpassword and getstatus(__data__) < 0 then
                unencrypt(__data__,nil,ownerpassword)
            end
            if getstatus(__data__) < 0 then
                report_epdf("the document is encrypted, provide proper passwords",getstatus(__data__))
                __data__ = false
            end
            if __data__ then
                document = {
                    filename   = filename,
                    nofcopied  = 0,
                    copied     = { },
                    __cache__  = { },
                    __xrefs__  = { },
                    __fonts__  = { },
                    __copied__ = { },
                    __data__   = __data__,
                }
                document.Catalog = some_dictionary(getcatalog(__data__),document)
                document.Info    = some_dictionary(getinfo(__data__),document)
                document.Trailer = some_dictionary(gettrailer(__data__),document)
                -- other fields (pages, destinations, ...) are resolved on demand
                setmetatableindex(document,resolve)
                --
                document.majorversion, document.minorversion = getversion(__data__)
                --
                document.nofpages = getnofpages(__data__)
            else
                document = false
            end
        else
            document = false
        end
        loaded[filename] = document
        loaded[document] = document
        statistics.stoptiming(lpdf_epdf)
     -- print(statistics.elapsedtime(lpdf_epdf))
    end
    if document then
        nofloaded = nofloaded + 1
    end
    return document or nil
end
500
-- Forget a loaded document and close its pdfe handle. The argument is either
-- the filename or a document table (its filename field is used then).
-- Unknown names and other argument types are silently ignored.
function lpdf_epdf.unload(filename)
    local name = filename
    if type(name) == "table" then
        name = name.filename
    end
    if type(name) ~= "string" then
        return
    end
    local document = loaded[name]
    if not document then
        return
    end
    loaded[document] = nil
    loaded[name]     = nil
    pdfclose(document.__data__)
end
514
515-- for k, v in expanded(t) do
516
-- Iterator factory: walks the keys of t.__raw__ but hands out t[key], so that
-- a lazy wrapper resolves each value. The traversal stops at the end of the
-- raw table or at the first raw value that is false.
local function expanded(t)
    local function step(raw,previous)
        local key, rawvalue = next(raw,previous)
        if not rawvalue then
            return
        end
        return key, t[key]
    end
    return step, t.__raw__, nil
end
526
527---------.expand   = expand
528lpdf_epdf.expanded = expanded
529
530-- we could resolve the text stream in one pass if we directly handle the
531-- font but why should we complicate things
532
-- Content stream parser. Every operation becomes a table holding the tagged
-- operands followed by the operator string. Operands are tagged tables:
-- { "number", n }, { "name", s }, { "array", { ... } }, { "dict", { ... } },
-- { "hex", s } and { "dec", s }.

local spaces    = lpegpatterns.whitespace^1
local optspaces = lpegpatterns.whitespace^0
local comment   = P("%") * (1 - lpegpatterns.newline)^0

-- a backslash followed by three digits becomes the character with that octal
-- code, any other escaped character is kept as it is (with its backslash)

local numchar   = (P("\\")/"") * (R("09")^3 / function(s) return char(tonumber(s,8)) end)
                + (P("\\") * P(1))

local key       = P("/") * C(R("AZ","az","09","__")^1)
local number    = Ct(Cc("number") * (lpegpatterns.number/tonumber))
local keyword   = Ct(Cc("name") * key)
local operator  = C((R("AZ","az") + P("*") + P("'") + P('"'))^1)

local grammar   = P {
    "start",
    start      = (
                    comment
                  + keyword
                  + number
                  + V("dictionary")
                  + V("array")
                  + V("hexstring")
                  + V("decstring")
                  + spaces
                 )^1,
    keyvalue   = key * optspaces * V("start"),
    array      = Ct(Cc("array") * P("[")  * Ct(V("start")^1)               * P("]")),
    dictionary = Ct(Cc("dict")  * P("<<") * Ct(V("keyvalue")^1)            * P(">>")),
    hexstring  = Ct(Cc("hex")   * P("<")  * Cs((1 - P(">"))^1)             * P(">")),
    decstring  = Ct(Cc("dec")   * P("(")  * Cs(((numchar + 1) - P(")"))^1) * P(")")), -- untested
}

-- anything that is not part of an operation is skipped character by character

local operation = Ct(grammar^1 * operator)
local parser    = Ct((operation + P(1))^1)
554
555-- todo: speed this one up
556
-- Content stream stripper: a non capturing variant of the grammar above. The
-- 'skip' rule replaces two things by nothing: a marked content start (a name
-- followed by a property dictionary without /ActualText key and one of the
-- BDC, BMC, DP or MP operators) and the EMC operator. All other input is
-- copied unchanged.

local numchar   = P("\\") * (R("09")^3 + P(1))
local number    = lpegpatterns.number
local keyword   = P("/") * R("AZ","az","09","__")^1
local operator  = (R("AZ","az") + P("*") + P("'") + P('"'))^1

local skipstart = P("BDC") + P("BMC") + P("DP") + P("MP")
local skipstop  = P("EMC")
local skipkeep  = P("/ActualText")

local grammar   = P {
    "skip",
    start      = keyword
               + number
               + V("dictionary")
               + V("array")
               + V("hexstring")
               + V("decstring")
               + spaces,
    keyvalue   = optspaces * (keyword * optspaces * V("start") * optspaces)^1,
    xeyvalue   = optspaces * ((keyword - skipkeep) * optspaces * V("start") * optspaces)^1,
    array      = P("[")  * V("start")^0             * P("]"),
    dictionary = P("<<") * V("keyvalue")^0          * P(">>"),
    xictionary = P("<<") * V("xeyvalue")^0          * P(">>"),
    hexstring  = P("<")  * (1 - P(">"))^0           * P(">"),
    decstring  = P("(")  * ((numchar + 1) - P(")"))^0 * P(")"),
    skip       = ((optspaces * ((keyword * optspaces * V("xictionary") * optspaces * skipstart) + skipstop)) / "")
               + V("start")
               + operator
}

local stripper = Cs((grammar + P(1))^1)
581
-- Parse a content stream string into a list of operations (see the parser
-- pattern); gives nil when nothing matches.
function lpdf_epdf.parsecontent(str)
    local operations = lpegmatch(parser,str)
    return operations
end
585
-- Run the stripper over a content stream. Streams that nowhere contain the
-- text "EMC" are returned untouched, without running the pattern.
function lpdf_epdf.stripcontent(str)
    if not find(str,"EMC") then
        return str
    end
    return lpegmatch(stripper,str)
end
593
594-- beginbfrange : <start> <stop> <firstcode>
595--                <start> <stop> [ <firstsequence> <firstsequence> <firstsequence> ]
596-- beginbfchar  : <code> <newcodes>
597
598local fromsixteen = lpdf.fromsixteen -- maybe inline the lpeg ... but not worth it
599
-- Handler for one bfchar entry: 'a' is the character code in hexadecimal, 'b'
-- the replacement that gets converted by fromsixteen.
local function f_bfchar(t,a,b)
    local code = tonumber(a,16)
    t[code] = fromsixteen(b)
end
603
-- Handler for a bfrange entry of the form <start> <stop> <firstcode>. Not
-- implemented yet: it only prints its arguments and stores nothing in 't'.
--
-- todo: c is a string here, fill t for the whole range
local function f_bfrange_1(t,a,b,c)
    local first, last, target = a, b, c
    print("todo 1",first,last,target)
end
609
-- Handler for a bfrange entry of the form <start> <stop> [ <seq> ... ]. Not
-- implemented yet: it only prints its arguments and stores nothing in 't'.
--
-- todo: c is a table here, fill t for the whole range
local function f_bfrange_2(t,a,b,c)
    local first, last, targets = a, b, c
    print("todo 2",first,last,targets)
end
615
-- Scanner for the bfchar and bfrange sections of a ToUnicode stream. The
-- table passed as extra argument to lpeg.match is handed to the handlers and
-- is also the final result of the match. Everything outside the two kinds of
-- sections is skipped.

local optionals   = spaces^0
local hexstring   = optionals * P("<") * C((1 - P(">"))^1) * P(">")

local bfchar      = (Carg(1) * hexstring * hexstring) / f_bfchar

local bfrange     = (Carg(1) * hexstring * hexstring * hexstring) / f_bfrange_1
                  + (Carg(1) * hexstring * hexstring * optionals * P("[") * Ct(hexstring^1) * optionals * P("]")) / f_bfrange_2

local fromunicode = (
      (P("beginbfchar" ) * bfchar ^1 * optionals * P("endbfchar" ))
    + (P("beginbfrange") * bfrange^1 * optionals * P("endbfrange"))
    + spaces
    + P(1)
)^1 * Carg(1)
627
-- Register the fonts of a resource dictionary in document.__fonts__ (unfinished,
-- see mtx-pdf for better code). For each font id not seen before an entry with
-- a 'tounicode' table is created; that table is filled from the font's
-- ToUnicode stream when there is one and is empty otherwise. Returns the
-- document wide font table.

local function analyzefonts(document,resources)
    local fonts    = document.__fonts__
    local fontlist = resources and resources.Font
    if fontlist then
        for id, data in expanded(fontlist) do
            if not fonts[id] then
                -- a quick hack ... I will look into it more detail if I find
                -- a real application for it
                --
                -- Fonts don't need to have a ToUnicode entry, so we check for
                -- it before we call it (calling a stream wrapper reads the
                -- uncompressed stream); calling it unconditionally raised an
                -- error for fonts without one.
                local tounicode = data and data.ToUnicode
                if tounicode then
                    tounicode = tounicode()
                end
                if tounicode then
                    tounicode = lpegmatch(fromunicode,tounicode,1,{})
                end
                fonts[id] = {
                    tounicode = type(tounicode) == "table" and tounicode or { }
                }
                setmetatableindex(fonts[id],"self")
            end
        end
    end
    return fonts
end
651
652lpdf_epdf.analyzefonts = analyzefonts
653
-- State shared by the two converters below: 'more' holds a pending high
-- surrogate (or zero), 'unic' the tounicode table of the current font.

local more = 0
local unic = nil -- cheaper than passing each time as Carg(1)

-- Converts one group of four hex digits. A high surrogate is remembered and
-- combined with the value that follows; the 0x10000 offset is part of the
-- standard utf16 surrogate pair calculation.

local p_hex_to_utf = C(4) / function(s) -- needs checking !
    local now = tonumber(s,16)
    if more > 0 then
        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
        more = 0
        return unic[now] or utfchar(now)
    elseif now >= 0xD800 and now <= 0xDBFF then
        more = now
        -- We have to return an empty string here: when a function capture
        -- inside a substitution capture gives no value, the matched text (the
        -- four hex digits) is kept in the result.
        return ""
    else
        return unic[now] or utfchar(now)
    end
end

-- Converts one byte of a literal string.

local p_dec_to_utf = C(1) / function(s) -- needs checking !
    local now = byte(s)
    return unic[now] or utfchar(now)
end

-- The final patterns reset the surrogate state first and then substitute all
-- groups c.q. bytes.

local p_hex_to_utf = P(true) / function() more = 0 end * Cs(p_hex_to_utf^1)
local p_dec_to_utf = P(true) / function() more = 0 end * Cs(p_dec_to_utf^1)
678
-- Parse the content stream of a page and convert the strings shown by the
-- text operators using the tounicode table of the font selected by Tf.
--
-- Returns the parsed list of operations (an empty list when nothing could be
-- parsed) or nothing when the page does not exist. In the result:
--
--   * the array of a TJ operation holds strings and, for all other items,
--     their bare value (e.g. the kern)
--   * the string operand of Tj, ' and " is replaced by the converted string
--
-- This is the shape that contenttotext and getstructure expect.

function lpdf_epdf.getpagecontent(document,pagenumber)

    local page = document.pages[pagenumber]

    if not page then
        return
    end

    local fonts    = analyzefonts(document,page.Resources)

    -- a page can come without content, so we only call the entry when present
    local contents = page.Contents
    local content  = contents and contents() or ""
    local list     = lpegmatch(parser,content)

    if not list then
        return { }
    end

    -- strings can show up before any font is selected, so we start out with
    -- an empty mapping instead of nil
    unic = { }

    -- Converts one tagged operand. The parser tags literal strings as "dec";
    -- the "string" tag that was tested for originally is never produced, so
    -- we accept both. When a conversion fails we fall back on the unconverted
    -- value, so that no holes end up in the lists.
    local function converted(item)
        local kind  = item[1]
        local value = item[2]
        if kind == "hex" then
            return lpegmatch(p_hex_to_utf,value) or value
        elseif kind == "dec" or kind == "string" then
            return lpegmatch(p_dec_to_utf,value) or value
        else
            return value -- kern
        end
    end

    for i=1,#list do
        local entry    = list[i]
        local size     = #entry
        local operator = entry[size]
        if operator == "Tf" then
            local name = entry[1]
            local font = type(name) == "table" and fonts[name[2]] or nil
            unic = font and font.tounicode or { }
        elseif operator == "TJ" then
            local data  = entry[1] -- { "array", { ... } }
            local items = type(data) == "table" and data[2] -- { { ... }, { ... } }
            if type(items) == "table" then
                for j=1,#items do
                    local item = items[j]
                    if type(item) == "table" then
                        items[j] = converted(item)
                    end
                end
            end
        elseif operator == "Tj" or operator == "'" or operator == '"' then
            -- { string,  Tj } { string, ' } { n, m, string, " }
            --
            -- The original code indexed the string itself and referred to an
            -- undefined variable, so these strings were never converted.
            local data = entry[size-1]
            if type(data) == "table" then
                entry[size-1] = converted(data)
            end
        end
    end

    unic = nil -- can be collected

    return list

end
737
-- This is also an experiment. When I really need it I can improve it, for instance
-- with proper position calculation. It might be useful for some search or so.
740
-- A pattern that matches a soft hyphen at the end of a string, and the factor
-- that, multiplied by the current font size, gives the vertical distance
-- above which we assume a new line.

local softhyphen = utfchar(0xAD) .. "$"
local linefactor = 1.3

-- Turn a list as returned by getpagecontent into plain text.
--
--   * strings of TJ and Tj operations are collected in order
--   * a number below -50 in a TJ array becomes a space
--   * a cm or Tm operation whose sixth operand differs more than
--     linefactor times the last font size from the previous one starts a new
--     line, unless the text collected so far ends with a soft hyphen
--
-- Returns the collected text as one string.

function lpdf_epdf.contenttotext(document,list) -- maybe signal fonts
    local last_y = 0
    local last_f = 0
    local text   = { }
    local last   = 0

    for i=1,#list do
        local entry    = list[i]
        local size     = #entry
        local operator = entry[size]
        if operator == "Tf" then
            local fontsize = entry[2]
            if type(fontsize) == "table" then
                last_f = fontsize[2] -- size
            end
        elseif operator == "TJ" then
            local data  = entry[1] -- { "array", { ... } }
            local items = type(data) == "table" and data[2] -- { ..., ... }
            if type(items) == "table" then
                for j=1,#items do
                    local li   = items[j]
                    local kind = type(li)
                    if kind == "string" then
                        last = last + 1
                        text[last] = li
                    elseif kind == "number" and li < -50 then
                        last = last + 1
                        text[last] = " "
                    end
                end
            end
        elseif operator == "Tj" then
            -- We only bump the counter when we really store something: the
            -- additional increment that used to be done here left holes in
            -- the list, which makes concat fail.
            local li = entry[size-1]
            if type(li) == "string" then
                last = last + 1
                text[last] = li
            end
        elseif operator == "cm" or operator == "Tm" then
            local operand = entry[6]
            if type(operand) == "table" then
                local ty = operand[2]
                local dy = abs(last_y - ty)
                if dy > linefactor*last_f then
                    if last > 0 then
                        -- No plain search here: in a plain search the $ is
                        -- taken literally and then we never match.
                        if find(text[last],softhyphen) then
                            -- ignore
                        else
                            last = last + 1
                            text[last] = "\n"
                        end
                    end
                end
                last_y = ty
            end
        end
    end

    return concat(text)
end
798
-- Report the marked content structure of a list as returned by getpagecontent
-- (just a test). BDC increases and EMC decreases the nesting depth; the
-- strings of TJ and Tj operations are reported at the current depth.

function lpdf_epdf.getstructure(document,list) -- just a test
    local depth = 0
    for i=1,#list do
        local entry    = list[i]
        local size     = #entry
        local operator = entry[size]
        if operator == "BDC" then
            report_epdf("%w%s : %s",depth,entry[1] or "?",entry[2] and entry[2].MCID or "?")
            depth = depth + 1
        elseif operator == "EMC" then
            depth = depth - 1
        elseif operator == "TJ" then
            -- The operand is { "array", { ... } }, so the items live in the
            -- second slot; looping over the operand itself reported the tag
            -- and then compared a table with a number.
            local data  = entry[1]
            local items = type(data) == "table" and data[2]
            if type(items) == "table" then
                for j=1,#items do
                    local li   = items[j]
                    local kind = type(li)
                    if kind == "string" then
                        report_epdf("%w > %s",depth,li)
                    elseif kind == "number" and li < -50 then
                        report_epdf("%w >",depth,li)
                    end
                end
            end
        elseif operator == "Tj" then
            report_epdf("%w > %s",depth,entry[size-1])
        end
    end
end
825
826if images then do
827
    -- This can be made a bit faster (just get raw data and pass it) but I will
    -- do that later. In the end the benefit is probably negligible.
830
831    local recompress           = false
832    local stripmarked          = false
833
834    local copydictionary       = nil
835    local copyarray            = nil
836
837    local pdfreserveobject     = lpdf.reserveobject
838    local shareobjectreference = lpdf.shareobjectreference
839    local pdfflushobject       = lpdf.flushobject
840    local pdfflushstreamobject = lpdf.flushstreamobject
841    local pdfreference         = lpdf.reference
842    local pdfconstant          = lpdf.constant
843    local pdfarray             = lpdf.array
844    local pdfdictionary        = lpdf.dictionary
845    local pdfnull              = lpdf.null
846    local pdfliteral           = lpdf.literal
847
848    local report               = logs.reporter("backend","xobjects")
849
850    local factor               = 65536 / (7200/7227) -- 1/number.dimenfactors.bp
851
852    local createimage          = images.create
853
854    directives.register("graphics.pdf.recompress",  function(v) recompress  = v end)
855    directives.register("graphics.pdf.stripmarked", function(v) stripmarked = v end)
856
857    local function scaledbbox(b)
858        return { b[1]*factor, b[2]*factor, b[3]*factor, b[4]*factor }
859    end
860
861    local codecs = {
862        ASCIIHexDecode  = true,
863        ASCII85Decode   = true,
864        RunLengthDecode = true,
865        FlateDecode     = true,
866        LZWDecode       = true,
867    }
868
869    local function deepcopyobject(xref,copied,value)
870        -- no need for tables, just nested loop with obj
871        local objnum = xref[value]
872        if objnum then
873            local usednum = copied[objnum]
874            if usednum then
875             -- report("%s object %i is reused",kind,objnum)
876            else
877                usednum = pdfreserveobject()
878                copied[objnum] = usednum
879                local entry = value
880                local kind  = entry.__type__
881                if kind == array_object_code then
882                    local a = copyarray(xref,copied,entry)
883                    pdfflushobject(usednum,tostring(a))
884                elseif kind == dictionary_object_code then
885                    local d = copydictionary(xref,copied,entry)
886                    pdfflushobject(usednum,tostring(d))
887                elseif kind == stream_object_code then
888                    local d = copydictionary(xref,copied,entry)
889                    local filter = d.Filter
890                    if filter and codecs[filter] and recompress then
891                        -- recompress
892                        d.Filter      = nil
893                        d.Length      = nil
894                        d.DecodeParms = nil -- relates to filter
895                        d.DL          = nil -- needed?
896                        local s = entry()                        -- get uncompressed stream
897                        pdfflushstreamobject(s,d,true,usednum)   -- compress stream
898                    else
899                        -- keep as-is, even Length which indicates the
900                        -- decompressed length
901                        local s = entry(false)                        -- get compressed stream
902                     -- pdfflushstreamobject(s,d,false,usednum,true)  -- don't compress stream
903                        pdfflushstreamobject(s,d,"raw",usednum)       -- don't compress stream
904                    end
905                else
906                    local t = type(value)
907                    if t == "string" then
908                        value = pdfconstant(value)
909                    elseif t == "table" then
910                        local kind  = value[1]
911                        local entry = value[2]
912                        if kind == name_object_code then
913                            value = pdfconstant(entry)
914                        elseif kind == string_object_code then
915                            value = pdfliteral(entry,value[3])
916                        elseif kind == null_object_code then
917                            value = pdfnull()
918                        elseif kind == reference_object_code then
919                            value = deepcopyobject(xref,copied,entry)
920                        elseif entry == nil then
921                            value = pdfnull()
922                        else
923                            value = tostring(entry)
924                        end
925                    end
926                    pdfflushobject(usednum,value)
927                end
928            end
929            return pdfreference(usednum)
930        elseif kind == stream_object_code then
931            report("stream not done: %s", objectcodes[kind] or "?")
932        else
933            report("object not done: %s", objectcodes[kind] or "?")
934        end
935    end
936
937    local function copyobject(xref,copied,object,key,value)
938        if not value then
939            value = object.__raw__[key]
940        end
941        local t = type(value)
942        if t == "string" then
943            return pdfconstant(value)
944        elseif t ~= "table" then
945            return value
946        end
947        local kind = value[1]
948        if kind == name_object_code then
949            return pdfconstant(value[2])
950        elseif kind == string_object_code then
951            return pdfliteral(value[2],value[3])
952        elseif kind == array_object_code then
953            return copyarray(xref,copied,object[key])
954        elseif kind == dictionary_object_code then
955            return copydictionary(xref,copied,object[key])
956        elseif kind == null_object_code then
957            return pdfnull()
958        elseif kind == reference_object_code then
959            -- expand
960            return deepcopyobject(xref,copied,object[key])
961        else
962            report("weird: %s", objecttypes[kind] or "?")
963        end
964    end
965
966    copyarray = function (xref,copied,object)
967        local target = pdfarray()
968        local source = object.__raw__
969        for i=1,#source do
970            target[i] = copyobject(xref,copied,object,i,source[i])
971        end
972        return target
973    end
974
    -- Optional table of per key handlers that copydictionary consults. It is
    -- set by copypage (from lpdf_epdf.plugin) while a page is being copied in
    -- compact mode and reset to nil afterwards.
    local plugins = nil
976
    -- Sorting the hash slows things down by up to 5% but it is still as fast
    -- as the C code. We could loop over the index instead but sorting might be
    -- nicer in the end.
980
981    copydictionary = function (xref,copied,object)
982        local target = pdfdictionary()
983        local source = object.__raw__
984     -- for key, value in next, source do
985        for key, value in sortedhash(source) do
986            if plugins then
987                local p = plugins[key]
988                if p then
989                    target[key] = p(xref,copied,object,key,value,copyobject) -- maybe a table of methods
990                else
991                    target[key] = copyobject(xref,copied,object,key,value)
992                end
993            else
994                target[key] = copyobject(xref,copied,object,key,value)
995            end
996        end
997        return target
998    end
999
1000 -- local function copyresources(pdfdoc,xref,copied,pagedata)
1001 --     local Resources = pagedata.Resources
1002 --     if Resources then
1003 --         local r = pdfreserveobject()
1004 --         local d = copydictionary(xref,copied,Resources)
1005 --         pdfflushobject(r,tostring(d))
1006 --         return pdfreference(r)
1007 --     end
1008 -- end
1009
1010    local function copyresources(pdfdoc,xref,copied,pagedata)
1011        local Resources = pagedata.Resources
1012     --
1013     -- -- This needs testing:
1014     --
1015     -- if not Resources then
1016     --     local Parent = page.Parent
1017     --     while (Parent and (Parent.__type__ == dictionary_object_code or Parent.__type__ == reference_object_code) do
1018     --         Resources = Parent.Resources
1019     --         if Resources then
1020     --             break
1021     --         end
1022     --         Parent = Parent.Parent
1023     --     end
1024     -- end
1025        if Resources then
1026            local d = copydictionary(xref,copied,Resources)
1027            return shareobjectreference(d)
1028        end
1029    end
1030
    -- Local aliases for the loader and unloader; both also end up in the
    -- lpdf_epdf.image table as open and close.
    local openpdf  = lpdf_epdf.load
    local closepdf = lpdf_epdf.unload

    -- todo: keep track of already open files

    -- Same as openpdf but with `true` as fourth argument; all results of the
    -- loader are passed on. What that flag selects is decided by
    -- lpdf_epdf.load, which is defined outside this part of the file.
    local function newpdf(str,userpassword,ownerpassword)
        return openpdf(str,userpassword,ownerpassword,true)
    end
1039
    -- Maps the size names that querypdf accepts to the key under which the
    -- matching box is looked up in a page.
    local sizes = {
        crop  = "CropBox",
        media = "MediaBox",
        bleed = "BleedBox",
        art   = "ArtBox",
        trim  = "TrimBox",
    }
1047
1048    local function querypdf(pdfdoc,pagenumber,size)
1049        if pdfdoc then
1050            if not pagenumber then
1051                pagenumber = 1
1052            end
1053            local root = pdfdoc.Catalog
1054            local page = pdfdoc.pages[pagenumber]
1055            if page then
1056                local sizetag  = sizes[size or "crop"] or sizes.crop
1057                local mediabox = page.MediaBox or { 0, 0, 0, 0 }
1058                local cropbox  = page[sizetag] or mediabox
1059                return {
1060                    filename    = pdfdoc.filename,
1061                    pagenumber  = pagenumber,
1062                    nofpages    = pdfdoc.nofpages,
1063                    boundingbox = scaledbbox(cropbox),
1064                    cropbox     = cropbox,
1065                    mediabox    = mediabox,
1066                    bleedbox    = page.BleedBox or cropbox,
1067                    trimbox     = page.TrimBox or cropbox,
1068                    artbox      = page.ArtBox or cropbox,
1069                    rotation    = page.Rotate or 0,
1070                    xsize       = cropbox[3] - cropbox[1],
1071                    ysize       = cropbox[4] - cropbox[2],
1072                }
1073            end
1074        end
1075    end
1076
1077    local function copypage(pdfdoc,pagenumber,attributes,compact,width,height,attr)
1078        if pdfdoc then
1079            local root     = pdfdoc.Catalog
1080            local page     = pdfdoc.pages[pagenumber or 1]
1081            local pageinfo = querypdf(pdfdoc,pagenumber)
1082            local contents = page.Contents
1083            if contents then
1084                local xref     = pdfdoc.__xrefs__
1085                local copied   = pdfdoc.__copied__
1086                if compact and lpdf_epdf.plugin then
1087                    plugins = lpdf_epdf.plugin(pdfdoc,xref,copied,page)
1088                end
1089                local xobject = pdfdictionary {
1090                    Type           = pdfconstant("XObject"),
1091                    Subtype        = pdfconstant("Form"),
1092                    FormType       = 1,
1093                    Group          = copyobject(xref,copied,page,"Group"),
1094                    LastModified   = copyobject(xref,copied,page,"LastModified"),
1095                    Metadata       = copyobject(xref,copied,page,"Metadata"),
1096                    PieceInfo      = copyobject(xref,copied,page,"PieceInfo"),
1097                    Resources      = copyresources(pdfdoc,xref,copied,page),
1098                    SeparationInfo = copyobject(xref,copied,page,"SeparationInfo"),
1099                } + attr
1100                if attributes then
1101                    for k, v in expanded(attributes) do
1102                        page[k] = v -- maybe nested
1103                    end
1104                end
1105                local content  = ""
1106                local nolength = nil
1107                local ctype    = contents.__type__
1108                -- we always recompress because image object streams can not be
1109                -- influenced (yet)
1110                if ctype == stream_object_code then
1111                    if stripmarked then
1112                        content = contents() -- uncompressed
1113                        local stripped = lpdf_epdf.stripcontent(content)
1114                        if stripped ~= content then
1115                         -- report("%i bytes stripped on page %i",#content-#stripped,pagenumber or 1)
1116                            content = stripped
1117                        end
1118                    elseif recompress then
1119                        content = contents() -- uncompressed
1120                    else
1121                        local Filter = copyobject(xref,copied,contents,"Filter")
1122                        local Length = copyobject(xref,copied,contents,"Length")
1123                        if Length and Filter then
1124                            nolength = true
1125                            xobject.Length = Length
1126                            xobject.Filter = Filter
1127                            content = contents(false) -- uncompressed
1128                        else
1129                            content = contents() -- uncompressed
1130                        end
1131                    end
1132                elseif ctype == array_object_code then
1133                    content = { }
1134                    for i=1,#contents do
1135                        content[i] = contents[i]() -- uncompressed
1136                    end
1137                    content = concat(content," ")
1138                end
1139                -- still not nice: we double wrap now
1140                plugins = nil
1141                local rotation    = pageinfo.rotation
1142                local boundingbox = pageinfo.boundingbox
1143                local transform   = nil
1144                if rotation == 90 then
1145                    transform = 3
1146                elseif rotation == 180 then
1147                    transform = 2
1148                elseif rotation == 270 then
1149                    transform = 1
1150                elseif rotation > 1 and rotation < 4 then
1151                    transform = rotation
1152                end
1153                xobject.BBox = pdfarray {
1154                    boundingbox[1] * bpfactor,
1155                    boundingbox[2] * bpfactor,
1156                    boundingbox[3] * bpfactor,
1157                    boundingbox[4] * bpfactor,
1158                }
1159                -- maybe like bitmaps
1160                return createimage { -- beware: can be a img.new or a dummy
1161                    bbox      = boundingbox,
1162                    transform = transform,
1163                    nolength  = nolength,
1164                    nobbox    = true,
1165                    notype    = true,
1166                    stream    = content, -- todo: no compress, pass directly also length, filter etc
1167                    attr      = xobject(),
1168                    kind      = images.types.stream,
1169                }
1170            else
1171                -- maybe report an error
1172            end
1173        end
1174    end
1175
    -- The functions defined above, exported as one table: open and close are
    -- lpdf_epdf.load and lpdf_epdf.unload, new is the loader called with an
    -- extra `true`, query returns the page properties and copy builds the
    -- image for a page.
    lpdf_epdf.image = {
        open  = openpdf,
        close = closepdf,
        new   = newpdf,
        query = querypdf,
        copy  = copypage,
    }
1183
1184--     lpdf.injectors.pdf = function(specification)
1185--         local d = lpdf_epdf.load(specification.filename)
1186--         print(d)
1187--     end
1188
1189
1190end end
1191
1192-- local d = lpdf_epdf.load("e:/tmp/oeps.pdf")
1193-- inspect(d)
1194-- inspect(d.Catalog.Lang)
1195-- inspect(d.Catalog.OCProperties.D.AS[1].Event)
1196-- inspect(d.Catalog.Metadata())
1197-- inspect(d.Catalog.Pages.Kids[1])
1198-- inspect(d.layers)
1199-- inspect(d.pages)
1200-- inspect(d.destinations)
1201-- inspect(lpdf_epdf.getpagecontent(d,1))
1202-- inspect(lpdf_epdf.contenttotext(document,lpdf_epdf.getpagecontent(d,1)))
1203-- inspect(lpdf_epdf.getstructure(document,lpdf_epdf.getpagecontent(d,1)))
1204