1if not modules then modules = { } end modules ['lpdf-epd'] = {
2 version = 1.001,
3 comment = "companion to lpdf-epa.mkiv",
4 author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5 copyright = "PRAGMA ADE / ConTeXt Development Team",
6 license = "see context related readme files",
7 history = "this one replaces the poppler/pdfe binding",
8}
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47local setmetatable, type, next = setmetatable, type, next
48local tostring, tonumber, unpack = tostring, tonumber, unpack
49local char, byte, find = string.char, string.byte, string.find
50local abs = math.abs
51local concat, swapped, sortedhash, sortedkeys = table.concat, table.swapped, table.sortedhash, table.sortedkeys
-- Code point to UTF-8 converter. This used to be an alias for string.char,
-- which only handles single bytes: it raises an error for values above 255
-- (for instance the combined surrogate pairs computed in p_hex_to_utf) and
-- yields a lone byte instead of a UTF-8 sequence for 128..255.
local utfchar = utf8.char
53local setmetatableindex = table.setmetatableindex
54local ioopen = io.open
55local octtointeger, dectointeger, hextointeger = string.octtointeger, string.dectointeger, string.hextointeger
56
57local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
58local P, C, S, R, Ct, Cc, V, Carg, Cs, Cf, Cg = lpeg.P, lpeg.C, lpeg.S, lpeg.R, lpeg.Ct, lpeg.Cc, lpeg.V, lpeg.Carg, lpeg.Cs, lpeg.Cf, lpeg.Cg
59
60if not lpdf then
61 require("lpdf-aux")
62end
63
64if not (number and number.dimenfactors) then
65 require("util-dim")
66end
67
68local pdfe = pdfe
69 lpdf = lpdf or { }
70local lpdf = lpdf
71local lpdf_epdf = { }
72 lpdf.epdf = lpdf_epdf
73
74local pdfopenfile = pdfe.openfile
75local pdfnew = pdfe.new
76local pdfclose = pdfe.close
77
78local getcatalog = pdfe.getcatalog
79local getpermissions = pdfe.getpermissions
80local getinfo = pdfe.getinfo
81local gettrailer = pdfe.gettrailer
82local getnofpages = pdfe.getnofpages
83local getversion = pdfe.getversion
84local getbox = pdfe.getbox
85local getstatus = pdfe.getstatus
86local unencrypt = pdfe.unencrypt
87local dictionarytotable = pdfe.dictionarytotable
88local arraytotable = pdfe.arraytotable
89local pagestotable = pdfe.pagestotable
90local readwholestream = pdfe.readwholestream
91local getfromreference = pdfe.getfromreference
92local getfromobject = pdfe.getfromobject
93local getobjectrange = pdfe.getobjectrange
94
95local report_epdf = logs.reporter("epdf")
96
97local allocate = utilities.storage.allocate
98
99local bpfactor <const> = number.dimenfactors.bp
100
-- Names of the object types. The position in this list is the numeric code,
-- counting from zero ("none" is 0, "null" is 1, ... "lpdf" is 11).
-- NOTE(review): presumably these are the codes that the pdfe library reports;
-- confirm before reordering anything here.
local objectcodes = { [0] =
    "none",
    "null",
    "bool",
    "integer",
    "number",
    "name",
    "string",
    "array",
    "dictionary",
    "stream",
    "reference",
    "lpdf",
}

-- Names for the status values; the negative ones are what lpdf_epdf.load
-- treats as "still encrypted" (it tests getstatus(...) < 0).
local encryptioncodes = {
    [0]  = "notencrypted",
    [1]  = "unencrypted",
    [-1] = "protected",
    [-2] = "failure",
}

-- Both tables are replaced by their swapped-and-merged variants. The constants
-- below look codes up by name, so after this step name -> code access must
-- work; lpdf_epdf.objecttype relies on code -> name access.
objectcodes     = allocate(swapped(objectcodes,objectcodes))
encryptioncodes = allocate(swapped(encryptioncodes,encryptioncodes))

lpdf_epdf.objectcodes     = objectcodes
lpdf_epdf.encryptioncodes = encryptioncodes

-- Numeric codes as constants, used for the comparisons in the rest of the file.
local none_object_code       <const> = objectcodes.none
local null_object_code       <const> = objectcodes.null
local bool_object_code       <const> = objectcodes.bool
local integer_object_code    <const> = objectcodes.integer
local number_object_code     <const> = objectcodes.number
local name_object_code       <const> = objectcodes.name
local string_object_code     <const> = objectcodes.string
local array_object_code      <const> = objectcodes.array
local dictionary_object_code <const> = objectcodes.dictionary
local stream_object_code     <const> = objectcodes.stream
local reference_object_code  <const> = objectcodes.reference
local lpdf_object_code       <const> = objectcodes.lpdf
141
142local recompress = false
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
-- Forward declarations: these wrappers and get_value call each other, so the
-- names have to exist before the functions are assigned further down.
local some_dictionary
local some_array
local some_stream
local some_reference

-- String values are handed to lpdf.frombytes (see get_value, which passes the
-- second and third field of a raw string entry).
local some_string = lpdf.frombytes
164
-- Reports the type of a wrapped object by looking its __type__ field up in
-- objectcodes. Non-tables give no result at all; a table without a __type__
-- field gives back that (false-ish) field value.
function lpdf_epdf.objecttype(object)
    if type(object) ~= "table" then
        return
    end
    local code = object.__type__
    if code then
        return objectcodes[code]
    end
    return code
end
171
-- Resolves t[key] of a raw table into something usable:
--
--   * nothing when key, t or the entry is missing (or false)
--   * non-table entries are returned untouched
--   * name entries give their second field, string entries go through
--     some_string, arrays, dictionaries, streams and references get wrapped
--   * any other table is returned as it is
local function get_value(document,t,key)
    if key and t then
        local entry = t[key]
        if entry then
            if type(entry) == "table" then
                local code = entry[1]
                if code == name_object_code then
                    return entry[2]
                elseif code == string_object_code then
                    return some_string(entry[2],entry[3])
                elseif code == array_object_code then
                    return some_array(entry[2],document)
                elseif code == dictionary_object_code then
                    return some_dictionary(entry[2],document)
                elseif code == stream_object_code then
                    return some_stream(entry,entry[2],document)
                elseif code == reference_object_code then
                    return some_reference(entry,document)
                end
            end
            return entry
        end
    end
end
199
local checked_access
local get_flagged

if lpdf.dictionary then

    -- A backend is present, so wrapped objects can be turned into lpdf objects.

    local pdfdictionary = lpdf.dictionary
    local pdfarray      = lpdf.array
    local pdfconstant   = lpdf.constant
    local pdfreference  = lpdf.reference
    local pdfliteral    = lpdf.literal
    -- These two were called below without ever being defined, which made the
    -- null and string branches fail on a nil call.
    local pdfnull       = lpdf.null
    local pdfunicode    = lpdf.unicode -- NOTE(review): confirm that the backend provides this

    local copyarray      = nil
    local copydictionary = nil

    -- Converts one raw entry of a wrapped array or dictionary. When no value
    -- is passed it is fetched from object.__raw__[key]. Nested arrays and
    -- dictionaries are copied via the resolved object[key]; references become
    -- a pdfreference to the third field of the raw entry. Entries of any other
    -- kind give no result.
    local function copyobject(document,object,key,value)
        if not value then
            value = object.__raw__[key]
        end
        local t = type(value)
        if t == "string" then
            return pdfconstant(value)
        elseif t ~= "table" then
            return value
        end
        local kind = value[1]
        if kind == name_object_code then
            return pdfconstant(value[2])
        elseif kind == string_object_code then
            return pdfliteral(value[2],false)
        elseif kind == array_object_code then
            return copyarray(document,object[key])
        elseif kind == dictionary_object_code then
            return copydictionary(document,object[key])
        elseif kind == null_object_code then
            return pdfnull()
        elseif kind == reference_object_code then
            return pdfreference(value[3])
        else
            -- not handled
        end
    end

    -- Copies the entries of a wrapped array into a fresh pdfarray.
    copyarray = function(document,object)
        local target = pdfarray()
        local source = object.__raw__
        for i=1,#source do
            target[i] = copyobject(document,object,i,source[i])
        end
        return target
    end

    -- Copies the entries of a wrapped dictionary (visited via sortedhash)
    -- into a fresh pdfdictionary.
    copydictionary = function(document,object)
        local target = pdfdictionary()
        local source = object.__raw__
        for key, value in sortedhash(source) do
            target[key] = copyobject(document,object,key,value)
        end
        return target
    end

    -- Called by the __call metamethods of the wrappers: t is the wrapper and
    -- f its raw table. The result depends on t.__type__; streams are treated
    -- like dictionaries here.
    get_flagged = function(document,t,f)
        local kind = t.__type__
        if kind == name_object_code then
            return pdfconstant(f)
        elseif kind == array_object_code then
            return copyarray(document,t)
        elseif kind == dictionary_object_code then
            return copydictionary(document,t)
        elseif kind == stream_object_code then
            return copydictionary(document,t)
        elseif kind == string_object_code then
            return pdfunicode(f)
        elseif kind == null_object_code then
            return pdfnull()
        elseif kind == reference_object_code then
            return pdfreference(t[3])
        else
            return f
        end
    end

    -- Gives a textual "n 0 obj ... endobj" rendering of object n of a loaded
    -- document, or nothing when the document, number or object is missing.
    function lpdf_epdf.verboseobject(document,n)
        if document and n then
            local object = document.objects[n]
            if object then
                local t = { n .. " 0 obj" }
                if lpdf.epdf.objecttype(object) == "stream" then
                    t[#t+1] = object("dictionary")()
                    t[#t+1] = "stream"
                    t[#t+1] = tostring(object(true))
                    t[#t+1] = "endstream"
                else
                    t[#t+1] = tostring(object())
                end
                t[#t+1] = "endobj"
                return concat(t,"\n")
            end
        end
    end

else

    -- No backend, so there is nothing to convert to. This used to return
    -- t[k] with an undefined k, which always came down to nil.
    -- NOTE(review): handing back the raw table is the closest equivalent of
    -- the final branch above; confirm that this is what callers expect.
    get_flagged = function(document,t,f)
        return f
    end

end
309
-- Wraps a pdfe dictionary. The wrapper keeps the raw table in __raw__, resolves
-- keys lazily through get_value and can be called to get the flagged (backend)
-- variant. Returns the wrapper and the string "dictionary".
some_dictionary = function(d,document)
    local raw  = dictionarytotable(d,true)
    local meta = { }
    function meta.__index(_,k)
        return get_value(document,raw,k)
    end
    function meta.__call(self)
        return get_flagged(document,self,raw)
    end
    local wrapper = {
        __raw__  = raw,
        __type__ = dictionary_object_code,
    }
    return setmetatable(wrapper,meta), "dictionary"
end
322
-- Wraps a pdfe array. Besides __raw__ and __type__ the wrapper carries the
-- number of raw entries in n, which is also what the length operator reports.
-- Returns the wrapper and the string "array".
some_array = function(a,document)
    local raw   = arraytotable(a,true)
    local count = #raw
    local meta  = { }
    function meta.__index(_,k)
        return get_value(document,raw,k)
    end
    function meta.__call(self)
        return get_flagged(document,self,raw)
    end
    function meta.__len()
        return count
    end
    local wrapper = {
        __raw__  = raw,
        __type__ = array_object_code,
        n        = count,
    }
    return setmetatable(wrapper,meta), "array"
end
339
-- Wraps a pdfe stream s with dictionary d. Calling the wrapper gives:
--
--   wrapper("dictionary") : the flagged dictionary
--   wrapper(false)        : readwholestream(s,false)
--   wrapper(anything else): readwholestream(s,true)
--
-- Returns the wrapper and the string "stream".
some_stream = function(s,d,document)
    local raw  = dictionarytotable(d,true)
    local meta = { }
    function meta.__index(_,k)
        return get_value(document,raw,k)
    end
    function meta.__call(self,how)
        if how == "dictionary" then
            return get_flagged(document,self,raw)
        end
        return readwholestream(s,how ~= false)
    end
    local wrapper = {
        __raw__  = raw,
        __type__ = stream_object_code,
    }
    return setmetatable(wrapper,meta), "stream"
end
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
-- Resolves a raw reference entry r. The object number is r[3]; a resolved
-- object is remembered in document.__cache__ (number -> object) and in
-- document.__xrefs__ (object -> number). Containers get wrapped, anything
-- else is stored as the plain list of values that getfromreference returned.
some_reference = function(r,document)
    local objnum = r[3]
    local cache  = document.__cache__
    local found  = cache[objnum]
    if found then
        return found
    end
    local kind, object, b, c = getfromreference(r[2])
    if kind == dictionary_object_code then
        found = some_dictionary(object,document)
    elseif kind == array_object_code then
        found = some_array(object,document)
    elseif kind == stream_object_code then
        found = some_stream(object,b,document)
    else
        found = { kind, object, b, c }
    end
    cache[objnum] = found
    document.__xrefs__[found] = objnum
    return found
end
404
-- Fetches object number n from the document and wraps it when it is a stream,
-- array or dictionary; other kinds come back as a plain list holding the
-- values reported by getfromobject.
local function some_object(document,n)
    local kind, object, b, c = getfromobject(document.__data__,n)
    if kind == stream_object_code then
        return some_stream(object,b,document)
    elseif kind == array_object_code then
        return some_array(object,document)
    elseif kind == dictionary_object_code then
        return some_dictionary(object,document)
    end
    return { kind, object, b, c }
end
418
-- Functions that compute a document field on demand, indexed by field name.
local resolvers     = { }
lpdf_epdf.resolvers = resolvers

-- Index fallback for documents: when a resolver is registered for k, its
-- result is stored in the document under k and returned; otherwise there is
-- no result.
local function resolve(document,k)
    local resolver = resolvers[k]
    if not resolver then
        return
    end
    local entry = resolver(document)
    document[k] = entry
    return entry
end
430
431
432
433
434
435
436
437
438
-- Collects the name/value pairs below node n into target (created when
-- needed) and returns it. A node with a Names list contributes its pairs, any
-- other node is descended into via Kids. A name that is already present in
-- target gets a " [duplicate.N]" suffix, with N counted per node. When the
-- node has Limits that differ from its first and last name a message is
-- reported. Without a node there is no result.
local function getnames(document,n,target)
    if not n then
        return
    end
    local names = n.Names
    if names then
        target = target or { }
        local seen = { }
        for i=1,#names,2 do
            local name = lpdf.fromeight(names[i])
            if target[name] then
                local count = (seen[name] or 0) + 1
                seen[name] = count
                name = name .. " [duplicate." .. count .. "]"
            end
            target[name] = names[i+1]
        end
        local limits = n.Limits
        if limits then
            local first = limits[1]
            local last  = limits[2]
            local fname = names[1]
            local lname = names[#names-1]
            if fname ~= first or lname ~= last then
                report_epdf()
                report_epdf("check file %a, first value %a, first limit %a, last value %a, last limit %a",document.filename,fname,first,lname,last)
                report_epdf()
            end
        end
    else
        local kids = n.Kids
        if kids then
            for i=1,#kids do
                target = getnames(document,kids[i],target)
            end
        end
    end
    return target
end
481
-- Flattens a Kids tree: every node without a Kids entry is appended to target
-- (created when needed), nodes with Kids are descended into. Returns target,
-- or nothing when there is no node.
local function getkids(document,n,target)
    if not n then
        return
    end
    local kids = n.Kids
    if not kids then
        if target then
            target[#target+1] = n
        else
            target = { n }
        end
        return target
    end
    for i=1,#kids do
        target = getkids(document,kids[i],target)
    end
    return target
end
497
do

    -- Collects the name tree stored under the given key of Catalog.Names.
    local function nametree(document,key)
        local names = document.Catalog.Names
        return getnames(document,names and names[key])
    end

    -- Named destinations (Catalog.Names.Dests).
    function resolvers.destinations(document)
        return nametree(document,"Dests")
    end

    -- Named scripts (Catalog.Names.JavaScript).
    function resolvers.javascripts(document)
        return nametree(document,"JavaScript")
    end

    -- Form fields (Catalog.AcroForm.Fields), not a name tree.
    function resolvers.widgets(document)
        local acroform = document.Catalog.AcroForm
        return acroform and acroform.Fields
    end

    -- Embedded files (Catalog.Names.EmbeddedFiles).
    function resolvers.embeddedfiles(document)
        return nametree(document,"EmbeddedFiles")
    end

end
517
518
519
520
521
522
523
524
525
526
-- Lists the Name of every entry in Catalog.OCProperties.OCGs, in order. There
-- is no result when the document has no such list.
function resolvers.layers(document)
    local properties = document.Catalog.OCProperties
    local layers     = properties and properties.OCGs
    if not layers then
        return
    end
    local names = { }
    for i=1,#layers do
        names[i] = layers[i].Name
    end
    return names
end
542
-- The structure tree is handed over as it is found in the catalog.
function resolvers.structure(document)
    local catalog = document.Catalog
    return catalog.StructTreeRoot
end
547
-- Builds the list of page dictionaries. Each page wrapper gets its page number
-- and the five boxes as reported by getbox, and is registered in the cache and
-- xref tables of the document under the page reference.
--
-- A page without raw data (case 2) or without a page object (case 1) is
-- reported and skipped. The page object is now checked before it is wrapped:
-- previously the wrapper was made first, so a missing object was handed to
-- dictionarytotable before the check could report it.
function resolvers.pages(document)
    local __data__  = document.__data__
    local __xrefs__ = document.__xrefs__
    local __cache__ = document.__cache__

    local nofpages  = document.nofpages
    local pages     = { }
    local rawpages  = pagestotable(__data__)
    -- stored right away, before the list is filled
    document.pages  = pages

    for pagenumber=1,nofpages do
        local rawpagedata = rawpages and rawpages[pagenumber]
        if rawpagedata then
            local pagereference = rawpagedata[3]
            local pageobject    = rawpagedata[1]
            local pagedata      = pageobject and some_dictionary(pageobject,document)
            if pagedata then
                pagedata.number   = pagenumber
                pagedata.MediaBox = getbox(pageobject,"MediaBox")
                pagedata.CropBox  = getbox(pageobject,"CropBox")
                pagedata.BleedBox = getbox(pageobject,"BleedBox")
                pagedata.ArtBox   = getbox(pageobject,"ArtBox")
                pagedata.TrimBox  = getbox(pageobject,"TrimBox")
                pages[pagenumber] = pagedata
                __xrefs__[pagedata]      = pagereference
                __cache__[pagereference] = pagedata
            else
                report_epdf("missing pagedata for page %i, case %i",pagenumber,1)
            end
        else
            report_epdf("missing pagedata for page %i, case %i",pagenumber,2)
        end
    end

    return pages
end
586
-- Loaded documents, indexed by filename as well as by the document itself.
local loaded    = { }
-- Counts the successful load calls, cached hits included.
local nofloaded = 0

-- Opens a pdf file and returns a document table, or nil on failure.
--
--   filename      : path of the file, or the pdf data itself when fromstring
--                   is set
--   userpassword  : optional, tried first when the document is encrypted
--   ownerpassword : optional, tried when the document is still encrypted
--   fromstring    : when true, filename holds the data instead of a path
--
-- A successfully loaded document is cached under filename, so a second call
-- gives the same table. A failed load is stored as false and therefore tried
-- again on the next call.
function lpdf_epdf.load(filename,userpassword,ownerpassword,fromstring)
    local document = loaded[filename]
    if not document then
        statistics.starttiming(lpdf_epdf)
        local __data__
        if fromstring then
            __data__ = pdfnew(filename,#filename)
        else
            local f = ioopen(filename,"rb")
            __data__ = f and pdfopenfile(f)
        end
        if __data__ then
            -- a negative status means that the document is still encrypted
            if userpassword and getstatus(__data__) < 0 then
                unencrypt(__data__,userpassword,nil)
            end
            if ownerpassword and getstatus(__data__) < 0 then
                unencrypt(__data__,nil,ownerpassword)
            end
            if getstatus(__data__) < 0 then
                report_epdf("the document is encrypted, provide proper passwords")
                __data__ = false
            end
            if __data__ then
                local __cache__ = { }
                local __xrefs__ = { }
                document = {
                    filename   = filename,
                    nofcopied  = 0,
                    copied     = { },
                    __cache__  = __cache__,
                    __xrefs__  = __xrefs__,
                    __fonts__  = { },
                    __copied__ = { },
                    __data__   = __data__,
                }
                document.Catalog = some_dictionary(getcatalog(__data__),document)
                document.Info    = some_dictionary(getinfo(__data__),document)
                document.Trailer = some_dictionary(gettrailer(__data__),document)
                -- lowercase aliases
                document.catalog = document.Catalog
                document.info    = document.Info
                document.trailer = document.Trailer

                document.encrypted = document.Trailer.Encrypt and true or false
                local permissions = getpermissions(__data__)
                document.permissions = permissions and permissions >= 0 and lpdf.topermissions(permissions) or nil

                -- other fields (pages, destinations, ...) are resolved on demand
                setmetatableindex(document,resolve)

                document.majorversion, document.minorversion = getversion(__data__)

                document.nofpages = getnofpages(__data__)

                -- objects by number; a raw reference entry is accepted too
                document.objects = setmetatableindex(function(t,objnum)
                    local kind = type(objnum)
                    if kind == "table" and objnum[1] == reference_object_code then
                        objnum = objnum[3]
                        kind   = type(objnum)
                    end
                    if kind == "number" then
                        local cached = __cache__[objnum]
                        if not cached then
                            cached = some_object(document,objnum)
                            __cache__[objnum] = cached
                            __xrefs__[cached] = objnum
                        end
                        return cached
                    end
                end)
            else
                document = false
            end
        else
            -- This branch is only reached without data, so the message is
            -- always due. The old guard tested the misspelled (and therefore
            -- undefined) name __data_, which came down to the same.
            report_epdf("the document is damaged or empty")
            document = false
        end
        loaded[filename] = document
        loaded[document] = document
        statistics.stoptiming(lpdf_epdf)
    end
    if document then
        nofloaded = nofloaded + 1
    end
    return document or nil
end
679
-- Passes on what getobjectrange reports for object n of an already loaded
-- file; no result when the file is not loaded.
function lpdf_epdf.objectrange(filename,n)
    local document = loaded[filename]
    if not document then
        return
    end
    return getobjectrange(document.__data__,n)
end
686
-- Forgets a loaded document and closes it. The argument is a filename or a
-- table with a filename field (like a document); anything else is ignored.
function lpdf_epdf.unload(filename)
    local name = filename
    if type(name) == "table" then
        name = name.filename
    end
    if type(name) ~= "string" then
        return
    end
    local document = loaded[name]
    if document then
        loaded[document] = nil
        loaded[name]     = nil
        pdfclose(document.__data__)
    end
end
700
-- Same as unloading, but takes the document itself; documents that are not
-- registered as loaded are left alone.
function lpdf.close(document)
    if not loaded[document] then
        return
    end
    loaded[document]          = nil
    loaded[document.filename] = nil
    pdfclose(document.__data__)
end
708
709
710
-- Iterator over a wrapped object: the keys come from the raw table, the values
-- are fetched through the wrapper (so they are resolved). The traversal ends
-- at the first key whose raw value is false-ish.
local function expanded(t)
    return function(raw,previous)
        local key, rawvalue = next(raw,previous)
        if rawvalue then
            return key, t[key]
        end
    end, t.__raw__, nil
end
720
721
722lpdf_epdf.expanded = expanded
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
-- Lookup table provided by lpdf. NOTE(review): judging by the name and by how
-- it is used below it maps a single character onto a two digit hexadecimal
-- string; confirm in lpdf.
local h_hex_2 = lpdf.h_hex_2

-- The escape sequences of a literal "(...)" string, each mapped onto what
-- h_hex_2 gives for the character it stands for. A backslash followed by a
-- line ending maps onto the empty string and is thereby dropped.
local remapper = {
    ["\\("]  = h_hex_2["("],
    ["\\)"]  = h_hex_2[")"],
    ["\\n"]  = h_hex_2["\n"],
    ["\\r"]  = h_hex_2["\r"],
    ["\\t"]  = h_hex_2["\t"],
    ["\\b"]  = h_hex_2["\b"],
    ["\\f"]  = h_hex_2["\f"],
    ["\\\n"] = "",
    ["\\\r"] = "",
    ["\\\\"] = h_hex_2["\\"],
}

-- Built from the table while it only has the entries above, that is, before
-- the fallback below gets installed.
local p_remap = lpeg.utfchartabletopattern(remapper)

-- Fallback for other keys: the key is taken as an octal number (this is what
-- p_dec_string feeds in after a backslash), turned into a character, mapped
-- through h_hex_2 and remembered in the table.
setmetatableindex(remapper,function(t,k)
    local v = h_hex_2[char(octtointeger(k))]
    t[k] = v
    return v
end)

-- "<...>" gives { "hex", text between the angle brackets }.
local p_hex_string = Ct(Cc("hex") *
    P("<")
  * Cs((1 - P(">"))^0)
  * P(">")
)

-- "(...)" is rewritten piece by piece: known escapes and backslash plus one to
-- three octal digits go through remapper, any other character through
-- h_hex_2. Because of that the result is tagged "hex" as well. The content
-- ends at the first ")" that is not part of a "\)" escape.
local p_dec_string = Ct(Cc("hex") *
    P("(")
  * Cs(
        (
            p_remap / remapper
          + P("\\")/"" * ((lpegpatterns.octdigit * lpegpatterns.octdigit^-2) / remapper)
          + P(1) / h_hex_2
          - P(")")
        )^0
    )
  * P(")")
)
784
local spaces    = lpegpatterns.whitespace^1
local optspaces = lpegpatterns.whitespace^0
-- a comment runs from a percent sign up to the end of the line
local comment   = P("%") * (1 - lpegpatterns.newline)^0

-- Backslash escapes: three digits are turned into the character with that
-- octal code, any other escaped character is matched as it is.
-- NOTE(review): not referenced by the other patterns in this part of the file.
local numchar = P("\\")/"" * (R("09")^3/function(s) return char(octtointeger(s)) end)
              + P("\\") * P(1)

-- a name: a slash followed by anything but whitespace and delimiters
local key = P("/") * C((1 - lpegpatterns.whitespace - S("<>/[]()"))^1)

-- operands come out as tagged lists: { "number", value } and { "name", text }
local number   = Ct(Cc("number") * (lpegpatterns.number/dectointeger))
local keyword  = Ct(Cc("name") * key)
-- an operator is a run of letters, "*", "'" and '"', captured as a string
local operator = C((R("AZ","az")+P("*")+P("'")+P('"'))^1)

-- Operands of a content stream. Dictionaries give { "dict", list } and arrays
-- give { "array", list }; both kinds of strings give { "hex", text }.
local grammar = P { "start",
    start      = (comment + keyword + number + V("dictionary") + V("array") + V("hexstring") + V("decstring") + spaces)^1,
    keyvalue   = optspaces * keyword * optspaces * V("start"),
    dictionary = Ct(Cc("dict") * P("<<") * Ct(V("keyvalue")^0) * P(">>")),

    array      = Ct(Cc("array") * P("[") * Ct(V("start")^0) * P("]")),
    hexstring  = p_hex_string,
    decstring  = p_dec_string,
}

-- One operation is a list with the operands (if any) followed by the operator;
-- characters that fit in nowhere are skipped one at a time.
local operation = Ct(grammar^1 * operator + operator)
local parser    = Ct((operation + P(1))^1)

-- From here on a number is captured as the matched text instead of as a
-- { "number", value } list; this is the only difference between the fast
-- grammar below and the one above.
local number = C(lpegpatterns.number)

local fastgrammar = P { "start",
    start      = (comment + keyword + number + V("dictionary") + V("array") + V("hexstring") + V("decstring") + spaces)^1,
    keyvalue   = optspaces * keyword * optspaces * V("start"),
    dictionary = Ct(Cc("dict") * P("<<") * Ct(V("keyvalue")^0) * P(">>")),

    array      = Ct(Cc("array") * P("[") * Ct(V("start")^0) * P("]")),
    hexstring  = p_hex_string,
    decstring  = p_dec_string,
}

local fastoperation = Ct(fastgrammar^1 * operator + operator)
local fastparser    = Ct((fastoperation + P(1))^1)
826
-- Parses a content stream into a list of operations; with fast set the
-- variant is used that leaves numbers as strings.
function lpdf_epdf.parsecontent(str,fast)
    local pattern = parser
    if fast then
        pattern = fastparser
    end
    return lpegmatch(pattern,str)
end
830
831
832
833
834
835
836
-- Converter provided by lpdf; it is fed the hexadecimal text of a target as
-- found between angle brackets. NOTE(review): presumably the text is taken as
-- UTF-16 and converted to UTF-8; confirm in lpdf.
local fromsixteen = lpdf.fromsixteen

-- Handler for one pair in a bfchar section: the source code a (hexadecimal
-- text) becomes the numeric key, the converted target b the value.
local function f_bfchar(t,a,b)
    t[hextointeger(a)] = fromsixteen(b)
end
842
-- Handlers for the two forms of an entry in a bfrange section. These used to
-- be stubs that only printed "todo", so such mappings were lost. Like f_bfchar
-- they fill t and return nothing (a result would end up as an extra capture).

-- "<first> <last> <target>": the codes first..last map onto consecutive
-- targets, starting at target. Only the last four hex digits of the target are
-- incremented; what comes before them is kept as it is.
local function f_bfrange_1(t,a,b,c)
    local first = hextointeger(a)
    local last  = hextointeger(b)
    if not (first and last) or last < first then
        return
    end
    local prefix = string.sub(c,1,-5)
    local base   = hextointeger(string.sub(c,-4))
    if not base then
        return
    end
    for code=first,last do
        t[code] = fromsixteen(prefix .. string.format("%04X",base + code - first))
    end
end

-- "<first> <last> [ <target> ... ]": the codes first..last map onto the listed
-- targets, one by one; c is the list. Targets beyond last are ignored.
local function f_bfrange_2(t,a,b,c)
    local first = hextointeger(a)
    local last  = hextointeger(b)
    if not (first and last) then
        return
    end
    for i=1,#c do
        local code = first + i - 1
        if code > last then
            break
        end
        t[code] = fromsixteen(c[i])
    end
end
854
local optionals   = spaces^0
-- the text between angle brackets, optionally preceded by whitespace
local hexstring   = optionals * P("<") * C((1-P(">"))^1) * P(">")
-- Each handler gets the table that is passed as extra argument to the match,
-- followed by the captured hexadecimal texts.
local bfchar      = Carg(1) * hexstring * hexstring / f_bfchar
-- a range either has a single target or a bracketed list of targets
local bfrange     = Carg(1) * hexstring * hexstring * hexstring / f_bfrange_1
                  + Carg(1) * hexstring * hexstring * optionals * P("[") * Ct(hexstring^1) * optionals * P("]") / f_bfrange_2
-- Scans a whole mapping: the bfchar and bfrange sections are processed, all
-- else is skipped, and at the end the (by then filled) table is returned.
local fromunicode = (
    P("beginbfchar" ) * bfchar ^1 * optionals * P("endbfchar" ) +
    P("beginbfrange") * bfrange^1 * optionals * P("endbfrange") +
    spaces +
    P(1)
)^1 * Carg(1)
866
lpdf_epdf.helpers = { }

-- Converts the text of a mapping into a table indexed by code; a false-ish
-- argument is handed back as it is.
function lpdf_epdf.helpers.tounicodetable(tounicode)
    if not tounicode then
        return tounicode
    end
    return (lpegmatch(fromunicode,tounicode,1,{}))
end
872
873
874
875
876
-- Registers the fonts of a resource dictionary in document.__fonts__ (indexed
-- by resource name) and returns that table. Each font entry gets a tounicode
-- table, which stays empty when the font has no usable mapping.
--
-- The ToUnicode entry used to be called unconditionally, which failed with a
-- nil call for every font without one; now it is only read when it is a
-- stream.
local function analyzefonts(document,resources)
    local fonts    = document.__fonts__
    local fontlist = resources and resources.Font
    if fontlist then
        for id, data in expanded(fontlist) do
            if not fonts[id] then
                local mapping = nil
                local stream  = data and data.ToUnicode
                if type(stream) == "table" and stream.__type__ == stream_object_code then
                    local tounicode = stream()
                    if tounicode then
                        mapping = lpegmatch(fromunicode,tounicode,1,{})
                    end
                end
                fonts[id] = {
                    tounicode = type(mapping) == "table" and mapping or { }
                }
                setmetatableindex(fonts[id],"self")
            end
        end
    end
    return fonts
end

lpdf_epdf.analyzefonts = analyzefonts
902
-- State shared with getpagecontent: more holds a pending high surrogate (zero
-- when there is none) and unic the mapping of the current font.
local more = 0
local unic = nil

-- Converts one group of four hex digits. A high surrogate is remembered and
-- combined with the group that follows. For the surrogate itself the empty
-- string is returned: without a value the substitution capture would leave
-- the four digits in the result. The mapping is optional, so text that comes
-- before any font selection no longer fails on a nil index.
local p_hex_to_utf = C(4) / function(s)
    local now = hextointeger(s)
    local map = unic
    if more > 0 then
        now  = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
        more = 0
        return map and map[now] or utfchar(now)
    elseif now >= 0xD800 and now <= 0xDBFF then
        more = now
        return ""
    else
        return map and map[now] or utfchar(now)
    end
end

-- Converts one byte, again via the mapping when there is one.
local p_dec_to_utf = C(1) / function(s)
    local now = byte(s)
    local map = unic
    return map and map[now] or utfchar(now)
end

-- The final patterns: the pending surrogate is reset at the start of a match.
local p_hex_to_utf = P(true) / function() more = 0 end * Cs(p_hex_to_utf^1)
local p_dec_to_utf = P(true) / function() more = 0 end * Cs(p_dec_to_utf^1)
927
928
929
-- Gives the content as one value: a stream is read, the streams in an array
-- are read and joined with a space, anything else is handed back untouched.
local function allcontent(content)
    if type(content) ~= "table" then
        return content
    end
    local kind = content.__type__
    if kind == stream_object_code then
        return (content())
    elseif kind == array_object_code then
        local parts = { }
        for i=1,#content do
            parts[i] = content[i]()
        end
        return concat(parts," ")
    end
    return content
end
945
946lpdf_epdf.allcontent = allcontent
947
-- Parses the content of a page into a list of operations.
--
--   document   : a loaded document
--   pagenumber : index into document.pages
--   asis       : when set, the parsed list is returned without decoding
--   fast       : use the parser that leaves numbers as strings
--
-- Without asis the strings of the text operators are decoded using the mapping
-- of the font selected by the last Tf: in a TJ array each entry is replaced by
-- its text or number, and for Tj, ' and " the string operand itself is
-- replaced by its text (this is what contenttotext and getstructure look at).
-- There is no result for an unknown page; the result is nil when the content
-- cannot be parsed.
function lpdf_epdf.getpagecontent(document,pagenumber,asis,fast)

    local page = document.pages[pagenumber]

    if not page then
        return
    end

    local content = allcontent(page.Contents or "")
    local list    = lpegmatch(fast and fastparser or parser,content)

    -- an empty page has no operations at all
    if asis or not list then
        return list
    end

    local fonts = analyzefonts(document,page.Resources)

    -- start with an empty mapping, for text that precedes any font selection
    unic = { }

    for i=1,#list do
        local entry    = list[i]
        local size     = #entry
        local operator = entry[size]
        if operator == "Tf" then
            local name = entry[1]
            local font = type(name) == "table" and fonts[name[2]]
            unic = font and font.tounicode or { }
        elseif operator == "TJ" then
            local data  = entry[1]
            local items = type(data) == "table" and data[2]
            if type(items) == "table" then
                for j=1,#items do
                    local item = items[j]
                    if type(item) == "table" then
                        local kind = item[1]
                        if kind == "hex" then
                            items[j] = lpegmatch(p_hex_to_utf,item[2]) or ""
                        elseif kind == "string" then
                            items[j] = lpegmatch(p_dec_to_utf,item[2]) or ""
                        else
                            items[j] = item[2]
                        end
                    else
                        -- the fast parser leaves numbers as strings; these
                        -- used to be wiped, leaving holes in the list
                        items[j] = tonumber(item) or item
                    end
                end
            end
        elseif operator == "Tj" or operator == "'" or operator == '"' then
            -- The string is the last operand. This branch used to index the
            -- string itself and to refer to an undefined variable, so nothing
            -- was ever decoded here.
            local data = entry[size-1]
            if type(data) == "table" then
                local kind = data[1]
                if kind == "hex" then
                    entry[size-1] = lpegmatch(p_hex_to_utf,data[2]) or ""
                elseif kind == "string" then
                    entry[size-1] = lpegmatch(p_dec_to_utf,data[2]) or ""
                end
            end
        end
    end

    unic = nil

    return list

end
1011
1012
1013
1014
-- Pattern for a text snippet that ends with a soft hyphen, and the factor that
-- decides (relative to the font size) when a vertical move counts as a new
-- line.
local softhyphen = utfchar(0xAD) .. "$"
local linefactor = 1.3

-- Turns a decoded operation list (see getpagecontent) into plain text.
--
-- The text of TJ and Tj operators is collected; a number below -50 in a TJ
-- array gives a space. A cm or Tm operator whose vertical offset differs more
-- than linefactor times the font size from the previous one gives a newline,
-- unless the text so far ends with a soft hyphen.
--
-- Fixes compared to the previous version:
--
--   * Tj incremented the counter twice, which left a hole in the collected
--     text and made the final concat fail
--   * the soft hyphen test was a plain search, so the "$" was taken literally
--     and the test never succeeded
--   * operands that are not in the expected shape (as with the fast parser)
--     are converted or skipped instead of raising an error
function lpdf_epdf.contenttotext(document,list)
    local last_y = 0
    local last_f = 0
    local text   = { }
    local last   = 0

    for i=1,#list do
        local entry    = list[i]
        local size     = #entry
        local operator = entry[size]
        if operator == "Tf" then
            local fontsize = entry[2]
            fontsize = type(fontsize) == "table" and fontsize[2] or tonumber(fontsize)
            if type(fontsize) == "number" then
                last_f = fontsize
            end
        elseif operator == "TJ" then
            local data  = entry[1]
            local items = type(data) == "table" and data[2]
            if type(items) == "table" then
                for j=1,#items do
                    local li   = items[j]
                    local kind = type(li)
                    if kind == "string" then
                        last = last + 1
                        text[last] = li
                    elseif kind == "number" and li < -50 then
                        last = last + 1
                        text[last] = " "
                    end
                end
            end
        elseif operator == "Tj" then
            local li = entry[size-1]
            if type(li) == "string" then
                last = last + 1
                text[last] = li
            end
        elseif operator == "cm" or operator == "Tm" then
            local ty = entry[6]
            ty = type(ty) == "table" and ty[2] or tonumber(ty)
            if type(ty) == "number" then
                local dy = abs(last_y - ty)
                if dy > linefactor*last_f and last > 0 then
                    if not find(text[last],softhyphen) then
                        last = last + 1
                        text[last] = "\n"
                    end
                end
                last_y = ty
            end
        end
    end

    return concat(text)
end
1072
-- Serializes a parsed operation list. The pieces of an operation are separated
-- by a space and each non-empty operation ends with a newline (the last
-- newline is dropped). An array that is met while the list at hand ends with
-- the TJ operator is packed into one piece without spaces.
function lpdf_epdf.contenttostring(contents)
    local size    = 0
    local buffer  = { }
    local compact = false

    local flatten
    flatten = function(t)
        local count = #t
        compact = t[count] == "TJ"
        for i=1,count do
            local item = t[i]
            if type(item) ~= "table" then
                size = size + 1 ; buffer[size] = item
            else
                local tag   = item[1]
                local value = item[2]
                if tag == "array" then
                    if compact then
                        -- collect the array in a buffer of its own
                        local outersize, outerbuffer = size, buffer
                        size, buffer = 1, { "[" }
                        flatten(value)
                        size = size + 1 ; buffer[size] = "]"
                        outersize = outersize + 1 ; outerbuffer[outersize] = concat(buffer)
                        size, buffer = outersize, outerbuffer
                    else
                        size = size + 1 ; buffer[size] = "["
                        flatten(value)
                        size = size + 1 ; buffer[size] = "]"
                    end
                elseif tag == "dict" then
                    size = size + 1 ; buffer[size] = "<<"
                    flatten(value)
                    size = size + 1 ; buffer[size] = ">>"
                elseif tag == "hex" then
                    size = size + 1 ; buffer[size] = "<" .. value .. ">"
                elseif tag == "dec" then
                    size = size + 1 ; buffer[size] = lpdf.toeight(value)
                elseif type(value) == "number" then
                    size = size + 1 ; buffer[size] = value
                else
                    size = size + 1 ; buffer[size] = "/" .. value
                end
            end
        end
    end

    for i=1,#contents do
        local operation = contents[i]
        if #operation > 0 then
            flatten(operation)
            size = size + 1 ; buffer[size] = "\n"
        end
    end
    buffer[#buffer] = nil
    local str = concat(buffer," ")
    str = string.gsub(str,"\n ","\n")
    return str
end
1139
-- Reports the marked content structure of a decoded operation list (see
-- getpagecontent): one line per BDC with its tag and MCID, indented by nesting
-- depth, and the text of the TJ and Tj operators in between.
--
-- Fixes compared to the previous version:
--
--   * TJ looped over the { "array", list } wrapper instead of over the list,
--     which ended in a comparison between a table and a number
--   * the operands of BDC are tagged lists, so the tag was reported as a table
--     and the MCID was never found
function lpdf_epdf.getstructure(document,list)
    local depth = 0
    for i=1,#list do
        local entry    = list[i]
        local size     = #entry
        local operator = entry[size]
        if operator == "BDC" then
            local tag = entry[1]
            if type(tag) == "table" then
                tag = tag[2]
            end
            local mcid       = nil
            local properties = entry[2]
            if type(properties) == "table" then
                local items = properties[2]
                if properties[1] == "dict" and type(items) == "table" then
                    -- a flat list: the value follows the name
                    for j=1,#items-1 do
                        local item = items[j]
                        if type(item) == "table" and item[1] == "name" and item[2] == "MCID" then
                            local value = items[j+1]
                            if type(value) == "table" then
                                mcid = value[2]
                            else
                                mcid = value
                            end
                            break
                        end
                    end
                else
                    mcid = properties.MCID
                end
            end
            report_epdf("%w%s : %s",depth,tag or "?",mcid or "?")
            depth = depth + 1
        elseif operator == "EMC" then
            depth = depth - 1
        elseif operator == "TJ" then
            local data  = entry[1]
            local items = type(data) == "table" and data[2]
            if type(items) == "table" then
                for j=1,#items do
                    local li   = items[j]
                    local kind = type(li)
                    if kind == "string" then
                        report_epdf("%w > %s",depth,li)
                    elseif kind == "number" and li < -50 then
                        report_epdf("%w >",depth,li)
                    end
                end
            end
        elseif operator == "Tj" then
            report_epdf("%w > %s",depth,entry[size-1])
        end
    end
end
1166
1167if images then do
1168
1169
1170
1171
1172 local copydictionary = nil
1173 local copyarray = nil
1174
1175 local pdfreference = lpdf.reference
1176 local pdfconstant = lpdf.constant
1177 local pdfarray = lpdf.array
1178 local pdfdictionary = lpdf.dictionary
1179 local pdfnull = lpdf.null
1180 local pdfliteral = lpdf.literal
1181
1182 local pdfreserveobject = lpdf.reserveobject
1183 local shareobjectreference = lpdf.shareobjectreference
1184 local pdfflushobject = lpdf.flushobject
1185 local pdfflushstreamobject = lpdf.flushstreamobject
1186
1187 local report = logs.reporter("backend","xobjects")
1188
1189 local factor = 65536 / (7200/7227)
1190
1191 local createimage = images.create
1192
1193 directives.register("graphics.pdf.recompress", function(v) recompress = v end)
1194
-- Gives a new list with the four values of a box multiplied by factor.
local function scaledbbox(b)
    local scaled = { }
    for i=1,4 do
        scaled[i] = b[i]*factor
    end
    return scaled
end
1198
-- The filters for which deepcopyobject reads the decoded stream and flushes it
-- anew when recompress is set. The keys are what tostring gives for the Filter
-- entry of the copied dictionary, hence the leading slash.
local codecs = {
    ["/ASCIIHexDecode"]  = true,
    ["/ASCII85Decode"]   = true,
    ["/RunLengthDecode"] = true,
    ["/FlateDecode"]     = true,
    ["/LZWDecode"]       = true,
}
1206
-- Copies an indirect object of the source document into the result and
-- returns a reference to the copy.
--
--   pdfdoc : the source document
--   xref   : table that maps resolved objects onto their object number
--   copied : table that maps source object numbers onto numbers in the result
--   value  : the resolved object
--
-- An object is copied only once. Arrays and dictionaries are flushed in their
-- textual form. A stream is flushed decoded (to be compressed again) when
-- recompress is set and its filter is one of codecs, otherwise its data is
-- passed on raw. When value has no known object number a message is reported
-- and there is no result.
--
-- The message used to consult a variable that was out of scope at that point
-- (and thereby an undefined global); the kind is now taken from the value.
local function deepcopyobject(pdfdoc,xref,copied,value)
    local objnum = xref[value]
    if not objnum then
        local kind = nil
        if type(value) == "table" then
            kind = value.__type__ or value[1]
        end
        if kind == stream_object_code then
            report("stream not done: %s", objectcodes[kind] or "?")
        else
            report("object not done: %s", kind ~= nil and objectcodes[kind] or "?")
        end
        return
    end
    local usednum = copied[objnum]
    if not usednum then
        usednum = pdfreserveobject()
        -- registered before the content is copied
        copied[objnum] = usednum
        local kind = value.__type__
        if kind == array_object_code then
            local a = copyarray(pdfdoc,xref,copied,value)
            pdfflushobject(usednum,tostring(a))
        elseif kind == dictionary_object_code then
            local d = copydictionary(pdfdoc,xref,copied,value)
            pdfflushobject(usednum,tostring(d))
        elseif kind == stream_object_code then
            local d      = copydictionary(pdfdoc,xref,copied,value)
            local filter = d.Filter
            if filter and codecs[tostring(filter)] and recompress then
                -- these no longer apply to the decoded data
                d.Filter      = nil
                d.Length      = nil
                d.DecodeParms = nil
                d.DL          = nil
                local s = value()
                pdfflushstreamobject(s,d,true,usednum)
            else
                local s = value(false)
                pdfflushstreamobject(s,d,"raw",usednum)
            end
        else
            -- a simple value, kept as the plain list { kind, value, ... }
            if type(value) == "table" then
                local kind  = value[1]
                local entry = value[2]
                if kind == name_object_code then
                    value = pdfconstant(entry)
                elseif kind == string_object_code then
                    value = pdfliteral(entry,false)
                elseif kind == null_object_code then
                    value = pdfnull()
                elseif kind == reference_object_code then
                    value = deepcopyobject(pdfdoc,xref,copied,entry)
                elseif entry == nil then
                    value = pdfnull()
                else
                    value = tostring(entry)
                end
            end
            pdfflushobject(usednum,value)
        end
    end
    return pdfreference(usednum)
end
1275
-- Converts one entry of a wrapped array or dictionary for inclusion. Without a
-- value the raw entry stored under key is used. Plain strings become
-- constants and other non-tables pass unchanged; nested containers are copied
-- in place while references are copied as objects of their own. Entries of an
-- unknown kind give no result.
local function copyobject(pdfdoc,xref,copied,object,key,value)
    value = value or object.__raw__[key]
    local valuetype = type(value)
    if valuetype == "string" then
        return pdfconstant(value)
    end
    if valuetype ~= "table" then
        return value
    end
    local code = value[1]
    if code == name_object_code then
        return pdfconstant(value[2])
    elseif code == string_object_code then
        return pdfliteral(value[2],false)
    elseif code == array_object_code then
        return copyarray(pdfdoc,xref,copied,object[key])
    elseif code == dictionary_object_code then
        return copydictionary(pdfdoc,xref,copied,object[key])
    elseif code == null_object_code then
        return pdfnull()
    elseif code == reference_object_code then
        return deepcopyobject(pdfdoc,xref,copied,object[key])
    elseif code == lpdf_object_code then
        return value[2]
    end
end
1305
-- Copies a wrapped array entry by entry into a fresh pdfarray.
copyarray = function(pdfdoc,xref,copied,object)
    local source = object.__raw__
    local target = pdfarray()
    for index=1,#source do
        target[index] = copyobject(pdfdoc,xref,copied,object,index,source[index])
    end
    return target
end
1314
-- Copies a wrapped dictionary into a fresh pdfdictionary; the entries are
-- visited via sortedhash.
copydictionary = function(pdfdoc,xref,copied,object)
    local source = object.__raw__
    local target = pdfdictionary()
    for name, raw in sortedhash(source) do
        target[name] = copyobject(pdfdoc,xref,copied,object,name,raw)
    end
    return target
end
1324
-- short names for the loader and unloader that are exported in the image interface
local openpdf  = lpdf_epdf.load
local closepdf = lpdf_epdf.unload

-- The same as openpdf but with its fourth argument fixed to true; whatever openpdf
-- returns is passed on.
local function newpdf(str,userpassword,ownerpassword)
    local fourth = true
    return openpdf(str,userpassword,ownerpassword,fourth)
end
1333
-- maps the size keyword that a caller passes onto the page entry that is consulted
local sizes = {
    media = "MediaBox",
    crop  = "CropBox",
    bleed = "BleedBox",
    trim  = "TrimBox",
    art   = "ArtBox",
}

-- Collect the properties of one page of an opened document.
--
-- pdfdoc     : document table (Catalog, pages, filename and nofpages are read)
-- pagenumber : index into pdfdoc.pages, 1 when not given
-- size       : one of the keys of 'sizes', unknown or missing values act like "crop"
-- pagelabel  : optional non empty string; when Catalog.LMTX_Pages.Labels has an entry
--              for it, that entry (or its first element when it is a table) replaces
--              the pagenumber
--
-- Gives back a table with the boxes, rotation and sizes of the page. Nothing is
-- returned when there is no document or no such page, or (after a report) when the
-- chosen box has no fourth element.
local function querypdf(pdfdoc,pagenumber,size,pagelabel)
    if not pdfdoc then
        return
    end
    local root = pdfdoc.Catalog
    if type(pagelabel) == "string" and pagelabel ~= "" then
        local pagedata = root.LMTX_Pages
        local labels   = pagedata and pagedata.Labels
        local found    = labels and labels[pagelabel]
        if found then
            pagenumber = tonumber(type(found) == "table" and found[1] or found)
        end
    end
    pagenumber = pagenumber or 1
    local page = pdfdoc.pages[pagenumber]
    if not page then
        return
    end
    local sizetag  = sizes[size or "crop"] or sizes.crop
    local mediabox = page.MediaBox or { 0, 0, 0, 0 }
    -- called cropbox but it is the box that was asked for, with the mediabox as fallback
    local cropbox  = page[sizetag] or mediabox
    local filename = pdfdoc.filename
    if not cropbox[4] then
        report("bad page %i in file %a",pagenumber,filename or "?")
        return
    end
    local nofpages    = pdfdoc.nofpages
    local boundingbox = scaledbbox(cropbox)
    local bleedbox    = page.BleedBox or cropbox
    local trimbox     = page.TrimBox  or cropbox
    local artbox      = page.ArtBox   or cropbox
    local rotation    = page.Rotate   or 0
    return {
        filename    = filename,
        pagenumber  = pagenumber,
        nofpages    = nofpages,
        boundingbox = boundingbox,
        cropbox     = cropbox,
        mediabox    = mediabox,
        bleedbox    = bleedbox,
        trimbox     = trimbox,
        artbox      = artbox,
        rotation    = rotation,
        xsize       = cropbox[3] - cropbox[1],
        ysize       = cropbox[4] - cropbox[2],
    }
end
1387
-- Give back what goes into the Resources entry of a copied page: the shared reference
-- to a copy of the given dictionary or, when there is none, whatever
-- lpdf.checkedresources provides.
local function copyresources(pdfdoc,xref,copied,Resources)
    if not Resources then
        return lpdf.checkedresources()
    end
    local dictionary = copydictionary(pdfdoc,xref,copied,Resources)
    return shareobjectreference(dictionary)
end
1396
local variables = interfaces.variables

-- Give back a reference to a small dictionary that tells which Creator and Producer
-- made the included document. The dictionary is flushed once per document and the
-- reference is cached in pdfdoc.lmtxinclusion.
--
-- Each value is taken from the Info dictionary; when the entry is absent it is looked
-- up in the catalog Metadata (converted with xml.convert, queried with xml.text and
-- the given pattern). When that gives nothing or an empty string "unknown" is used.
local function getinclusion(pdfdoc)
    local inclusion = pdfdoc.lmtxinclusion
    if not inclusion then
        local catalog  = pdfdoc.Catalog
        local info     = pdfdoc.Info
        -- nil    : metadata not looked at yet
        -- false  : looked at but not usable
        -- others : the converted xml tree
        local metadata = nil
        local function checked(tag,pattern)
            -- a document doesn't need to have an Info dictionary so we guard the access
            local str = info and info[tag]
            if not str then
                if metadata == nil then
                    metadata = catalog and catalog.Metadata or false
                    if metadata then
                        -- calling the entry gives its content
                        metadata = metadata() or false
                    end
                    if metadata then
                        metadata = xml.convert(metadata)
                    end
                end
                if metadata then
                    str = xml.text(metadata,pattern)
                end
            end
            return str and str ~= "" and str or "unknown"
        end
        inclusion = pdfreference(pdfflushobject(pdfdictionary {
            Creator  = checked("Creator", "Description/CreatorTool"),
            Producer = checked("Producer", "Description/Producer"),
        }))
        pdfdoc.lmtxinclusion = inclusion
    end
    return inclusion
end
1437
-- Build an image (a form xobject) from one page of an opened pdf document.
--
-- pdfdoc        : document table; Catalog, pages, __xrefs__, __copied__ and filename are read
-- pagenumber    : index into pdfdoc.pages, 1 when not given
-- attributes    : optional container; its expanded entries are stored in the page table
-- compact       : when set (and lpdf_epdf.pageplugin is there) the plugin is called first
-- width, height : accepted but not used in this function
-- attr          : extra entries that are added to the xobject dictionary with '+'
-- copymeta      : variables.page, variables.document or variables.yes; selects if the
--                 Metadata of the page and/or the catalog is copied
--
-- Gives back the result of createimage. When the page can't be queried a message is
-- reported and nothing is returned.
local function copypage(pdfdoc,pagenumber,attributes,compact,width,height,attr,copymeta)
    if pdfdoc then
        -- resolve the default once so that the lookup, the plugin and the report all
        -- talk about the same page
        pagenumber = pagenumber or 1
        local root     = pdfdoc.Catalog
        local page     = pdfdoc.pages[pagenumber]
        local pageinfo = querypdf(pdfdoc,pagenumber)
        if pageinfo then
            local contents  = page.Contents
            local xref      = pdfdoc.__xrefs__
            local copied    = pdfdoc.__copied__
            local resources = page.Resources
            if compact and resources and lpdf_epdf.pageplugin then
                lpdf_epdf.pageplugin(pdfdoc,page,pagenumber,resources,compact)
                -- fetched again, presumably because the plugin can replace the contents
                contents = page.Contents
            end
            local metadata = nil
            if copymeta == variables.page or copymeta == variables.yes then
                metadata = copyobject(pdfdoc,xref,copied,page,"Metadata")
            end
            -- the document metadata is only used when the page didn't provide any
            if not metadata and (copymeta == variables.document or copymeta == variables.yes) then
                metadata = copyobject(pdfdoc,xref,copied,root,"Metadata")
            end
            local xobject = pdfdictionary {
                Type           = pdfconstant("XObject"),
                Subtype        = pdfconstant("Form"),
                FormType       = 1,
                Group          = copyobject(pdfdoc,xref,copied,page,"Group"),
                LastModified   = copyobject(pdfdoc,xref,copied,page,"LastModified"),
                Metadata       = metadata,
                Resources      = copyresources(pdfdoc,xref,copied,resources),
                SeparationInfo = copyobject(pdfdoc,xref,copied,page,"SeparationInfo"),
                LMTX_Inclusion = getinclusion(pdfdoc),
            } + attr
            if attributes then
                -- NOTE(review): this stores into the source page and not into the
                -- xobject that was just built; confirm that this is intended
                for k, v in expanded(attributes) do
                    page[k] = v
                end
            end
            -- stays empty when there are no contents or when they are of an unknown kind
            local content  = ""
            local nolength = nil
            if type(contents) == "string" then
                content = contents
            elseif contents then
                local ctype = contents.__type__
                if ctype == stream_object_code then
                    if recompress then
                        content = contents()
                    else
                        local Filter = copyobject(pdfdoc,xref,copied,contents,"Filter")
                        local Length = copyobject(pdfdoc,xref,copied,contents,"Length")
                        if Length and Filter then
                            -- the Length and Filter of the source are passed on, so
                            -- contents(false) presumably gives the stream as stored
                            -- (TODO confirm)
                            nolength       = true
                            xobject.Length = Length
                            xobject.Filter = Filter
                            content        = contents(false)
                        else
                            content = contents()
                        end
                    end
                elseif ctype == array_object_code then
                    -- several streams are fetched one by one and joined with a space
                    content = { }
                    for i=1,#contents do
                        content[i] = contents[i]()
                    end
                    content = concat(content," ")
                end
            end
            local rotation    = pageinfo.rotation
            local boundingbox = pageinfo.boundingbox
            local transform   = nil
            -- degrees are mapped onto a transform code; the values 2 and 3 are taken
            -- as being such a code already
            if rotation == 90 then
                transform = 3
            elseif rotation == 180 then
                transform = 2
            elseif rotation == 270 then
                transform = 1
            elseif rotation > 1 and rotation < 4 then
                transform = rotation
            end
            xobject.BBox = pdfarray {
                boundingbox[1] * bpfactor,
                boundingbox[2] * bpfactor,
                boundingbox[3] * bpfactor,
                boundingbox[4] * bpfactor,
            }
            return createimage {
                bbox      = boundingbox,
                transform = transform,
                nolength  = nolength,
                nobbox    = true,
                notype    = true,
                stream    = content,
                attr      = xobject(),
                kind      = images.types.stream,
            }
        else
            report("bad page %i in file %a",pagenumber,pdfdoc.filename or "?")
        end
    end
end
1552
-- the interface for including pdf pages as images, built from the local helpers
do
    local image = { }

    image.open  = openpdf
    image.close = closepdf
    image.new   = newpdf
    image.query = querypdf
    image.copy  = copypage

    lpdf_epdf.image = image
end
1560
1561
1562
1563
1564
1565
1566
1567end end
1568
-- Give back the producer of a document as a string.
--
-- The Producer entry of the Info dictionary is used when it is there and not empty.
-- Otherwise the Metadata of the catalog is converted with xml.convert and queried,
-- first for "rdf:Description/pdf:Producer" and then for "Producer". When nothing is
-- found (or when there is no document) an empty string is returned.
function lpdf_epdf.producer(pdfdoc)
    local producer = false
    if pdfdoc then
        local info = pdfdoc.Info
        if info then
            producer = info.Producer
        end
        -- an empty entry is as good as no entry, which is also how the xml lookup
        -- below treats it
        if not producer or producer == "" then
            -- a broken file can lack a catalog so we guard the access
            local catalog  = pdfdoc.Catalog
            local metadata = catalog and catalog.Metadata
            if metadata then
                local x = xml.convert(metadata())
                if x then
                    producer = xml.text(x,"rdf:Description/pdf:Producer")
                    if not producer or producer == "" then
                        producer = xml.text(x,"Producer")
                    end
                end
            end
        end
    end
    return producer or ""
end
1591
-- Expand a compact list of widths into a table that is indexed by code.
--
-- widths   : a flat list in which two kinds of entries can be mixed:
--              first { w1 w2 .. }  : consecutive widths, starting at index 'first'
--              first last w        : the same width for all indices first .. last
--            (this is the layout of the W array of a cid font)
-- expanded : optional table that gets filled, a new one is made when not given
--
-- Gives back the filled table plus the lowest and highest index that got a width, or
-- zeros when nothing was set. An incomplete entry at the end is ignored.
function lpdf_epdf.expandwidths(widths,expanded)
    if not expanded then
        expanded = { }
    end
    local min = false
    local max = false
    -- Entries don't have to come in ascending order so both bounds are checked each
    -- time. An empty range (empty subtable, last before first) sets no index and
    -- therefore doesn't influence the bounds either.
    local function covered(first,last)
        if last >= first then
            if not min or first < min then
                min = first
            end
            if not max or last > max then
                max = last
            end
        end
    end
    local i = 1
    local n = #widths
    while i < n do
        local w1 = widths[i] ; i = i + 1
        local w2 = widths[i] ; i = i + 1
        if type(w2) == "table" then
            local wn = w1 + #w2 - 1
            local k  = 1
            for j=w1,wn do
                expanded[j] = w2[k]
                k = k + 1
            end
            covered(w1,wn)
        else
            local w3 = widths[i] ; i = i + 1
            if w3 then
                for j=w1,w2 do
                    expanded[j] = w3
                end
                covered(w1,w2)
            end
        end
    end
    return expanded, min or 0, max or 0
end
1633
-- Copy a plain list of widths, index by index, into 'expanded' (a new table when none
-- is given). Gives back that table, the first index (always 1) and the last index
-- (the length of the list).
function lpdf_epdf.mergewidths(widths,expanded)
    local target = expanded or { }
    local last   = #widths
    for index=1,last do
        target[index] = widths[index]
    end
    return target, 1, last
end
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658 |