1if not modules then modules = { } end modules ['lpdf-epd'] = {
2 version = 1.001,
3 comment = "companion to lpdf-epa.mkiv",
4 author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5 copyright = "PRAGMA ADE / ConTeXt Development Team",
6 license = "see context related readme files",
7 history = "this one replaces the poppler/pdfe binding",
8}
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
-- Local aliases for the standard library and helper functions that are used
-- throughout this module.

local setmetatable, type, next = setmetatable, type, next
local tostring, unpack = tostring, unpack
local char, byte, find = string.char, string.byte, string.find
local abs = math.abs
local concat, swapped, sortedhash, sortedkeys = table.concat, table.swapped, table.sortedhash, table.sortedkeys

-- The text extraction code feeds 16 bit (and surrogate pair) code points into
-- this function, so it has to be a real utf-8 encoder: string.char raises a
-- "value out of range" error for anything above 255. We take a global utf.char
-- when present (presumably the ConTeXt helper -- confirm), then the utf8
-- library of Lua 5.3+ and only fall back on string.char as a last resort.

local utfchar = (utf and utf.char) or (utf8 and utf8.char) or string.char
local setmetatableindex = table.setmetatableindex
local ioopen = io.open
local octtointeger, dectointeger, hextointeger = string.octtointeger, string.dectointeger, string.hextointeger
53
54local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
55local P, C, S, R, Ct, Cc, V, Carg, Cs, Cf, Cg = lpeg.P, lpeg.C, lpeg.S, lpeg.R, lpeg.Ct, lpeg.Cc, lpeg.V, lpeg.Carg, lpeg.Cs, lpeg.Cf, lpeg.Cg
56
57if not lpdf then
58 require("lpdf-aux")
59end
60
61if not (number and number.dimenfactors) then
62 require("util-dim")
63end
64
65local pdfe = pdfe
66 lpdf = lpdf or { }
67local lpdf = lpdf
68local lpdf_epdf = { }
69 lpdf.epdf = lpdf_epdf
70
71local pdfopenfile = pdfe.openfile
72local pdfnew = pdfe.new
73local pdfclose = pdfe.close
74
75local getcatalog = pdfe.getcatalog
76local getinfo = pdfe.getinfo
77local gettrailer = pdfe.gettrailer
78local getnofpages = pdfe.getnofpages
79local getversion = pdfe.getversion
80local getbox = pdfe.getbox
81local getstatus = pdfe.getstatus
82local unencrypt = pdfe.unencrypt
83local dictionarytotable = pdfe.dictionarytotable
84local arraytotable = pdfe.arraytotable
85local pagestotable = pdfe.pagestotable
86local readwholestream = pdfe.readwholestream
87local getfromreference = pdfe.getfromreference
88local getfromobject = pdfe.getfromobject
89local getobjectrange = pdfe.getobjectrange
90
91local report_epdf = logs.reporter("epdf")
92
93local allocate = utilities.storage.allocate
94
95local bpfactor = number.dimenfactors.bp
96
-- Numeric object type codes as reported by the pdfe library, indexed from
-- zero. After swapping, the tables below map in both directions (code to name
-- and name to code).

local objectcodes = {
    [ 0] = "none",
    [ 1] = "null",
    [ 2] = "bool",
    [ 3] = "integer",
    [ 4] = "number",
    [ 5] = "name",
    [ 6] = "string",
    [ 7] = "array",
    [ 8] = "dictionary",
    [ 9] = "stream",
    [10] = "reference",
    [11] = "lpdf",
}

-- Status values as compared against getstatus() results when loading.

local encryptioncodes = {
    [-2] = "failure",
    [-1] = "protected",
    [ 0] = "notencrypted",
    [ 1] = "unencrypted",
}

objectcodes     = allocate(swapped(objectcodes,objectcodes))
encryptioncodes = allocate(swapped(encryptioncodes,encryptioncodes))

lpdf_epdf.objectcodes     = objectcodes
lpdf_epdf.encryptioncodes = encryptioncodes

-- Shortcuts for the codes that the rest of the module compares against.

local none_object_code       = objectcodes["none"]
local null_object_code       = objectcodes["null"]
local bool_object_code       = objectcodes["bool"]
local integer_object_code    = objectcodes["integer"]
local number_object_code     = objectcodes["number"]
local name_object_code       = objectcodes["name"]
local string_object_code     = objectcodes["string"]
local array_object_code      = objectcodes["array"]
local dictionary_object_code = objectcodes["dictionary"]
local stream_object_code     = objectcodes["stream"]
local reference_object_code  = objectcodes["reference"]
local lpdf_object_code       = objectcodes["lpdf"]
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152local some_dictionary
153local some_array
154local some_stream
155local some_reference
156
157local some_string = lpdf.frombytes
158
-- Returns the symbolic type name of a wrapped pdf object (a table that carries
-- a __type__ code). Anything that is not a table yields no value at all.

function lpdf_epdf.objecttype(object)
    if type(object) ~= "table" then
        return
    end
    local code = object.__type__
    return code and objectcodes[code]
end
165
-- Resolves entry 'key' of the raw table 't'. Raw entries are either plain
-- values or tables of the form { kind, value, detail }. Names become their
-- value, strings go through some_string, and arrays, dictionaries, streams and
-- references get wrapped (or looked up) lazily. Other kinds are returned as
-- the raw entry.

local function get_value(document,t,key)
    local value = key and t[key]
    if not value then
        return
    end
    if type(value) == "table" then
        local kind = value[1]
        local data = value[2]
        if kind == name_object_code then
            return data
        elseif kind == string_object_code then
            return some_string(data,value[3])
        elseif kind == array_object_code then
            return some_array(data,document)
        elseif kind == dictionary_object_code then
            return some_dictionary(data,document)
        elseif kind == stream_object_code then
            return some_stream(value,data,document)
        elseif kind == reference_object_code then
            return some_reference(value,document)
        end
    end
    return value
end
194
195local checked_access
196local get_flagged
197
-- The get_flagged helper implements what happens when a wrapped object gets
-- called: object(). When the lpdf backend constructors are around we return a
-- copy expressed in lpdf objects, otherwise we fall back on the raw table.

if lpdf.dictionary then

    local pdfdictionary = lpdf.dictionary
    local pdfarray      = lpdf.array
    local pdfconstant   = lpdf.constant
    local pdfreference  = lpdf.reference
    local pdfliteral    = lpdf.literal
    local pdfnull       = lpdf.null
    -- NOTE(review): assumed to be the backend's unicode string constructor;
    -- it was referenced below without ever being bound -- confirm the name.
    local pdfunicode    = lpdf.unicode

    -- These two are mutually recursive with copyobject so we declare them
    -- upfront. The names have to match the assignments below, otherwise the
    -- functions end up as globals.

    local copyarray, copydictionary

    -- Converts one raw entry (or object.__raw__[key] when no value is given)
    -- into its lpdf counterpart. Kinds that are not handled yield nothing.

    local function copyobject(object,key,value)
        if not value then
            value = object.__raw__[key]
        end
        local t = type(value)
        if t == "string" then
            return pdfconstant(value)
        elseif t ~= "table" then
            return value
        end
        local kind = value[1]
        if kind == name_object_code then
            return pdfconstant(value[2])
        elseif kind == string_object_code then
            return pdfliteral(value[2],value[3])
        elseif kind == array_object_code then
            -- object[key] triggers the lazy wrapper so that we can recurse
            return copyarray(object[key])
        elseif kind == dictionary_object_code then
            return copydictionary(object[key])
        elseif kind == null_object_code then
            return pdfnull()
        elseif kind == reference_object_code then
            return pdfreference(value[3])
        end
    end

    copyarray = function(object)
        local target = pdfarray()
        local source = object.__raw__
        for i=1,#source do
            target[i] = copyobject(object,i,source[i])
        end
        return target
    end

    copydictionary = function(object)
        local target = pdfdictionary()
        local source = object.__raw__
        for key, value in sortedhash(source) do
            target[key] = copyobject(object,key,value)
        end
        return target
    end

    -- t is the wrapper, f its raw table; the wrapper type decides what kind
    -- of lpdf object we hand back.

    get_flagged = function(t,f)
        local kind = t.__type__
        if kind == name_object_code then
            return pdfconstant(f)
        elseif kind == array_object_code then
            return copyarray(t)
        elseif kind == dictionary_object_code then
            return copydictionary(t)
        elseif kind == stream_object_code then
            return copydictionary(t)
        elseif kind == string_object_code then
            return pdfunicode(f)
        elseif kind == null_object_code then
            return pdfnull()
        elseif kind == reference_object_code then
            return pdfreference(t[3])
        else
            return f
        end
    end

    -- Serializes object n of a loaded document in "n 0 obj ... endobj" form.
    -- Nothing is returned when the document, number or object is missing.

    function lpdf_epdf.verboseobject(document,n)
        if document and n then
            local object = document.objects[n]
            if object then
                local t = { n .. " 0 obj" }
                if lpdf_epdf.objecttype(object) == "stream" then
                    t[#t+1] = object("dictionary")()
                    t[#t+1] = "stream"
                    t[#t+1] = tostring(object(true))
                    t[#t+1] = "endstream"
                else
                    t[#t+1] = tostring(object())
                end
                t[#t+1] = "endobj"
                return concat(t,"\n")
            end
        end
    end

else

    -- Without the lpdf constructors there is nothing to convert to, so we
    -- hand back the raw table, just like the fallthrough case above. (This
    -- used to index the wrapper with an undefined global.)

    get_flagged = function(t,f)
        return f
    end

end
303
-- Wraps a pdfe dictionary: indexing resolves entries lazily via get_value and
-- calling the wrapper delegates to get_flagged. The second return value is
-- the string "dictionary".

some_dictionary = function(d,document)
    local raw = dictionarytotable(d,true)
    local wrapper = {
        __raw__  = raw,
        __type__ = dictionary_object_code,
    }
    local meta = {
        __index = function(_,key)
            return get_value(document,raw,key)
        end,
        __call = function(self)
            return get_flagged(self,raw)
        end,
    }
    return setmetatable(wrapper,meta), "dictionary"
end
316
-- Wraps a pdfe array. The size is determined once and is available as field
-- 'n' as well as via the length operator. Entries resolve lazily through
-- get_value. The second return value is the string "array".

some_array = function(a,document)
    local raw  = arraytotable(a,true)
    local size = #raw
    local wrapper = {
        __raw__  = raw,
        __type__ = array_object_code,
        n        = size,
    }
    local meta = {
        __index = function(_,index)
            return get_value(document,raw,index)
        end,
        __call = function(self)
            return get_flagged(self,raw)
        end,
        __len = function()
            return size
        end,
    }
    return setmetatable(wrapper,meta), "array"
end
333
-- Wraps a stream 's' with dictionary 'd'. Indexing resolves dictionary entries
-- lazily. Calling the wrapper with "dictionary" delegates to get_flagged; any
-- other argument reads the whole stream, where only an explicit false is
-- passed on as false flag to readwholestream.

some_stream = function(s,d,document)
    local raw = dictionarytotable(d,true)
    local wrapper = {
        __raw__  = raw,
        __type__ = stream_object_code,
    }
    local meta = {
        __index = function(_,key)
            return get_value(document,raw,key)
        end,
        __call = function(self,how)
            if how == "dictionary" then
                return get_flagged(self,raw)
            end
            return readwholestream(s,how ~= false)
        end,
    }
    return setmetatable(wrapper,meta), "stream"
end
352
-- Resolves a raw reference entry { kind, reference, objnum } to the object it
-- points to. Resolved objects are cached per object number and registered in
-- the reverse (object to number) table of the document.

some_reference = function(r,document)
    local objnum = r[3]
    local cache  = document.__cache__
    local cached = cache[objnum]
    if not cached then
        local kind, object, b, c = getfromreference(r[2])
        if kind == dictionary_object_code then
            cached = some_dictionary(object,document)
        elseif kind == array_object_code then
            cached = some_array(object,document)
        elseif kind == stream_object_code then
            cached = some_stream(object,b,document)
        elseif kind == string_object_code then
            -- The second argument of some_string is the detail that comes
            -- with the string (compare get_value, which passes value[3]) and
            -- not the document.
            cached = some_string(object,b)
        else
            cached = { kind, object, b, c }
        end
        cache[objnum] = cached
        if cached ~= nil then
            -- a nil key would raise an error
            document.__xrefs__[cached] = objnum
        end
    end
    return cached
end
384
-- Fetches object number n from the document and wraps it when it is a
-- dictionary, array or stream. Other kinds come back as a raw
-- { kind, value, detail, extra } table.

local function some_object(document,n)
    local kind, object, b, c = getfromobject(document.__data__,n)
    if kind == stream_object_code then
        return some_stream(object,b,document)
    elseif kind == array_object_code then
        return some_array(object,document)
    elseif kind == dictionary_object_code then
        return some_dictionary(object,document)
    end
    return { kind, object, b, c }
end
398
-- Document fields that are computed on demand: resolve is installed as index
-- handler of a document and stores whatever the matching resolver returns.

local resolvers = { }
lpdf_epdf.resolvers = resolvers

local function resolve(document,k)
    local resolver = resolvers[k]
    if not resolver then
        return
    end
    local entry = resolver(document)
    document[k] = entry
    return entry
end
410
-- Collects the key/value pairs of a name tree node into target. A node either
-- has a flat Names list (key, value, key, value ...) or Kids that we descend
-- into. The (possibly fresh) target is returned; a missing node gives nothing.

local function getnames(document,n,target)
    if not n then
        return
    end
    local names = n.Names
    if names then
        target = target or { }
        local count = #names
        local index = 1
        while index <= count do
            target[names[index]] = names[index+1]
            index = index + 2
        end
        return target
    end
    local kids = n.Kids
    if kids then
        for i=1,#kids do
            target = getnames(document,kids[i],target)
        end
    end
    return target
end
432
-- Flattens a Kids tree: every node without Kids is appended to target in
-- traversal order. The (possibly fresh) target is returned; a missing node
-- gives nothing.

local function getkids(document,n,target)
    if not n then
        return
    end
    local kids = n.Kids
    if kids then
        for i=1,#kids do
            target = getkids(document,kids[i],target)
        end
        return target
    end
    if not target then
        return { n }
    end
    target[#target+1] = n
    return target
end
448
-- Named destinations: the flattened Dests name tree of the catalog.

function resolvers.destinations(document)
    local names = document.Catalog.Names
    local dests = names and names.Dests
    return getnames(document,dests)
end
453
-- Named scripts: the flattened JavaScript name tree of the catalog.

function resolvers.javascripts(document)
    local names   = document.Catalog.Names
    local scripts = names and names.JavaScript
    return getnames(document,scripts)
end
458
-- Form fields: the Fields entry of the AcroForm dictionary, when present.

function resolvers.widgets(document)
    local acroform = document.Catalog.AcroForm
    return acroform and acroform.Fields
end
463
-- Attachments: the flattened EmbeddedFiles name tree of the catalog.

function resolvers.embeddedfiles(document)
    local names = document.Catalog.Names
    local files = names and names.EmbeddedFiles
    return getnames(document,files)
end
468
469
470
471
472
473
474
475
476
477
-- Optional content: a list with the Name of each entry in the OCGs array of
-- the catalog's OCProperties. Nothing is returned when either is missing.

function resolvers.layers(document)
    local properties = document.Catalog.OCProperties
    local groups     = properties and properties.OCGs
    if not groups then
        return
    end
    local names = { }
    for i=1,#groups do
        names[i] = groups[i].Name
    end
    return names
end
493
-- The structure tree root as found in the catalog (if any).

function resolvers.structure(document)
    local catalog = document.Catalog
    return catalog.StructTreeRoot
end
498
-- Builds the page list. Each page is a wrapped dictionary that gets its page
-- number and the five boxes (as reported by getbox) assigned. Pages are also
-- registered in the object cache and the reverse lookup table. Missing page
-- data is reported and leaves a hole in the list.

function resolvers.pages(document)
    local data  = document.__data__
    local xrefs = document.__xrefs__
    local cache = document.__cache__

    local total    = document.nofpages
    local pages    = { }
    local rawpages = pagestotable(data)

    -- set early, as in: before we start filling the list
    document.pages = pages

    for pagenumber=1,total do
        local rawpagedata = rawpages[pagenumber]
        if not rawpagedata then
            report_epdf("missing pagedata for page %i, case %i",pagenumber,2)
        else
            local pagereference = rawpagedata[3]
            local pageobject    = rawpagedata[1]
            local pagedata      = some_dictionary(pageobject,document)
            if not (pagedata and pageobject) then
                report_epdf("missing pagedata for page %i, case %i",pagenumber,1)
            else
                pagedata.number   = pagenumber
                pagedata.MediaBox = getbox(pageobject,"MediaBox")
                pagedata.CropBox  = getbox(pageobject,"CropBox")
                pagedata.BleedBox = getbox(pageobject,"BleedBox")
                pagedata.ArtBox   = getbox(pageobject,"ArtBox")
                pagedata.TrimBox  = getbox(pageobject,"TrimBox")
                pages[pagenumber]    = pagedata
                xrefs[pagedata]      = pagereference
                cache[pagereference] = pagedata
            end
        end
    end

    return pages
end
537
-- Loaded documents are registered by filename as well as by document table.

local loaded    = { }
local nofloaded = 0

-- Opens a pdf document and returns a table that gives lazy access to it.
--
-- filename      : the file to open, or the pdf data itself when fromstring
-- userpassword  : optional, tried when the document reports a negative status
-- ownerpassword : optional, tried when the status is still negative
-- fromstring    : when set, filename is taken as the data (pdfnew)
--
-- The document is returned, or nil when it could not be opened or decrypted.
-- A document that has been loaded before is returned from the cache; a failed
-- attempt is registered as false and therefore tried again next time.

function lpdf_epdf.load(filename,userpassword,ownerpassword,fromstring)
    local document = loaded[filename]
    if not document then
        statistics.starttiming(lpdf_epdf)
        local __data__
        if fromstring then
            __data__ = pdfnew(filename,#filename)
        else
            local f = ioopen(filename,"rb")
            __data__ = f and pdfopenfile(f)
        end
        if not __data__ then
            -- this test used to check a mistyped (global) variable
            report_epdf("the document is damaged or empty")
            document = false
        else
            if userpassword and getstatus(__data__) < 0 then
                unencrypt(__data__,userpassword,nil)
            end
            if ownerpassword and getstatus(__data__) < 0 then
                unencrypt(__data__,nil,ownerpassword)
            end
            if getstatus(__data__) < 0 then
                report_epdf("the document is encrypted, provide proper passwords")
                document = false
            else
                local __cache__ = { }
                local __xrefs__ = { }
                document = {
                    filename   = filename,
                    nofcopied  = 0,
                    copied     = { },
                    __cache__  = __cache__,
                    __xrefs__  = __xrefs__,
                    __fonts__  = { },
                    __copied__ = { },
                    __data__   = __data__,
                }
                document.Catalog = some_dictionary(getcatalog(__data__),document)
                document.Info    = some_dictionary(getinfo(__data__),document)
                document.Trailer = some_dictionary(gettrailer(__data__),document)

                -- other fields (pages, destinations ...) resolve on demand
                setmetatableindex(document,resolve)

                document.majorversion, document.minorversion = getversion(__data__)

                document.nofpages = getnofpages(__data__)

                -- objects can be accessed by number or by raw reference
                -- entry; they are shared with the reference cache
                document.objects = setmetatableindex(function(t,objnum)
                    local kind = type(objnum)
                    if kind == "table" and objnum[1] == reference_object_code then
                        objnum = objnum[3]
                        kind   = type(objnum)
                    end
                    if kind == "number" then
                        local cached = __cache__[objnum]
                        if not cached then
                            cached = some_object(document,objnum)
                            __cache__[objnum] = cached
                            __xrefs__[cached] = objnum
                        end
                        return cached
                    end
                end)
            end
        end
        loaded[filename] = document
        if document then
            -- no need to register a 'false' key on failure
            loaded[document] = document
        end
        statistics.stoptiming(lpdf_epdf)
    end
    if document then
        nofloaded = nofloaded + 1
    end
    return document or nil
end
623
-- Passes on what getobjectrange reports for object n of an already loaded
-- file. Nothing is returned when the file has not been loaded.

function lpdf_epdf.objectrange(filename,n)
    local document = loaded[filename]
    if not document then
        return
    end
    return getobjectrange(document.__data__,n)
end
630
-- Closes a loaded document and removes it from the registry. The argument is
-- either a filename or a document table (of which the filename field is
-- used). Unknown names are silently ignored.

function lpdf_epdf.unload(filename)
    local name = filename
    if type(name) == "table" then
        name = name.filename
    end
    if type(name) ~= "string" then
        return
    end
    local document = loaded[name]
    if not document then
        return
    end
    loaded[document] = nil
    loaded[name]     = nil
    pdfclose(document.__data__)
end
644
-- Closes a document that is passed as table and removes both of its registry
-- entries. Documents that are not registered are left alone.
-- NOTE(review): this one lives in lpdf and not in lpdf_epdf like its
-- neighbours; confirm that this is intentional.

function lpdf.close(document)
    if not loaded[document] then
        return
    end
    local filename = document.filename
    loaded[document] = nil
    loaded[filename] = nil
    pdfclose(document.__data__)
end
652
653
654
-- An iterator over a wrapped object: we walk over the keys of the raw table
-- but return the values as seen through the wrapper (so: resolved). The
-- traversal stops as soon as a raw value is false or absent.

local function expanded(t)
    local function step(raw,previous)
        local key, rawvalue = next(raw,previous)
        if not rawvalue then
            return
        end
        return key, t[key]
    end
    return step, t.__raw__, nil
end
664
665
666lpdf_epdf.expanded = expanded
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
-- Escape sequences in literal strings are mapped onto the two digit hex
-- representation of the character they stand for. An escaped newline is a
-- line continuation and vanishes.

local h_hex_2 = lpdf.h_hex_2

local remapper = { }

remapper["\\("]  = h_hex_2["("]
remapper["\\)"]  = h_hex_2[")"]
remapper["\\n"]  = h_hex_2["\n"]
remapper["\\r"]  = h_hex_2["\r"]
remapper["\\t"]  = h_hex_2["\t"]
remapper["\\b"]  = h_hex_2["\b"]
remapper["\\f"]  = h_hex_2["\f"]
remapper["\\\n"] = ""
remapper["\\\r"] = ""
remapper["\\\\"] = h_hex_2["\\"]

-- The pattern is made from the entries above, so it has to be created before
-- we hook in the octal fallback.

local p_remap = lpeg.utfchartabletopattern(remapper)

-- Unknown keys are taken as octal numbers; the result is remembered.

setmetatableindex(remapper,function(t,k)
    local hex = h_hex_2[char(octtointeger(k))]
    t[k] = hex
    return hex
end)
709
-- Both string flavours end up as { "hex", <hex digits> }: a hex string is
-- taken as is while the content of a literal string is converted character
-- by character (after resolving escapes).

local p_hex_string, p_dec_string

do

    local octdigit = lpegpatterns.octdigit

    local escaped  = p_remap / remapper
    local octal    = P("\\")/"" * ((octdigit * octdigit^-2) / remapper)
    local anychar  = P(1) / h_hex_2
    local content  = (escaped + octal + anychar - P(")"))^0

    p_hex_string = Ct(Cc("hex") * P("<") * Cs((1 - P(">"))^1) * P(">"))
    p_dec_string = Ct(Cc("hex") * P("(") * Cs(content) * P(")"))

end
728
-- The content stream parser. Operands become tagged tables like
-- { "name", ... }, { "number", ... }, { "dict", ... }, { "array", ... } and
-- { "hex", ... }; an operation is a table with operands followed by the
-- operator as string. Characters that fit nowhere are skipped.

local spaces    = lpegpatterns.whitespace^1
local optspaces = lpegpatterns.whitespace^0
local comment   = P("%") * (1 - lpegpatterns.newline)^0

local numchar   = P("\\")/"" * (R("09")^3/function(s) return char(octtointeger(s)) end)
                + P("\\") * P(1)

local key       = P("/") * C(R("AZ","az","09","__","--")^1)
local number    = Ct(Cc("number") * (lpegpatterns.number/dectointeger))
local keyword   = Ct(Cc("name") * key)
local operator  = C((R("AZ","az") + S("*'\""))^1)

local grammar = P {
    "start",
    start      = (comment + keyword + number + V("dictionary") + V("array") + V("hexstring") + V("decstring") + spaces)^1,
    keyvalue   = key * optspaces * V("start"),
    dictionary = Ct(Cc("dict")  * P("<<") * Ct(V("keyvalue")^1) * P(">>")),
    array      = Ct(Cc("array") * P("[")  * Ct(V("start")^1)    * P("]")),
    hexstring  = p_hex_string,
    decstring  = p_dec_string,
}

local operation = Ct(grammar^1 * operator + operator)
local parser    = Ct((operation + P(1))^1)
752
-- The fast variant of the parser differs in one aspect: numbers are captured
-- as plain strings instead of tagged (and converted) tables.

local number = C(lpegpatterns.number)

local fastgrammar = P {
    "start",
    start      = (comment + keyword + number + V("dictionary") + V("array") + V("hexstring") + V("decstring") + spaces)^1,
    keyvalue   = key * optspaces * V("start"),
    dictionary = Ct(Cc("dict")  * P("<<") * Ct(V("keyvalue")^1) * P(">>")),
    array      = Ct(Cc("array") * P("[")  * Ct(V("start")^1)    * P("]")),
    hexstring  = p_hex_string,
    decstring  = p_dec_string,
}

local fastoperation = Ct(fastgrammar^1 * operator + operator)
local fastparser    = Ct((fastoperation + P(1))^1)

-- Parses a content stream into a list of operations, using the fast parser
-- when the second argument is set.

function lpdf_epdf.parsecontent(str,fast)
    local chosen = parser
    if fast then
        chosen = fastparser
    end
    return lpegmatch(chosen,str)
end
770
771
772
-- The stripper removes marked content operators from a content stream: a tag
-- followed by a property dictionary and BDC, BMC, DP or MP as well as EMC. A
-- property dictionary that has an /ActualText key doesn't qualify for the
-- 'xictionary' rule, so such a sequence is not dropped by the skip rule.

local numchar   = P("\\") * (R("09")^3 + P(1))
local number    = lpegpatterns.number
local keyword   = P("/") * R("AZ","az","09","__")^1
local operator  = (R("AZ","az") + S("*'\""))^1

local skipstart = P("BDC") + P("BMC") + P("DP") + P("MP")
local skipstop  = P("EMC")
local skipkeep  = P("/ActualText")

local grammar = P {
    "skip",
    start      = keyword + number + V("dictionary") + V("array") + V("hexstring") + V("decstring") + spaces,
    keyvalue   = optspaces * (keyword * optspaces * V("start") * optspaces)^1,
    xeyvalue   = optspaces * ((keyword - skipkeep) * optspaces * V("start") * optspaces)^1,
    array      = P("[")  * V("start")^0    * P("]"),
    dictionary = P("<<") * V("keyvalue")^0 * P(">>"),
    xictionary = P("<<") * V("xeyvalue")^0 * P(">>"),
    hexstring  = P("<")  * (1 - P(">"))^0  * P(">"),
    decstring  = P("(")  * (numchar + 1 - P(")"))^0 * P(")"),
    skip       = (optspaces * (keyword * optspaces * V("xictionary") * optspaces * skipstart + skipstop) / "")
               + V("start")
               + operator
}

local stripper = Cs((grammar + P(1))^1)

-- Returns the content stream without marked content operators. Streams that
-- have no EMC in them are returned untouched.

function lpdf_epdf.stripcontent(str)
    if not find(str,"EMC") then
        return str
    end
    return lpegmatch(stripper,str)
end
805
806
807
808
809
810
811
812local fromsixteen = lpdf.fromsixteen
813
-- A bfchar entry: source code 'a' (hex) maps onto the utf-16 hex string 'b'.

local function f_bfchar(t,a,b)
    local code = hextointeger(a)
    t[code] = fromsixteen(b)
end
817
-- A bfrange entry of the form <first> <last> <destination>: consecutive
-- source codes map onto consecutive destinations, where only the last 16 bit
-- unit of the destination is incremented. This used to be a stub that printed
-- a message on the console and ignored the range.

local function f_bfrange_1(t,a,b,c)
    local first = hextointeger(a)
    local last  = hextointeger(b)
    -- everything before the last four hex digits is kept as is
    local prefix = string.sub(c,1,-5)
    local base   = hextointeger(string.sub(c,-4))
    if not (first and last and base) then
        return
    end
    for code=first,last do
        t[code] = fromsixteen(prefix .. string.format("%04X",base + (code - first)))
    end
end
823
-- A bfrange entry of the form <first> <last> [ <d1> <d2> ... ]: the n-th
-- source code in the range maps onto the n-th destination in the list 'c'.
-- Destinations beyond the end of the range are ignored. This used to be a
-- stub that printed a message on the console and ignored the range.

local function f_bfrange_2(t,a,b,c)
    local first = hextointeger(a)
    local last  = hextointeger(b)
    if not (first and last) then
        return
    end
    for i=1,#c do
        local code = first + i - 1
        if code > last then
            break
        end
        t[code] = fromsixteen(c[i])
    end
end
829
-- The ToUnicode parser only looks at the bfchar and bfrange sections and
-- skips everything else. The table that is passed as extra argument collects
-- the mappings and is also the result of a match.

local optionals = spaces^0
local hexstring = optionals * P("<") * C((1-P(">"))^1) * P(">")

local bfchar    = Carg(1) * hexstring * hexstring / f_bfchar

local bfrange   = Carg(1) * hexstring * hexstring * hexstring / f_bfrange_1
                + Carg(1) * hexstring * hexstring * optionals * P("[") * Ct(hexstring^1) * optionals * P("]") / f_bfrange_2

local fromunicode = (
    P("beginbfchar" ) * bfchar ^1 * optionals * P("endbfchar" )
  + P("beginbfrange") * bfrange^1 * optionals * P("endbfrange")
  + spaces
  + P(1)
)^1 * Carg(1)

lpdf_epdf.helpers = { }

-- Converts the content of a ToUnicode stream into a mapping table. A false
-- or nil argument is passed back as is.

function lpdf_epdf.helpers.tounicodetable(tounicode)
    if not tounicode then
        return tounicode
    end
    return (lpegmatch(fromunicode,tounicode,1,{}))
end
847
848
849
-- Registers the fonts of a resource dictionary in document.__fonts__ (keyed
-- by resource name) and returns that table. For each font that we haven't
-- seen yet the ToUnicode stream, when present, is parsed into a mapping
-- table; fonts without such a stream get an empty mapping.

local function analyzefonts(document,resources)
    local fonts    = document.__fonts__
    local fontlist = resources and resources.Font
    if fontlist then
        for id, data in expanded(fontlist) do
            if not fonts[id] then
                -- The ToUnicode entry is optional so we can't blindly call
                -- it: that raised an error for fonts that lack one.
                local stream    = data and data.ToUnicode
                local tounicode = stream and stream()
                if tounicode then
                    tounicode = lpegmatch(fromunicode,tounicode,1,{})
                end
                local font = {
                    tounicode = type(tounicode) == "table" and tounicode or { }
                }
                fonts[id] = font
                setmetatableindex(font,"self")
            end
        end
    end
    return fonts
end

lpdf_epdf.analyzefonts = analyzefonts
875
-- State shared by the converters below: 'more' holds a pending high
-- surrogate and 'unic' is the tounicode mapping of the current font (set by
-- the code that runs the converters).

local more = 0
local unic = nil

-- Converts one group of four hex digits. A high surrogate is remembered and
-- combined with the unit that follows.

local p_hex_to_utf = C(4) / function(s)
    local now = hextointeger(s)
    if more > 0 then
        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
        more = 0
        return unic[now] or utfchar(now)
    elseif now >= 0xD800 and now <= 0xDBFF then
        more = now
        -- We need to return an empty string here: without a value the
        -- substitution capture keeps the matched hex digits in the result.
        return ""
    else
        return unic[now] or utfchar(now)
    end
end

-- Converts one byte of a literal string.

local p_dec_to_utf = C(1) / function(s)
    local now = byte(s)
    return unic[now] or utfchar(now)
end

-- The final converters reset the surrogate state before they start.

local p_hex_to_utf = P(true) / function() more = 0 end * Cs(p_hex_to_utf^1)
local p_dec_to_utf = P(true) / function() more = 0 end * Cs(p_dec_to_utf^1)
900
901
902
-- Turns the Contents entry of a page into one string: a stream wrapper is
-- read, an array wrapper has its entries read and joined by spaces. Anything
-- else is passed back untouched.

local function allcontent(content)
    if type(content) ~= "table" then
        return content
    end
    local ctype = content.__type__
    if ctype == stream_object_code then
        local data = content()
        return data
    elseif ctype == array_object_code then
        local parts = { }
        for i=1,#content do
            parts[i] = content[i]()
        end
        return concat(parts," ")
    end
    return content
end

lpdf_epdf.allcontent = allcontent
920
-- Parses the content stream(s) of a page into a list of operations.
--
-- document   : a loaded document
-- pagenumber : index into document.pages
-- asis       : when set the parsed list is returned without text conversion
-- fast       : when set the fast parser is used
--
-- Unless asis is set, the string operands of the text showing operators are
-- converted to text: in a TJ array each item is replaced by its text (or
-- value) and for Tj, ' and " the string operand itself is replaced. Nothing
-- is returned when the page is unknown or when nothing could be parsed.

function lpdf_epdf.getpagecontent(document,pagenumber,asis,fast)

    local page = document.pages[pagenumber]

    if not page then
        return
    end

    local content = allcontent(page.Contents or "")
    local list    = lpegmatch(fast and fastparser or parser,content)

    if asis or not list then
        -- no match gives nil and there is nothing to convert then
        return list
    end

    local fonts = analyzefonts(document,page.Resources)

    -- The converters index 'unic' so we need a table even when text is shown
    -- before a font gets selected.

    unic = { }

    for i=1,#list do
        local entry    = list[i]
        local size     = #entry
        local operator = entry[size]
        if operator == "Tf" then
            local name = entry[1]
            local font = type(name) == "table" and fonts[name[2]]
            unic = font and font.tounicode or { }
        elseif operator == "TJ" then
            local data  = entry[1]
            local items = type(data) == "table" and data[2]
            if type(items) == "table" then
                for j=1,#items do
                    local item = items[j]
                    local kind = item[1]
                    if kind == "hex" then
                        items[j] = lpegmatch(p_hex_to_utf,item[2]) or ""
                    elseif kind == "string" then
                        items[j] = lpegmatch(p_dec_to_utf,item[2]) or ""
                    else
                        items[j] = item[2]
                    end
                end
            end
        elseif operator == "Tj" or operator == "'" or operator == '"' then
            -- The string is the last operand and comes as { kind, value }.
            -- The old code indexed the value (a string) as if it was a table
            -- and referred to an undefined variable, so nothing got
            -- converted. We now replace the operand by its text, like we do
            -- with the items of a TJ array.
            local data = entry[size-1]
            if type(data) == "table" then
                local kind = data[1]
                if kind == "hex" then
                    entry[size-1] = lpegmatch(p_hex_to_utf,data[2]) or ""
                elseif kind == "string" then
                    entry[size-1] = lpegmatch(p_dec_to_utf,data[2]) or ""
                end
            end
        end
    end

    unic = nil

    return list

end
984
985
986
987
-- A pattern that matches a soft hyphen at the end of a string, and the factor
-- that decides when a vertical move counts as a new line.

local softhyphen = utfchar(0xAD) .. "$"
local linefactor = 1.3

-- Converts a (text converted) operation list into a plain string. Strings
-- shown by TJ and Tj are collected, a TJ displacement below -50 becomes a
-- space and a vertical move (cm, Tm) of more than linefactor times the last
-- font size becomes a newline, unless the collected text ends with a soft
-- hyphen.

function lpdf_epdf.contenttotext(document,list)
    local last_y = 0
    local last_f = 0
    local text   = { }
    local last   = 0

    for i=1,#list do
        local entry    = list[i]
        local size     = #entry
        local operator = entry[size]
        if operator == "Tf" then
            local fontsize = entry[2]
            fontsize = type(fontsize) == "table" and fontsize[2]
            if type(fontsize) == "number" then
                last_f = fontsize
            end
        elseif operator == "TJ" then
            local data  = entry[1]
            local items = type(data) == "table" and data[2]
            if type(items) == "table" then
                for j=1,#items do
                    local item = items[j]
                    local kind = type(item)
                    if kind == "string" then
                        last = last + 1
                        text[last] = item
                    elseif kind == "number" and item < -50 then
                        last = last + 1
                        text[last] = " "
                    end
                end
            end
        elseif operator == "Tj" then
            -- There used to be an unconditional increment of 'last' here
            -- which left a hole in the list and made concat fail.
            local item = entry[size-1]
            if type(item) == "string" then
                last = last + 1
                text[last] = item
            end
        elseif operator == "cm" or operator == "Tm" then
            local ty = entry[6]
            ty = type(ty) == "table" and ty[2]
            if type(ty) == "number" then
                local dy = abs(last_y - ty)
                if dy > linefactor*last_f and last > 0 then
                    -- This has to be a pattern match: in a plain find the
                    -- "$" is taken literally and we never have a match.
                    if not find(text[last],softhyphen) then
                        last = last + 1
                        text[last] = "\n"
                    end
                end
                last_y = ty
            end
        end
    end

    return concat(text)
end
1045
-- Serializes a parsed operation list back into content stream syntax. All
-- pieces are joined by spaces, except for the array of a TJ operation, which
-- is first assembled into one piece without spaces in between.

function lpdf_epdf.contenttostring(contents)
    local r       = 0
    local result  = { }
    local compact = false

    -- appends to whatever the current result is
    local function add(piece)
        r = r + 1
        result[r] = piece
    end

    local flatten

    flatten = function(t)
        local nt = #t
        compact = t[nt] == "TJ"
        for i=1,nt do
            local ti = t[i]
            if type(ti) ~= "table" then
                add(ti)
            else
                local t1 = ti[1]
                local t2 = ti[2]
                if t1 == "array" then
                    if compact then
                        -- collect in a temporary list and add it as one piece
                        local sr, sresult = r, result
                        r, result = 1, { "[" }
                        flatten(t2)
                        add("]")
                        sr = sr + 1
                        sresult[sr] = concat(result)
                        r, result = sr, sresult
                    else
                        add("[")
                        flatten(t2)
                        add("]")
                    end
                elseif t1 == "dict" then
                    add("<<")
                    flatten(t2)
                    add(">>")
                elseif t1 == "hex" then
                    add("<" .. t2 .. ">")
                elseif t1 == "dec" then
                    add(lpdf.toeight(t2))
                elseif type(t2) == "number" then
                    add(t2)
                else
                    add("/" .. t2)
                end
            end
        end
    end

    for i=1,#contents do
        flatten(contents[i])
    end

    return concat(result," ")
end
1098
-- Reports the marked content structure of an operation list: BDC and EMC
-- determine the nesting depth and the text shown by TJ and Tj is reported at
-- the current depth.

function lpdf_epdf.getstructure(document,list)
    local depth = 0
    for i=1,#list do
        local entry    = list[i]
        local size     = #entry
        local operator = entry[size]
        if operator == "BDC" then
            report_epdf("%w%s : %s",depth,entry[1] or "?",entry[2] and entry[2].MCID or "?")
            depth = depth + 1
        elseif operator == "EMC" then
            depth = depth - 1
        elseif operator == "TJ" then
            -- The operand is a tagged { "array", items } table (compare
            -- contenttotext), so we have to loop over the items and not over
            -- the wrapper. A plain list is still accepted.
            local items = entry[1]
            if type(items) == "table" and items[1] == "array" and type(items[2]) == "table" then
                items = items[2]
            end
            if type(items) == "table" then
                for j=1,#items do
                    local item = items[j]
                    local kind = type(item)
                    if kind == "string" then
                        report_epdf("%w > %s",depth,item)
                    elseif kind == "number" and item < -50 then
                        -- comparing something else with a number is an error
                        report_epdf("%w >",depth,item)
                    end
                end
            end
        elseif operator == "Tj" then
            report_epdf("%w > %s",depth,entry[size-1])
        end
    end
end
1125
1126if images then do
1127
1128
1129
1130
1131 local recompress = false
1132 local stripmarked = false
1133
1134 local copydictionary = nil
1135 local copyarray = nil
1136
1137 local pdfreference = lpdf.reference
1138 local pdfconstant = lpdf.constant
1139 local pdfarray = lpdf.array
1140 local pdfdictionary = lpdf.dictionary
1141 local pdfnull = lpdf.null
1142 local pdfliteral = lpdf.literal
1143
1144 local pdfreserveobject = lpdf.reserveobject
1145 local shareobjectreference = lpdf.shareobjectreference
1146 local pdfflushobject = lpdf.flushobject
1147 local pdfflushstreamobject = lpdf.flushstreamobject
1148
1149 local report = logs.reporter("backend","xobjects")
1150
1151 local factor = 65536 / (7200/7227)
1152
1153 local createimage = images.create
1154
1155 directives.register("graphics.pdf.recompress", function(v) recompress = v end)
1156 directives.register("graphics.pdf.stripmarked", function(v) stripmarked = v end)
1157
1158 local function scaledbbox(b)
1159 return { b[1]*factor, b[2]*factor, b[3]*factor, b[4]*factor }
1160 end
1161
1162 local codecs = {
1163 ASCIIHexDecode = true,
1164 ASCII85Decode = true,
1165 RunLengthDecode = true,
1166 FlateDecode = true,
1167 LZWDecode = true,
1168 }
1169
1170 local function deepcopyobject(xref,copied,value)
1171
1172 local objnum = xref[value]
1173 if objnum then
1174 local usednum = copied[objnum]
1175 if usednum then
1176
1177 else
1178 usednum = pdfreserveobject()
1179 copied[objnum] = usednum
1180 local entry = value
1181 local kind = entry.__type__
1182 if kind == array_object_code then
1183 local a = copyarray(xref,copied,entry)
1184 pdfflushobject(usednum,tostring(a))
1185 elseif kind == dictionary_object_code then
1186 local d = copydictionary(xref,copied,entry)
1187 pdfflushobject(usednum,tostring(d))
1188 elseif kind == stream_object_code then
1189 local d = copydictionary(xref,copied,entry)
1190 local filter = d.Filter
1191 if filter and codecs[filter] and recompress then
1192
1193 d.Filter = nil
1194 d.Length = nil
1195 d.DecodeParms = nil
1196 d.DL = nil
1197 local s = entry()
1198 pdfflushstreamobject(s,d,true,usednum)
1199 else
1200
1201
1202 local s = entry(false)
1203
1204 pdfflushstreamobject(s,d,"raw",usednum)
1205 end
1206 else
1207 local t = type(value)
1208 if t == "string" then
1209 value = pdfconstant(value)
1210 elseif t == "table" then
1211 local kind = value[1]
1212 local entry = value[2]
1213 if kind == name_object_code then
1214 value = pdfconstant(entry)
1215 elseif kind == string_object_code then
1216 value = pdfliteral(entry,value[3])
1217 elseif kind == null_object_code then
1218 value = pdfnull()
1219 elseif kind == reference_object_code then
1220 value = deepcopyobject(xref,copied,entry)
1221 elseif entry == nil then
1222 value = pdfnull()
1223 else
1224 value = tostring(entry)
1225 end
1226 end
1227 pdfflushobject(usednum,value)
1228 end
1229 end
1230 return pdfreference(usednum)
1231 elseif kind == stream_object_code then
1232 report("stream not done: %s", objectcodes[kind] or "?")
1233 else
1234 report("object not done: %s", objectcodes[kind] or "?")
1235 end
1236 end
1237
1238 local function copyobject(xref,copied,object,key,value)
1239 if not value then
1240 value = object.__raw__[key]
1241 end
1242 local t = type(value)
1243 if t == "string" then
1244 return pdfconstant(value)
1245 elseif t ~= "table" then
1246 return value
1247 end
1248 local kind = value[1]
1249 if kind == name_object_code then
1250 return pdfconstant(value[2])
1251 elseif kind == string_object_code then
1252 return pdfliteral(value[2],value[3])
1253 elseif kind == array_object_code then
1254 return copyarray(xref,copied,object[key])
1255 elseif kind == dictionary_object_code then
1256 return copydictionary(xref,copied,object[key])
1257 elseif kind == null_object_code then
1258 return pdfnull()
1259 elseif kind == reference_object_code then
1260 return deepcopyobject(xref,copied,object[key])
1261 elseif kind == lpdf_object_code then
1262 return value[2]
1263 else
1264
1265 end
1266 end
1267
1268 copyarray = function(xref,copied,object)
1269 local target = pdfarray()
1270 local source = object.__raw__
1271 for i=1,#source do
1272 target[i] = copyobject(xref,copied,object,i,source[i])
1273 end
1274 return target
1275 end
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285 copydictionary = function (xref,copied,object)
1286 local target = pdfdictionary()
1287 local source = object.__raw__
1288
1289 for key, value in next, source do
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299 target[key] = copyobject(xref,copied,object,key,value)
1300
1301 end
1302 return target
1303 end
1304
-- Short names for the loader and unloader of this module.
local openpdf  = lpdf_epdf.load
local closepdf = lpdf_epdf.unload

-- Calls the loader with the three given arguments plus true as fourth one and
-- passes on whatever the loader returns. Presumably the extra flag tells the
-- loader that str holds the document itself instead of a filename; this has
-- to be confirmed in lpdf_epdf.load.
local newpdf = function(str,userpassword,ownerpassword)
    return openpdf(str,userpassword,ownerpassword,true)
end
1313
-- Maps the size names that querypdf accepts onto the keys of the page boxes.
local sizes = {
    media = "MediaBox",
    crop  = "CropBox",
    bleed = "BleedBox",
    trim  = "TrimBox",
    art   = "ArtBox",
}

-- Collects the properties of one page of a loaded document in a table.
--
-- pdfdoc     : the loaded document; nothing is returned when it is not given
-- pagenumber : the page to look at, page 1 when not given
-- size       : a key of the sizes table, "crop" when not given or not known
--
-- Nothing is returned when the page does not exist. Be aware that the box
-- that size selects is the one reported as cropbox (and used for the bounding
-- box and the sizes); it falls back on the media box, and a missing media box
-- becomes { 0, 0, 0, 0 }.
local function querypdf(pdfdoc,pagenumber,size)
    if not pdfdoc then
        return
    end
    if not pagenumber then
        pagenumber = 1
    end
    -- the catalog is fetched but not used any further in this function
    local root = pdfdoc.Catalog
    local page = pdfdoc.pages[pagenumber]
    if not page then
        return
    end
    local boxname  = sizes[size or "crop"] or sizes.crop
    local mediabox = page.MediaBox or { 0, 0, 0, 0 }
    local usedbox  = page[boxname] or mediabox
    local pageinfo = {
        filename    = pdfdoc.filename,
        pagenumber  = pagenumber,
        nofpages    = pdfdoc.nofpages,
        boundingbox = scaledbbox(usedbox),
        cropbox     = usedbox,
        mediabox    = mediabox,
        -- the other boxes fall back on the selected box
        bleedbox    = page.BleedBox or usedbox,
        trimbox     = page.TrimBox or usedbox,
        artbox      = page.ArtBox or usedbox,
        rotation    = page.Rotate or 0,
        xsize       = usedbox[3] - usedbox[1],
        ysize       = usedbox[4] - usedbox[2],
    }
    return pageinfo
end
1350
-- Copies a resources dictionary with copydictionary and returns what
-- shareobjectreference makes of that copy. Nothing is returned when there are
-- no resources. The pdfdoc argument is not used here.
local function copyresources(pdfdoc,xref,copied,Resources)
    if not Resources then
        return
    end
    local duplicate = copydictionary(xref,copied,Resources)
    return shareobjectreference(duplicate)
end
1357
local variables = interfaces.variables

-- Turns one page of a loaded document into a form xobject that is wrapped in
-- an image (the result of createimage).
--
-- pdfdoc     : the loaded document; nothing is done when it is not given
-- pagenumber : the page to copy, page 1 when not given
-- attributes : optional; run through expanded and the resulting key/value
--              pairs are set on the page
-- compact    : when set, and the page has resources, lpdf_epdf.pageplugin (if
--              there is one) is called before the content is collected
-- width      : not used here
-- height     : not used here
-- attr       : added (with +) to the xobject dictionary
-- copymeta   : variables.page, variables.document or variables.yes: take the
--              metadata of the page, of the catalog, or of the page with the
--              catalog as fallback; otherwise no metadata is copied
--
-- Nothing is returned when there is no document, when the page does not exist
-- or when the page has no contents.
local function copypage(pdfdoc,pagenumber,attributes,compact,width,height,attr,copymeta)
    if pdfdoc then
        local root     = pdfdoc.Catalog
        local page     = pdfdoc.pages[pagenumber or 1]
        local pageinfo = page and querypdf(pdfdoc,pagenumber)
        if not pageinfo then
            -- an unknown page: quit in the same silent way as for a missing
            -- document instead of failing on indexing nil below
            return
        end
        local contents = page.Contents
        if contents then
            local xref      = pdfdoc.__xrefs__
            local copied    = pdfdoc.__copied__
            local resources = page.Resources
            if compact and resources and lpdf_epdf.pageplugin then
                lpdf_epdf.pageplugin(pdfdoc,page,pagenumber,resources,compact)
                -- the contents are fetched again once the plugin has run
                contents = page.Contents
            end
            local metadata = nil
            if copymeta == variables.page or copymeta == variables.yes then
                metadata = copyobject(xref,copied,page,"Metadata")
            end
            if not metadata and (copymeta == variables.document or copymeta == variables.yes) then
                metadata = copyobject(xref,copied,root,"Metadata")
            end
            local xobject = pdfdictionary {
                Type           = pdfconstant("XObject"),
                Subtype        = pdfconstant("Form"),
                FormType       = 1,
                Group          = copyobject(xref,copied,page,"Group"),
                LastModified   = copyobject(xref,copied,page,"LastModified"),
                Metadata       = metadata,
                PieceInfo      = copyobject(xref,copied,page,"PieceInfo"),
                Resources      = copyresources(pdfdoc,xref,copied,resources),
                SeparationInfo = copyobject(xref,copied,page,"SeparationInfo"),
            } + attr
            if attributes then
                for k, v in expanded(attributes) do
                    page[k] = v
                end
            end
            local content  = ""
            local nolength = nil
            if type(contents) == "string" then
                content = contents
            else
                local ctype = contents.__type__
                if ctype == stream_object_code then
                    if stripmarked then
                        content = contents()
                        local stripped = lpdf_epdf.stripcontent(content)
                        if stripped ~= content then
                            content = stripped
                        end
                    elseif recompress then
                        content = contents()
                    else
                        local Filter = copyobject(xref,copied,contents,"Filter")
                        local Length = copyobject(xref,copied,contents,"Length")
                        if Length and Filter then
                            -- the Length and Filter of the source go along, so
                            -- the stream is asked for with false; presumably
                            -- that gives the data as stored (to be confirmed)
                            nolength = true
                            xobject.Length = Length
                            xobject.Filter = Filter
                            content = contents(false)
                        else
                            content = contents()
                        end
                    end
                elseif ctype == array_object_code then
                    -- several streams: fetch each one and join them by a space
                    content = { }
                    for i=1,#contents do
                        content[i] = contents[i]()
                    end
                    content = concat(content," ")
                end
            end
            local rotation    = pageinfo.rotation
            local boundingbox = pageinfo.boundingbox
            local transform   = nil
            -- a rotation can be negative or more than a full turn (-90, 450)
            -- so the degrees are brought into the range 0 upto 360 first
            local degrees     = rotation % 360
            if degrees == 90 then
                transform = 3
            elseif degrees == 180 then
                transform = 2
            elseif degrees == 270 then
                transform = 1
            elseif rotation > 1 and rotation < 4 then
                -- a small value is passed on as transform as it is
                transform = rotation
            end
            -- the bounding box as reported by querypdf, multiplied by bpfactor
            xobject.BBox = pdfarray {
                boundingbox[1] * bpfactor,
                boundingbox[2] * bpfactor,
                boundingbox[3] * bpfactor,
                boundingbox[4] * bpfactor,
            }
            return createimage {
                bbox      = boundingbox,
                transform = transform,
                nolength  = nolength,
                nobbox    = true,
                notype    = true,
                stream    = content,
                attr      = xobject(),
                kind      = images.types.stream,
            }
        else
            -- a page without contents: nothing is returned
        end
    end
end
1479
-- The image related functions defined above, collected under one name.
lpdf_epdf.image = {
    new   = newpdf,   -- openpdf with the extra flag set
    open  = openpdf,  -- lpdf_epdf.load
    close = closepdf, -- lpdf_epdf.unload
    query = querypdf, -- properties of a page
    copy  = copypage, -- a page as form xobject image
}
1487
1488
1489
1490
1491
1492
1493
1494end end
1495
-- Returns the producer of a document, or an empty string when it can't be
-- found (or when no document is given).
--
-- The Producer entry of the Info dictionary is taken when there is one.
-- Otherwise the Metadata stream of the catalog is converted with xml.convert
-- and xml.text is asked for "rdf:Description/pdf:Producer" and, when that
-- gives nothing or an empty string, for "Producer". A document that has no
-- catalog is treated like one that has no metadata.
function lpdf_epdf.producer(pdfdoc)
    if not pdfdoc then
        return ""
    end
    local info     = pdfdoc.Info
    local producer = info and info.Producer
    if not producer then
        local catalog  = pdfdoc.Catalog
        local metadata = catalog and catalog.Metadata
        if metadata then
            -- calling the metadata object gives the data that gets converted
            local x = xml.convert(metadata())
            if x then
                producer = xml.text(x,"rdf:Description/pdf:Producer")
                if not producer or producer == "" then
                    producer = xml.text(x,"Producer")
                end
            end
        end
    end
    return producer or ""
end
1518
-- Expands a compact list of widths into a table that is indexed by code.
--
-- Two kinds of entries are recognized in widths:
--
--   first { w1 w2 .. wn } : consecutive codes, starting at first, get w1 .. wn
--   first last w          : every code from first upto last gets w
--
-- (This looks like the layout of the W array of a CID font; to be confirmed
-- with the callers.) The widths are stored in expanded, which is a new table
-- when none is passed. Returned are that table plus the lowest and the
-- highest code that got a width, or zeros when no width was set. An entry at
-- the end that is not complete is ignored.
function lpdf_epdf.expandwidths(widths,expanded)
    if not expanded then
        expanded = { }
    end
    local min = false
    local max = false
    -- The entries don't need to come in ascending order so both ends are
    -- checked for every range; a range without codes is not taken into
    -- account.
    local function register(first,last)
        if last < first then
            return
        elseif not min then
            min = first
            max = last
        else
            if first < min then
                min = first
            end
            if last > max then
                max = last
            end
        end
    end
    local i = 1
    local n = #widths
    while i < n do
        local w1 = widths[i] ; i = i + 1
        local w2 = widths[i] ; i = i + 1
        if type(w2) == "table" then
            local k  = 1
            local wn = w1 + #w2 - 1
            for j=w1,wn do
                expanded[j] = w2[k]
                k = k + 1
            end
            register(w1,wn)
        else
            local w3 = widths[i] ; i = i + 1
            if w3 then
                for j=w1,w2 do
                    expanded[j] = w3
                end
                register(w1,w2)
            end
        end
    end
    return expanded, min or 0, max or 0
end
1560
-- Copies the entries 1 upto #widths into expanded (a new table when none is
-- passed), using the same indices. Returned are that table, the number 1 and
-- the length of widths.
function lpdf_epdf.mergewidths(widths,expanded)
    local target = expanded or { }
    local count  = #widths
    local index  = 0
    while index < count do
        index = index + 1
        target[index] = widths[index]
    end
    return target, 1, count
end
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585 |