comparison pdfCrawl.py @ 4:2d7c91f89f6b

later ecclerig version
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 09 Mar 2020 17:38:52 +0000
parents 0a3abe59e364
children
comparison
equal deleted inserted replaced
3:26d9c0308fcf 4:2d7c91f89f6b
1 import PyPDF2 as pyPdf, sys 1 import PyPDF2 as pyPdf, sys
2
3 if sys.argv[1]=='-v':
4 verbose=True
5 sys.argv.pop(1)
6 else:
7 verbose=False
8 2
9 f = open(sys.argv[1],'rb') 3 f = open(sys.argv[1],'rb')
10 4
11 pdf = pyPdf.PdfFileReader(f) 5 pdf = pyPdf.PdfFileReader(f)
12 pgs = pdf.getNumPages() 6 pgs = pdf.getNumPages()
18 12
19 for pg in range(pgs): 13 for pg in range(pgs):
20 print '#',pg 14 print '#',pg
21 p = pdf.getPage(pg) 15 p = pdf.getPage(pg)
22 o = p.getObject() 16 o = p.getObject()
23 #print >>sys.stderr,o 17 print o
24 if o.has_key(key): 18 if o.has_key(key):
25 ann = o[key] 19 ann = o[key]
26 #print >>sys.stderr,key,ann 20 print key,len(ann),ann
21 i=0
27 for a in ann: 22 for a in ann:
28 u = a.getObject() 23 u = a.getObject()
29 if ank in u and uri in u[ank]: 24 if u[ank].has_key(uri):
30 if verbose: 25 try:
31 print u[ank] 26 print i,u[ank][uri]
32 print u[ank][uri] 27 except UnicodeEncodeError:
28 print i,map(ord,u[ank][uri])
29 i+=1