Mercurial > hg > python
comparison pdfCrawl.py @ 4:2d7c91f89f6b
later ecclerig version
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 09 Mar 2020 17:38:52 +0000 |
parents | 0a3abe59e364 |
children |
comparison
equal
deleted
inserted
replaced
3:26d9c0308fcf | 4:2d7c91f89f6b |
---|---|
1 import PyPDF2 as pyPdf, sys | 1 import PyPDF2 as pyPdf, sys |
2 | |
3 if sys.argv[1]=='-v': | |
4 verbose=True | |
5 sys.argv.pop(1) | |
6 else: | |
7 verbose=False | |
8 | 2 |
9 f = open(sys.argv[1],'rb') | 3 f = open(sys.argv[1],'rb') |
10 | 4 |
11 pdf = pyPdf.PdfFileReader(f) | 5 pdf = pyPdf.PdfFileReader(f) |
12 pgs = pdf.getNumPages() | 6 pgs = pdf.getNumPages() |
18 | 12 |
19 for pg in range(pgs): | 13 for pg in range(pgs): |
20 print '#',pg | 14 print '#',pg |
21 p = pdf.getPage(pg) | 15 p = pdf.getPage(pg) |
22 o = p.getObject() | 16 o = p.getObject() |
23 #print >>sys.stderr,o | 17 print o |
24 if o.has_key(key): | 18 if o.has_key(key): |
25 ann = o[key] | 19 ann = o[key] |
26 #print >>sys.stderr,key,ann | 20 print key,len(ann),ann |
| 21 i=0 |
27 for a in ann: | 22 for a in ann: |
28 u = a.getObject() | 23 u = a.getObject() |
29 if ank in u and uri in u[ank]: | 24 if u[ank].has_key(uri): |
30 if verbose: | 25 try: |
31 print u[ank] | 26 print i,u[ank][uri] |
32 print u[ank][uri] | 27 except UnicodeEncodeError: |
| 28 print i,map(ord,u[ank][uri]) |
| 29 i+=1 |