Mercurial > hg > python
annotate pdfCrawl.py @ 69:157f012ffab7 default tip
from local
author | Henry S Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 17 Jan 2025 15:45:26 +0000 |
parents | 2d7c91f89f6b |
children |
rev | line source |
---|---|
0
fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# pdfCrawl.py -- dump each page of a PDF and list the /URI targets of its
# annotations.
#
# Usage: python pdfCrawl.py FILE.pdf
#
# For every page this prints '# <page index>' and the page dictionary.  When
# the page has an /Annots entry it also prints the key, the number of
# annotations and the annotation array, followed by one line per annotation
# whose /A dictionary carries a /URI entry: '<annotation index> <uri>'.
#
# Output is produced with single-argument print(...) calls so the script
# behaves the same under Python 2 and Python 3.
import sys

import PyPDF2 as pyPdf

key = '/Annots'  # page-dictionary entry holding the page's annotations
uri = '/URI'     # entry of the /A dictionary holding the link target
ank = '/A'       # annotation entry holding the dictionary that may carry /URI

if len(sys.argv) < 2:
    sys.exit('usage: pdfCrawl.py FILE.pdf')

# Keep the file open for the whole crawl (pages are fetched from the reader
# inside the loop) and close it deterministically afterwards.
with open(sys.argv[1], 'rb') as f:
    pdf = pyPdf.PdfFileReader(f)
    pgs = pdf.getNumPages()

    # Debugging aid: print(pdf.getNamedDestinations())

    for pg in range(pgs):
        print('# %d' % pg)
        p = pdf.getPage(pg)
        o = p.getObject()
        print(o)
        if key not in o:
            continue
        ann = o[key]
        print('%s %d %s' % (key, len(ann), ann))
        # NOTE(review): the indentation of the original counter increment was
        # lost; the index is assumed to count every annotation, not only the
        # ones that carry a URI -- confirm against the repository copy.
        for i, a in enumerate(ann):
            u = a.getObject()
            # Skip annotations without an /A dictionary instead of failing
            # with KeyError on the first one encountered.
            if ank not in u or uri not in u[ank]:
                continue
            target = u[ank][uri]
            try:
                print('%s %s' % (i, target))
            except UnicodeEncodeError:
                # The output stream cannot encode the URI: fall back to the
                # list of its code points.
                print('%s %s' % (i, [ord(c) for c in target]))