Mercurial > hg > python
diff pdfCrawl.py @ 0:fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 09 Mar 2020 14:58:04 +0000 |
parents | |
children | 0a3abe59e364 |
line wrap: on
line diff
"""Print the link URIs found in the page annotations of a PDF file.

Usage: pdfCrawl.py FILE.pdf

For each page a line '# <page index>' is printed (indices start at 0, as
produced by range()), followed by one line 'U <uri>' for every annotation
on that page whose '/A' entry contains a '/URI' entry.
"""
import sys

import PyPDF2 as pyPdf

# PDF dictionary keys consulted while walking the page annotations.
key = '/Annots'   # per-page list of annotations
uri = '/URI'      # target entry inside an annotation's action dictionary
ank = '/A'        # action dictionary attached to an annotation

# The reader is handed the open stream, so all work stays inside the
# `with` block; the file is closed once the crawl finishes or fails.
# NOTE(review): presumably PdfFileReader reads from the stream lazily -
# confirm before moving any of this outside the `with`.
with open(sys.argv[1], 'rb') as f:
    pdf = pyPdf.PdfFileReader(f)
    pgs = pdf.getNumPages()

    for pg in range(pgs):
        # Single-argument print() with %-formatting yields the same text
        # under the Python 2 print statement and the Python 3 function.
        print('# %s' % pg)
        o = pdf.getPage(pg).getObject()
        if key not in o:
            # Page has no annotations at all.
            continue
        for a in o[key]:
            u = a.getObject()
            # Skip annotations that have no '/A' entry instead of raising
            # KeyError, and report only those whose action has a '/URI'.
            if ank in u and uri in u[ank]:
                print('U %s' % u[ank][uri])