Mercurial > hg > python
view pdfCrawl.py @ 1:0a3abe59e364
updated from more recent versions on origen
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Mon, 09 Mar 2020 16:45:20 +0000 |
parents | fee51ab07d09 |
children | 2d7c91f89f6b |
line wrap: on
line source
"""Print the URI link targets found in a PDF file, page by page.

Usage: pdfCrawl.py [-v] FILE.pdf

For every page a header line ``# <page index>`` is printed (indices start
at 0), followed by one line per link annotation on that page whose action
dictionary has a ``/URI`` entry.  With ``-v`` the whole action dictionary is
printed on the line before each URI.
"""
import sys

# PDF dictionary keys consulted while walking a page's annotations.
ANNOTS_KEY = '/Annots'   # page entry holding the list of annotations
ACTION_KEY = '/A'        # annotation entry holding its action dictionary
URI_KEY = '/URI'         # action entry holding the link target


def uri_actions(page_obj):
    """Yield the action dictionary of each URI link annotation on a page.

    page_obj is the mapping for one page.  Each element of its ``/Annots``
    entry must provide ``getObject()`` returning the annotation mapping.
    Pages without ``/Annots`` yield nothing, and annotations without an
    ``/A`` entry, or whose ``/A`` lacks ``/URI``, are skipped.
    """
    if ANNOTS_KEY not in page_obj:
        return
    for ref in page_obj[ANNOTS_KEY]:
        annot = ref.getObject()
        if ACTION_KEY in annot and URI_KEY in annot[ACTION_KEY]:
            yield annot[ACTION_KEY]


def main(argv=None):
    """Run the crawler and return a process exit status.

    argv is the argument list without the program name; it defaults to
    ``sys.argv[1:]``.  Returns 2 after printing a usage line to stderr when
    no PDF path was given, otherwise 0.
    """
    args = list(sys.argv[1:] if argv is None else argv)
    verbose = bool(args) and args[0] == '-v'
    if verbose:
        args.pop(0)
    if not args:
        sys.stderr.write('usage: %s [-v] FILE.pdf\n' % sys.argv[0])
        return 2

    # Imported here so that a usage error is reported without needing the
    # library, and so uri_actions() can be imported on its own.
    import PyPDF2 as pyPdf

    # The reader pulls objects from the stream on demand, so all page access
    # has to happen while the file is still open.
    with open(args[0], 'rb') as stream:
        pdf = pyPdf.PdfFileReader(stream)
        for pg in range(pdf.getNumPages()):
            print('# %d' % pg)
            for action in uri_actions(pdf.getPage(pg).getObject()):
                if verbose:
                    print(action)
                print(action[URI_KEY])
    return 0


if __name__ == '__main__':
    sys.exit(main())