view pdfCrawl.py @ 0:fee51ab07d09

blanket publication of all existing python files in lib/python on maritain
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 09 Mar 2020 14:58:04 +0000
parents
children 0a3abe59e364
line wrap: on
line source

import PyPDF2 as pyPdf, sys

# List the URI links found in the PDF named by the first command-line
# argument.  For every page a '# <page index>' line is printed, followed by
# one 'U <uri>' line for each annotation on that page whose action
# dictionary carries a /URI entry.

key = '/Annots'  # page-dictionary entry that holds the page's annotations
uri = '/URI'     # action-dictionary entry that holds the link target
ank = '/A'       # annotation-dictionary entry that holds the action

# Named destinations are not reported here; pdf.getNamedDestinations()
# would list them.

# Keep the file open for the whole crawl (the reader is handed the open
# file object and presumably reads from it as pages are fetched), and make
# sure it is closed afterwards even if parsing fails part-way through.
with open(sys.argv[1], 'rb') as f:
    pdf = pyPdf.PdfFileReader(f)
    pgs = pdf.getNumPages()

    for pg in range(pgs):
        # Single-argument parenthesised print gives the same output whether
        # it is parsed as a print statement or as a print() call.
        print('# %d' % pg)
        p = pdf.getPage(pg)
        o = p.getObject()
        if key not in o:
            # Page has no annotations at all.
            continue
        for a in o[key]:
            u = a.getObject()
            if ank not in u:
                # Annotation without an action entry: indexing u[ank]
                # would raise KeyError and abort the whole crawl.
                continue
            if uri in u[ank]:
                print('U %s' % (u[ank][uri],))