# view pdfCrawl.py @ 25:0bbeb01a7681
#
# combine from various sources
# author Henry Thompson <ht@markup.co.uk>
# date Sat, 29 May 2021 21:32:41 +0100
# parents 2d7c91f89f6b
# children
# line wrap: on
# line source

import PyPDF2 as pyPdf, sys

# Crawl the PDF named by the first command-line argument: for every page,
# print the page index and the page object, then (when the page has an
# /Annots entry) the annotation array and the /URI value of each annotation
# whose /A dictionary contains one.
#
# Every print call below receives exactly one pre-formatted argument, so the
# output is the same whether the script runs under Python 2 or Python 3.

if len(sys.argv) < 2:
    # A missing argument used to surface as a bare IndexError traceback.
    sys.exit('usage: %s FILE.pdf' % sys.argv[0])

key = '/Annots'  # page entry holding the annotation array
uri = '/URI'     # entry of the /A dictionary holding the link target
ank = '/A'       # annotation entry holding the dictionary checked for /URI

# All use of the reader happens inside the with block, which guarantees the
# file handle is closed on exit, including when an exception escapes.
with open(sys.argv[1], 'rb') as f:
    pdf = pyPdf.PdfFileReader(f)
    pgs = pdf.getNumPages()

    # Debugging aid: print(pdf.getNamedDestinations())

    for pg in range(pgs):
        print('%s %s' % ('#', pg))
        p = pdf.getPage(pg)
        o = p.getObject()
        print(o)
        if key not in o:
            continue
        ann = o[key]
        print('%s %s %s' % (key, len(ann), ann))
        # i counts every annotation, not only those that end up printed.
        for i, a in enumerate(ann):
            u = a.getObject()
            # Test membership with `in` and then index with [], rather than
            # using .get(): indexing is the access form the original relied
            # on to obtain the nested dictionary.
            # Annotations without an /A entry are skipped instead of raising
            # KeyError and aborting the whole crawl.
            if ank not in u:
                continue
            action = u[ank]
            if uri not in action:
                continue
            target = action[uri]
            try:
                print('%s %s' % (i, target))
            except UnicodeEncodeError:
                # The output encoding cannot represent the URI; fall back to
                # its code points so the entry is still reported.
                print('%s %s' % (i, [ord(c) for c in target]))