diff pdfCrawl.py @ 0:fee51ab07d09

blanket publication of all existing python files in lib/python on maritain
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 09 Mar 2020 14:58:04 +0000
parents
children 0a3abe59e364
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pdfCrawl.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,24 @@
+import PyPDF2 as pyPdf, sys
+
+f = open(sys.argv[1],'rb')
+
+pdf = pyPdf.PdfFileReader(f)
+pgs = pdf.getNumPages()
+key = '/Annots'
+uri = '/URI'
+ank = '/A'
+
+#print pdf.getNamedDestinations()
+
+for pg in range(pgs):
+    print '#',pg
+    p = pdf.getPage(pg)
+    o = p.getObject()
+    #print >>sys.stderr,o
+    if o.has_key(key):
+        ann = o[key]
+        #print >>sys.stderr,key,ann
+        for a in ann:
+            u = a.getObject()
+            if u[ank].has_key(uri):
+                print "U",u[ank][uri]