diff pdfCrawl.py @ 1:0a3abe59e364

updated from more recent versions on origen
author Henry Thompson <ht@markup.co.uk>
date Mon, 09 Mar 2020 16:45:20 +0000
parents fee51ab07d09
children 2d7c91f89f6b
line wrap: on
line diff
--- a/pdfCrawl.py	Mon Mar 09 14:58:04 2020 +0000
+++ b/pdfCrawl.py	Mon Mar 09 16:45:20 2020 +0000
@@ -1,5 +1,11 @@
 import PyPDF2 as pyPdf, sys
 
+if sys.argv[1]=='-v':
+    verbose=True
+    sys.argv.pop(1)
+else:
+    verbose=False
+
 f = open(sys.argv[1],'rb')
 
 pdf = pyPdf.PdfFileReader(f)
@@ -20,5 +26,7 @@
         #print >>sys.stderr,key,ann
         for a in ann:
             u = a.getObject()
-            if u[ank].has_key(uri):
-                print "U",u[ank][uri]
+            if ank in u and uri in u[ank]:
+                if verbose:
+                    print u[ank]
+                print u[ank][uri]