diff lib/python/plinks.py @ 120:1d1bd22124c0

moved from bin
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 28 Sep 2023 08:46:01 +0100
parents bin/plinks.py@38bab758e469
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/plinks.py	Thu Sep 28 08:46:01 2023 +0100
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+import sys,pdfx,traceback,os
+from datetime import datetime
+
+def run(file):
+  """Print the references that pdfx extracts from the PDF at `file`.
+
+  If the dict returned by get_references_as_dict() has a non-empty
+  'scrape' or 'annot' entry, one "<key><TAB><reference>" line is written
+  to stdout for every reference under every key; otherwise the single
+  line "None" is written.
+
+  Anything derived from Exception raised along the way is handled here:
+  a message of exactly 'Unexpected EOF' is logged to stderr with a
+  timestamp and reported on stdout as "badpdf"; every other failure gets
+  a timestamped traceback on stderr and writes nothing more to stdout.
+  """
+  try:
+    links=pdfx.PDFx(file).get_references_as_dict()
+    wanted=links and (links.get('scrape') or links.get('annot'))
+    if not wanted:
+      print("None")
+      return
+    # The two keys above only gate the output; all keys are listed.
+    for kind,refs in links.items():
+      for ref in refs:
+        print("%s\t%s"%(kind,ref))
+  except Exception as e:
+    if str(e)!='Unexpected EOF':
+      print("%s: "%(datetime.now().isoformat()),end='',file=sys.stderr)
+      traceback.print_exc(file=sys.stderr)
+      return
+    # NOTE(review): presumably what the parser raises for a cut-off PDF -- confirm.
+    print("%s:\t%s"%(datetime.now().isoformat(),e),file=sys.stderr)
+    print("badpdf")
+
+if sys.argv[1]=='-': # batch: one path per stdin line, deleted once handled
+  for n,f in enumerate(map(str.rstrip,sys.stdin)):
+    print(n,file=sys.stderr) # progress: 0-based index of the current path
+    try:
+      if os.path.getsize(f)==1048576: # exactly 1MiB is treated as truncated
+        print("truncated",file=sys.stderr)
+        print("truncated")
+      else:
+        run(f)
+      os.unlink(f)
+    except OSError as e: # missing/unremovable path: log it, keep the batch going
+      print("%s:\t%s"%(datetime.now().isoformat(),e),file=sys.stderr)
+else:
+  run(sys.argv[1])