changeset 16:04464ee31d66

toward link extractions from pdf
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sun, 07 Aug 2022 13:56:49 +0100
parents a9763cd18949
children 75e0d0013da0
files bin/plinks.py
diffstat 1 files changed, 25 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/plinks.py	Sun Aug 07 13:56:49 2022 +0100
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+import sys,pdfx,traceback
+from datetime import datetime
+
+def run(file):  # Open `file` with pdfx and return whatever get_references_as_dict() yields for it.
+  global pdf  # the PDFx object is bound to a module-level name, so it stays reachable after run() returns
+  pdf=pdfx.PDFx(file)  # nothing is caught here: any exception propagates to the caller
+  return pdf.get_references_as_dict()  # presumably a dict of reference-kind -> iterable of links; TODO confirm against pdfx
+
+f=sys.argv[1]  # first command-line argument, passed to run(); read outside the try, so a missing argument raises IndexError uncaught
+
+try:
+  links=run(f)
+  if bool(links) and (links.get('scrape',False) or  # print only when the 'scrape' or 'annot' entry is present and truthy ...
+                      links.get('annot',False)):
+    for k in links.keys():  # ... but once that guard passes, every key in the dict is printed, not just those two
+      for l in links[k]:
+        print("%s\t%s"%(k,l))  # one "key<TAB>item" line per entry, on stdout
+except Exception as e:  # catch-all for run() and the printing loop: report on stderr, do not re-raise
+  if str(e)=='Unexpected EOF':  # matched on the exact message text; this case gets a single timestamped line, no traceback
+    print("%s:\t%s"%(datetime.now().isoformat(),e),file=sys.stderr)
+  else:
+    print("%s: "%(datetime.now().isoformat()),end='',file=sys.stderr)  # timestamp prefix with no newline, so the traceback starts on the same line
+    traceback.print_exc(file=sys.stderr)  # no sys.exit() follows, so the script ends normally after reporting
+