diff plinks_jto.py @ 69:157f012ffab7 default tip

from local
author Henry S Thompson <ht@inf.ed.ac.uk>
date Fri, 17 Jan 2025 15:45:26 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/plinks_jto.py	Fri Jan 17 15:45:26 2025 +0000
@@ -0,0 +1,71 @@
+#!/usr/bin/python3
+# Needs PYTHONPATH=/group/ltg/projects/lcontrib_sl7/usr/lib/python3.4/site-packages
+import sys,pdfx,traceback
+
+import types
+if not isinstance(getattr(pdfx.backends.Reference,'__lt__'),types.FunctionType):
+  # Reference has no Python-level __lt__ (what getattr finds is not a plain
+  # function), so install one.  Presumably this is what lets the sort=True
+  # calls in run() order References -- TODO confirm against pdfx.
+  def __lt__(self,other):
+    """Order two References by their ref attribute.
+
+    Returns NotImplemented for any other kind of operand, so Python can try
+    the reflected comparison or raise TypeError itself.  (An assert here
+    would vanish under python -O and is not part of the comparison protocol.)
+    """
+    if not isinstance(other, pdfx.backends.Reference):
+      return NotImplemented
+    return self.ref < other.ref
+
+  pdfx.backends.Reference.__lt__=__lt__
+
+# Extra value that run() prints after the file name in its "Failed:" message.
+# It is assigned only here in the visible code, so that message ends in "None".
+E=None
+
+def run():
+  """Print the references pdfx finds in the PDF named on the command line.
+
+  Reads sys.argv: an optional leading '-f' selects flattened output (one
+  reference per line, from get_references); otherwise every reference from
+  get_references_as_dict is printed as its key, a tab, then the reference.
+  The module-level dict `limited` is passed to pdfx.PDFx as keyword arguments,
+  and the resulting PDFx object is left in the global `pdf`.
+
+  sys.argv is only read here, never modified: the -x timing mode calls run()
+  repeatedly, and removing '-f' on the first call would make every later
+  call silently switch to the non-flattened output.
+
+  Exits with status 1 (after printing a traceback) if pdfx raises, and with
+  status 2 if no file name is present.
+  """
+  global pdf, limited
+  args=sys.argv[1:]
+  flatten=bool(args) and args[0]=='-f'
+  if flatten:
+    args=args[1:]
+  if not args:
+    print("Usage: %s [-f] file.pdf"%sys.argv[0],file=sys.stderr)
+    sys.exit(2)
+  path=args[0]
+  try:
+    pdf=pdfx.PDFx(path,**limited)
+    if flatten:
+      links=pdf.get_references(sort=True)
+    else:
+      links=pdf.get_references_as_dict(sort=True)
+  except Exception:
+    # Exception rather than a bare except, so that Ctrl-C and SystemExit
+    # are not reported as a failure of this particular file.
+    traceback.print_exc()
+    print("\nFailed: %s"%path,E,file=sys.stderr)
+    # sys.exit, not the site-provided exit(), which is absent under python -S
+    sys.exit(1)
+  if pdf.limited:
+    print("Timed out, no text or scraping",file=sys.stderr)
+  if flatten:
+    for l in links:
+      print(l)
+  else:
+    for k,refs in links.items():
+      for l in refs:
+        print("%s\t%s"%(k,l))
+
+# Keyword arguments for pdfx.PDFx: filled in from -r / -t below, read by run().
+limited={}
+
+def _next_arg_is(flag):
+  """True if an argument remains after the script name and it equals flag."""
+  return len(sys.argv)>1 and sys.argv[1]==flag
+
+def _pop_value(flag,convert):
+  """Remove flag and the value after it from sys.argv; return convert(value).
+
+  Exits with status 2 and a message on stderr if the value is missing or
+  convert raises ValueError for it.
+  """
+  sys.argv.pop(1)
+  try:
+    return convert(sys.argv.pop(1))
+  except (IndexError,ValueError):
+    print("%s: option %s needs a numeric argument"%(sys.argv[0],flag),
+          file=sys.stderr)
+    sys.exit(2)
+
+# Options are recognised only in this fixed order: -v, -r N, -t N, -x N.
+# Each is removed from sys.argv once handled, leaving "[-f] file" for run().
+if _next_arg_is('-v'):
+  # verbose: log level debug
+  sys.argv.pop(1)
+  import logging
+  logging.basicConfig(level=logging.DEBUG,format='%(asctime)s %(message)s',
+                      datefmt='%m/%d/%Y %I:%M:%S %p')
+  # NOTE(review): records from these two loggers presumably also propagate to
+  # the root handler installed by basicConfig, so they may be printed twice
+  # -- confirm that is intended.
+  logging.getLogger(name='stopit').addHandler(logging.StreamHandler(sys.stderr))
+  logging.getLogger(name='pdfx').addHandler(logging.StreamHandler(sys.stderr))
+  logging.getLogger(name='pdfminer').setLevel(logging.WARN)
+
+if _next_arg_is('-r'):
+  # timeout for reading
+  limited['readTimeout']=_pop_value('-r',float)
+if _next_arg_is('-t'):
+  # timeout for text recovery
+  limited['textTimeout']=_pop_value('-t',float)
+
+if _next_arg_is('-x'):
+  # timing mode: call run() n times and report the total time on stderr
+  import timeit
+  n=_pop_value('-x',int)
+  print(timeit.timeit("run()",number=n,
+                      setup="from __main__ import run"),file=sys.stderr)
+else:
+  run()