view plinks_jto.py @ 69:157f012ffab7 default tip

from local
author Henry S Thompson <ht@inf.ed.ac.uk>
date Fri, 17 Jan 2025 15:45:26 +0000
parents
children
line wrap: on
line source

#!/usr/bin/python3
# Needs PYTHONPATH=/group/ltg/projects/lcontrib_sl7/usr/lib/python3.4/site-packages
import sys,pdfx,traceback

import types
# Give pdfx.backends.Reference a "<" ordering, presumably so that the
# sort=True calls in run() can order references (TODO confirm against the
# installed pdfx).  The patch is skipped when the class already has a
# Python-level __lt__: the default inherited from object is a slot wrapper,
# not a types.FunctionType.
if not isinstance(getattr(pdfx.backends.Reference,'__lt__'),types.FunctionType):
  def __lt__(self,other):
    """Order Reference objects by their .ref attribute."""
    # Return NotImplemented instead of asserting: assert statements are
    # stripped under "python -O", and NotImplemented lets Python try the
    # reflected comparison before raising TypeError.
    if not isinstance(other, pdfx.backends.Reference):
      return NotImplemented
    return self.ref < other.ref

  pdfx.backends.Reference.__lt__=__lt__

# Extra value appended to the "Failed:" message printed by run().  Nothing in
# the visible source reassigns it, so it is printed as None.
E=None

def run():
  """Print the references found in the PDF named on the command line.

  Reads sys.argv: an optional leading '-f' selects flattened output, one
  reference per line, from get_references(sort=True).  Without it, the
  result of get_references_as_dict(sort=True) is printed as one
  "key<TAB>reference" line per reference.  The following argument is passed
  to pdfx.PDFx() together with the module-level `limited` keyword arguments.

  Side effects: binds the module-level name `pdf` to the PDFx object.  If
  loading or reference extraction raises, the traceback and a "Failed:" line
  are written to stderr and the process exits with status 1.  If pdf.limited
  is true, a "Timed out" note is written to stderr before the output.
  """
  global pdf
  # Inspect argv without popping '-f': this function is called repeatedly
  # under timeit (-x N), and consuming the flag on the first call would make
  # every later call run in non-flattened mode.
  flatten=sys.argv[1]=='-f'
  source=sys.argv[2] if flatten else sys.argv[1]
  try:
    pdf=pdfx.PDFx(source,**limited)
    if flatten:
      links=pdf.get_references(sort=True)
    else:
      links=pdf.get_references_as_dict(sort=True)
  except Exception:
    # Not a bare except: KeyboardInterrupt/SystemExit should propagate.
    traceback.print_exc()
    print("\nFailed: %s"%source,E,file=sys.stderr)
    # sys.exit rather than the site-provided exit(), which is not
    # guaranteed to exist (e.g. under python -S).
    sys.exit(1)
  if pdf.limited:
    print("Timed out, no text or scraping",file=sys.stderr)
  if flatten:
    for ref in links:
      print(ref)
  else:
    for key,refs in links.items():
      for ref in refs:
        print("%s\t%s"%(key,ref))

# Keyword arguments forwarded to pdfx.PDFx() by run(); filled in by -r / -t.
limited={}

def _next_arg_is(flag):
  """Return True if a command-line argument remains and it equals flag."""
  return len(sys.argv)>1 and sys.argv[1]==flag

# Options are positional and are only recognised in this order:
#   [-v] [-r SECS] [-t SECS] [-x N] [-f] PDF
# Each recognised option is popped from sys.argv so that the next test, and
# finally run(), always looks at sys.argv[1].
if _next_arg_is('-v'):
  # verbose: log level debug
  sys.argv.pop(1)
  import logging
  logging.basicConfig(level=logging.DEBUG,format='%(asctime)s %(message)s',
                      datefmt='%m/%d/%Y %I:%M:%S %p')
  logging.getLogger(name='stopit').addHandler(logging.StreamHandler(sys.stderr))
  logging.getLogger(name='pdfx').addHandler(logging.StreamHandler(sys.stderr))
  logging.getLogger(name='pdfminer').setLevel(logging.WARN)

if _next_arg_is('-r'):
  # timeout for reading
  sys.argv.pop(1)
  limited['readTimeout']=float(sys.argv.pop(1))
if _next_arg_is('-t'):
  # timeout for text recovery
  sys.argv.pop(1)
  limited['textTimeout']=float(sys.argv.pop(1))

# -x N: time N calls of run() instead of running it once
n=None
if _next_arg_is('-x'):
  sys.argv.pop(1)
  n=sys.argv.pop(1)

# run() needs a PDF argument (after an optional -f); fail with a usage
# message instead of an IndexError when it is missing.
if len(sys.argv)<2 or (sys.argv[1]=='-f' and len(sys.argv)<3):
  print("Usage: %s [-v] [-r SECS] [-t SECS] [-x N] [-f] PDF"%sys.argv[0],
        file=sys.stderr)
  sys.exit(2)

if n is None:
  run()
else:
  import timeit
  # The timing result goes to stderr so it stays separate from run()'s stdout.
  print(timeit.timeit("run()",number=int(n),
                      setup="from __main__ import run"),file=sys.stderr)