view bin/plinks.py @ 6:0f494c76a887

refactor to address tarred-up pdfs
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sun, 23 Feb 2020 16:48:34 +0000
parents a28d731977da
children 25ca3505b4d7
line wrap: on
line source

#!/lustre/sw/miniconda3/bin/python3
import sys,pdfx,traceback
from os import path

def run(file):
  """Open the PDF at `file` with pdfx and return its references.

  The PDFx object is deliberately stored in the module-level global `pdf`,
  so the most recently opened document stays reachable after this function
  returns (and after a later step fails).

  Returns whatever `PDFx.get_references_as_dict()` yields -- presumably a
  dict mapping a reference kind to a collection of references; confirm
  against the pdfx documentation.
  """
  global pdf
  pdf = pdfx.PDFx(file)
  references = pdf.get_references_as_dict()
  return references

# Driver.  The first command-line argument is a tar number used only to
# name output files.  Standard input supplies one PDF per line as two
# whitespace-separated fields: a file number and the path of the PDF.
#
# Outputs (all in the current directory):
#   badpdfs_<tarnum>      -- one record per PDF whose processing raised
#   links_<tarnum>_<fno>  -- "<key>\t<reference>" lines, written only when
#                            the 'scrape' or 'annot' entry is non-empty
tarnum=sys.argv[1]
with open('badpdfs_%s'%tarnum,'w') as bf:
  for line in sys.stdin:
    fields=line.split()
    if not fields:
      # Blank line (e.g. a trailing newline in the list): nothing to do
      continue
    if len(fields)!=2:
      # Unpacking used to raise ValueError here, outside the try, and so
      # aborted the whole batch; report the line and carry on instead.
      print("Malformed input line skipped: %r"%line,file=sys.stderr)
      continue
    (fno,f)=fields
    try:
      links=run(f)
      # Skip PDFs with no references at all, or with neither scraped
      # nor annotation references.
      if links and (links.get('scrape') or links.get('annot')):
        with open('links_%s_%s'%(tarnum,fno),'w') as of:
          for kind,refs in links.items():
            for ref in refs:
              print("%s\t%s"%(kind,ref),file=of)
    except Exception as e:
      # A failure on one PDF must not stop the batch: log it and move on.
      if str(e)=='Unexpected EOF':
        # Known failure mode: a compact one-line record is enough
        print("%s\t%s\t%s"%(tarnum,fno,e),file=bf)
      else:
        # NOTE(review): the traceback does not say which PDF failed;
        # consider also recording tarnum/fno if no consumer of this file
        # depends on its current format.
        traceback.print_exc(file=bf)

    # Cooperative shutdown: creating this file asks the job to stop
    # after the PDF currently being processed.
    if path.exists('/dev/shm/stopJob'):
      print("Quiting early: %s %s"%(tarnum,fno),file=sys.stderr)
      # sys.exit, not the site-provided exit(), which is intended for
      # interactive use and is not guaranteed to be defined.
      sys.exit(1)