view bin/plinks.py @ 55:50556ac15e88

one-off to convert big extracts.tar into lots of smaller ones
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 14 Apr 2020 16:10:22 +0100
parents b0d9fe66ce8a
children 3bc1d24363a1
line wrap: on
line source

#!/lustre/sw/miniconda3/bin/python3
import sys,pdfx,traceback,os
from datetime import datetime
from os import path

def run(file):
  """Extract the references found in one PDF file.

  A pdfx.PDFx object is built for `file` (with limit=60) and stored in the
  module-level global `pdf`, so the most recently processed document stays
  reachable after the call returns.

  Returns a pair (references, limited): the value of
  get_references_as_dict() and the object's `limited` attribute.
  """
  # NOTE(review): `limit` and `limited` look like a per-file processing
  # cap and a "cap was hit" flag -- confirm against the pdfx build in use.
  global pdf
  pdf = pdfx.PDFx(file, limit=60)
  references = pdf.get_references_as_dict()
  return (references, pdf.limited)

# Driver: read "<fno> <pdf path>" lines from stdin, extract the links from
# each PDF via run(), and write them to links_<tarnum>_<fno> (one
# "<kind>\t<link>" line per reference).  Problems are logged to
# badpdfs_<tarnum>.  Creating a file named 'stopJob' in the working
# directory makes the job exit with status 1 once the current line is done.
if len(sys.argv)<2:
  sys.exit('usage: %s TARNUM < file-list'%sys.argv[0])
tarnum=sys.argv[1]
print(tarnum, sys.argv, os.getcwd(),file=sys.stderr)
gf=0     # number of PDFs for which a links file was written
fno='0'  # label of the last input line; pre-set so the final summary
         # still works when stdin is empty
with open('badpdfs_%s'%tarnum,'w') as bf:
  for line in sys.stdin:
    fields=line.split()
    if len(fields)==2:
      (fno,f)=fields
      try:
        (links,limited)=run(f)
        if limited:
          print("%s\t%s\tProcessing limited after timeout"%(
            datetime.now().isoformat(),fno),file=bf)
          bf.flush()
        # Only write an output file when there is at least one scraped or
        # annotation link; other kinds alone do not count.
        if links and (links.get('scrape') or links.get('annot')):
          gf+=1
          with open('links_%s_%s'%(tarnum,fno),'w') as of:
            for (kind,refs) in links.items():
              for ref in refs:
                print("%s\t%s"%(kind,ref),file=of)
      except Exception as e:
        # Best effort: one bad PDF must not stop the batch, so record the
        # failure and move on.  'Unexpected EOF' gets a one-line entry,
        # anything else a full traceback.
        if str(e)=='Unexpected EOF':
          print("%s:\t%s\t%s\t%s"%(datetime.now().isoformat(),
                                   tarnum,fno,e),file=bf)
          bf.flush()
        else:
          print("%s: "%(datetime.now().isoformat()),end='',file=bf)
          traceback.print_exc(file=bf)
          bf.flush()
    elif fields:
      # Not exactly two whitespace-separated fields: log it like any other
      # per-file failure instead of aborting the run.  Blank lines are
      # skipped silently.
      print("%s:\t%s\tMalformed input line: %r"%(datetime.now().isoformat(),
                                                 tarnum,line),file=bf)
      bf.flush()
    if path.exists('stopJob'):
      print("%s: Quiting early: %s %s"%(datetime.now().isoformat(),tarnum,fno),
            file=sys.stderr)
      sys.stderr.flush()
      # sys.exit rather than the site-provided exit(); the SystemExit it
      # raises still lets the enclosing `with` close bf.
      sys.exit(1)
now=datetime.now().isoformat()
# NOTE(review): the "out of" figure is the label of the last line read,
# which presumably doubles as a running count -- confirm with the producer
# of the input list.
print('%s: exiting from %s having found %s files with links out of %s'%(now,
                                                                        tarnum,
                                                                        gf,
                                                                        fno))