Mercurial > hg > cc > cirrus_home
view bin/plinks.py @ 6:0f494c76a887
refactor to address tarred-up pdfs
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sun, 23 Feb 2020 16:48:34 +0000 |
parents | a28d731977da |
children | 25ca3505b4d7 |
line wrap: on
line source
#!/lustre/sw/miniconda3/bin/python3
"""Extract link references from a batch of PDFs using pdfx.

Usage: plinks.py TARNUM < listing

Each non-blank line of standard input must hold exactly two
whitespace-separated fields: a file number and the path of a PDF.

Outputs, all created in the current directory:

* ``links_<TARNUM>_<fno>`` -- one ``<key>\t<reference>`` line per reference,
  written only when the PDF yields a non-empty 'scrape' or 'annot' entry.
* ``badpdfs_<TARNUM>`` -- a record of every PDF whose processing raised.

After each PDF the script checks for the stop file (see STOP_FILE) and, if
it exists, reports how far it got on stderr and exits with status 1.
"""
import sys
import traceback
from os import path

import pdfx

# Creating this file asks a running job to stop after the current PDF.
STOP_FILE = '/dev/shm/stopJob'


def run(file):
    """Return the references pdfx finds in *file* as a dict.

    The PDFx object is also bound to the module-level name ``pdf`` so that
    it remains reachable after the call returns.
    """
    global pdf
    pdf = pdfx.PDFx(file)
    return pdf.get_references_as_dict()


def main():
    """Process every PDF listed on stdin for the tar number in argv[1]."""
    if len(sys.argv) < 2:
        # Exit with a readable message instead of a bare IndexError.
        sys.exit("Usage: %s TARNUM < listing" % sys.argv[0])
    tarnum = sys.argv[1]

    with open('badpdfs_%s' % tarnum, 'w') as bf:
        for line in sys.stdin:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline): nothing to process.
                continue
            if len(fields) != 2:
                # One bad line must not abort the rest of the batch.
                print("Skipping malformed input line: %r" % line,
                      file=sys.stderr)
                continue
            fno, f = fields

            try:
                links = run(f)
                if links and (links.get('scrape') or links.get('annot')):
                    with open('links_%s_%s' % (tarnum, fno), 'w') as of:
                        for kind, refs in links.items():
                            for ref in refs:
                                print("%s\t%s" % (kind, ref), file=of)
            except Exception as e:
                # Best effort per PDF: record the failure and carry on.
                if str(e) == 'Unexpected EOF':
                    # Presumably a truncated PDF -- a one-line record is
                    # enough, no traceback needed.
                    print("%s\t%s\t%s" % (tarnum, fno, e), file=bf)
                else:
                    traceback.print_exc(file=bf)

            if path.exists(STOP_FILE):
                print("Quiting early: %s %s" % (tarnum, fno), file=sys.stderr)
                # sys.exit rather than the site-provided exit() helper, which
                # is intended for interactive use and may be absent.
                sys.exit(1)


if __name__ == '__main__':
    main()