Mercurial > hg > cc > cirrus_home
view bin/plinks.py @ 93:4d870a7ec871
support a command to receive each result,
remove use of X-Crawler-Content-Length
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 15 Apr 2021 10:59:25 +0000 |
parents | b0d9fe66ce8a |
children | 3bc1d24363a1 |
line wrap: on
line source
#!/lustre/sw/miniconda3/bin/python3
"""Extract link references from a batch of PDF files.

Usage: plinks.py TARNUM < file_list

Each non-blank line of standard input must hold two whitespace-separated
fields: a file number and the path of a PDF file.  For every PDF in which
pdfx reports 'scrape' or 'annot' references, all references are written to
``links_<TARNUM>_<fileno>`` as ``<kind><TAB><reference>`` lines.  Problems
are logged to ``badpdfs_<TARNUM>``.  If a file named ``stopJob`` exists in
the current directory after a PDF has been processed, the script stops
early with exit status 1.
"""
import os
import sys
import traceback
from datetime import datetime
from os import path

import pdfx


def run(file):
    """Open *file* with pdfx and return ``(references, limited)``.

    ``references`` is the value of ``get_references_as_dict()`` and
    ``limited`` is the ``limited`` attribute of the PDFx object.
    """
    # NOTE(review): the PDFx object is deliberately stored in a module-level
    # global; presumably for post-mortem inspection -- confirm before
    # turning it into a local.
    global pdf
    pdf = pdfx.PDFx(file, limit=60)
    return (pdf.get_references_as_dict(), pdf.limited)


def _stamp():
    """Return the current local time as an ISO-format string for log lines."""
    return datetime.now().isoformat()


def _extract_links(tarnum, fno, pdf_path, bf):
    """Process one PDF and return True if it had 'scrape' or 'annot' links.

    Any exception raised while processing is logged to the open log file
    *bf* instead of being propagated, so one bad PDF cannot stop the batch.
    """
    has_links = False
    try:
        (links, limited) = run(pdf_path)
        if limited:
            print("%s\t%s\tProcessing limited after timeout" % (
                _stamp(), fno), file=bf)
            bf.flush()
        if bool(links) and (links.get('scrape', False)
                            or links.get('annot', False)):
            # Counted as soon as links are found, even if writing the
            # output file subsequently fails.
            has_links = True
            with open('links_%s_%s' % (tarnum, fno), 'w') as of:
                for kind in links:
                    for ref in links[kind]:
                        print("%s\t%s" % (kind, ref), file=of)
    except Exception as e:
        if str(e) == 'Unexpected EOF':
            # This failure gets a one-line entry rather than a traceback.
            print("%s:\t%s\t%s\t%s" % (_stamp(), tarnum, fno, e), file=bf)
        else:
            print("%s: " % (_stamp()), end='', file=bf)
            traceback.print_exc(file=bf)
        bf.flush()
    return has_links


def main():
    """Process every (file number, path) pair read from standard input."""
    tarnum = sys.argv[1]
    print(tarnum, sys.argv, os.getcwd(), file=sys.stderr)
    found = 0
    # Stays None when stdin yields no usable line, so the closing summary
    # can still be printed instead of failing with a NameError.
    fno = None
    with open('badpdfs_%s' % tarnum, 'w') as bf:
        for line in sys.stdin:
            fields = line.split()
            if not fields:
                # Blank line: nothing to process.
                continue
            if len(fields) != 2:
                # Log and skip rather than abort the whole batch.
                print("%s:\t%s\tMalformed input line: %r" % (
                    _stamp(), tarnum, line), file=bf)
                bf.flush()
                continue
            (fno, pdf_path) = fields
            if _extract_links(tarnum, fno, pdf_path, bf):
                found += 1
            # An operator can create 'stopJob' to end the run cleanly
            # between files.
            if path.exists('stopJob'):
                print("%s: Quiting early: %s %s" % (_stamp(), tarnum, fno),
                      file=sys.stderr)
                sys.stderr.flush()
                sys.exit(1)
    now = _stamp()
    print('%s: exiting from %s having found %s files with links out of %s' % (
        now, tarnum, found, fno))


if __name__ == '__main__':
    main()