comparison bin/plinks.py @ 10:a33db8e3f51c

bigger run, longer limit
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 25 Feb 2020 14:56:36 +0000
parents 7a93e190c74d
children b0d9fe66ce8a
comparison
equal deleted inserted replaced
9:7a93e190c74d 10:a33db8e3f51c
3 from datetime import datetime 3 from datetime import datetime
4 from os import path 4 from os import path
5 5
6 def run(file): 6 def run(file):
7 global pdf 7 global pdf
8 pdf=pdfx.PDFx(file,limit=30) 8 pdf=pdfx.PDFx(file,limit=60)
9 return pdf.get_references_as_dict() 9 return (pdf.get_references_as_dict(),pdf.limited)
10 10
11 tarnum=sys.argv[1] 11 tarnum=sys.argv[1]
12 print(tarnum, sys.argv, os.getcwd(),file=sys.stderr) 12 print(tarnum, sys.argv, os.getcwd(),file=sys.stderr)
13 gf=0 13 gf=0
14 with open('badpdfs_%s'%tarnum,'w') as bf: 14 with open('badpdfs_%s'%tarnum,'w') as bf:
15 for l in sys.stdin: 15 for l in sys.stdin:
16 (fno,f)=l.split() 16 (fno,f)=l.split()
17 try: 17 try:
18 links=run(f) 18 (links,limited)=run(f)
19 if limited:
20 print("%s\t%s\tProcessing limited after timeout"%(
21 datetime.now().isoformat(),fno),file=bf)
19 if bool(links) and (links.get('scrape',False) or 22 if bool(links) and (links.get('scrape',False) or
20 links.get('annot',False)): 23 links.get('annot',False)):
21 gf+=1 24 gf+=1
22 with open('links_%s_%s'%(tarnum,fno),'w') as of: 25 with open('links_%s_%s'%(tarnum,fno),'w') as of:
23 for k in links.keys(): 26 for k in links.keys():