Mercurial > hg > cc > cirrus_home
comparison bin/plinks.py @ 10:a33db8e3f51c
bigger run, longer limit
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 25 Feb 2020 14:56:36 +0000 |
parents | 7a93e190c74d |
children | b0d9fe66ce8a |
comparison
equal
deleted
inserted
replaced
9:7a93e190c74d | 10:a33db8e3f51c |
---|---|
3 from datetime import datetime | 3 from datetime import datetime |
4 from os import path | 4 from os import path |
5 | 5 |
6 def run(file): | 6 def run(file): |
7 global pdf | 7 global pdf |
8 pdf=pdfx.PDFx(file,limit=30) | 8 pdf=pdfx.PDFx(file,limit=60) |
9 return pdf.get_references_as_dict() | 9 return (pdf.get_references_as_dict(),pdf.limited) |
10 | 10 |
11 tarnum=sys.argv[1] | 11 tarnum=sys.argv[1] |
12 print(tarnum, sys.argv, os.getcwd(),file=sys.stderr) | 12 print(tarnum, sys.argv, os.getcwd(),file=sys.stderr) |
13 gf=0 | 13 gf=0 |
14 with open('badpdfs_%s'%tarnum,'w') as bf: | 14 with open('badpdfs_%s'%tarnum,'w') as bf: |
15 for l in sys.stdin: | 15 for l in sys.stdin: |
16 (fno,f)=l.split() | 16 (fno,f)=l.split() |
17 try: | 17 try: |
18 links=run(f) | 18 (links,limited)=run(f) |
 | 19 if limited: |
 | 20 print("%s\t%s\tProcessing limited after timeout"%( |
 | 21 datetime.now().isoformat(),fno),file=bf) |
19 if bool(links) and (links.get('scrape',False) or | 22 if bool(links) and (links.get('scrape',False) or |
20 links.get('annot',False)): | 23 links.get('annot',False)): |
21 gf+=1 | 24 gf+=1 |
22 with open('links_%s_%s'%(tarnum,fno),'w') as of: | 25 with open('links_%s_%s'%(tarnum,fno),'w') as of: |
23 for k in links.keys(): | 26 for k in links.keys(): |