Mercurial > hg > cc > cirrus_home
annotate bin/plinks.py @ 9:7a93e190c74d
logging tweaks, preparing for timeout on problem pdfs
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 25 Feb 2020 10:34:41 +0000 |
parents | 25ca3505b4d7 |
children | a33db8e3f51c |
rev | line source |
---|---|
0 | 1 #!/lustre/sw/miniconda3/bin/python3 |
7 | 2 import sys,pdfx,traceback,os |
3 from datetime import datetime | |
1
a4b0359456bc
switch to file loop inside python, assume file index integer in pipe as well as filename, check /dev/shm/stopJob
Henry Thompson <ht@markup.co.uk>
parents:
0
diff
changeset
|
4 from os import path |
a4b0359456bc
switch to file loop inside python, assume file index integer in pipe as well as filename, check /dev/shm/stopJob
Henry Thompson <ht@markup.co.uk>
parents:
0
diff
changeset
|
5 |
a4b0359456bc
switch to file loop inside python, assume file index integer in pipe as well as filename, check /dev/shm/stopJob
Henry Thompson <ht@markup.co.uk>
parents:
0
diff
changeset
|
def run(file):
    """Return the references found in the PDF at *file* as a dict.

    The PDFx object is stored in the module-level global ``pdf`` (so it
    stays reachable after the call) and the value returned is whatever its
    ``get_references_as_dict()`` method produces.
    """
    global pdf
    # NOTE(review): limit=30 is handed straight to pdfx.PDFx; its exact
    # meaning is not visible in this file -- confirm against the pdfx
    # version in use.
    pdf = pdfx.PDFx(file, limit=30)
    references = pdf.get_references_as_dict()
    return references
0 | 10 |
6
0f494c76a887
refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
5
diff
changeset
|
# Script body.
#
# Usage: plinks.py TARNUM < index-and-filename pairs
#
# Each stdin line holds a file index and a PDF filename separated by
# whitespace.  For every PDF whose extracted references contain 'scrape' or
# 'annot' entries, a file links_<TARNUM>_<index> is written with one
# "<key>\t<link>" line per reference.  Failures are logged to
# badpdfs_<TARNUM>.  The job stops early (exit status 1) as soon as
# /dev/shm/stopJob exists.
tarnum = sys.argv[1]
print(tarnum, sys.argv, os.getcwd(), file=sys.stderr)
gf = 0  # number of PDFs for which a links file was written
# Bound up front so the closing summary cannot raise NameError when stdin
# supplies no lines at all.
fno = 0
with open('badpdfs_%s' % tarnum, 'w') as bf:
    for line in sys.stdin:
        if not line.strip():
            # A blank line (e.g. a trailing newline in the pipe) carries no
            # work; skip it instead of crashing on the unpack below.
            continue
        (fno, f) = line.split()
        try:
            links = run(f)
            # Only create a links file when there is something to put in it.
            if bool(links) and (links.get('scrape', False) or
                                links.get('annot', False)):
                gf += 1
                with open('links_%s_%s' % (tarnum, fno), 'w') as of:
                    for k in links.keys():
                        # Distinct name: do not shadow the stdin line variable.
                        for link in links[k]:
                            print("%s\t%s" % (k, link), file=of)
        except Exception as e:
            if str(e) == 'Unexpected EOF':
                # Known failure: a one-line record is enough.
                print("%s:\t%s\t%s\t%s" % (datetime.now().isoformat(),
                                           tarnum, fno, e), file=bf)
            else:
                # Anything else: timestamp followed by the full traceback.
                print("%s: " % (datetime.now().isoformat()), end='', file=bf)
                traceback.print_exc(file=bf)

        # Checked after every file so an operator can stop the job cleanly.
        if path.exists('/dev/shm/stopJob'):
            print("%s: Quiting early: %s %s" % (datetime.now().isoformat(),
                                                tarnum, fno),
                  file=sys.stderr)
            # sys.exit rather than the site-provided exit(), which is meant
            # for interactive use and is absent under `python -S`.
            sys.exit(1)
now = datetime.now().isoformat()
print('%s: exiting from %s having found %s files with links out of %s' % (now,
                                                                          tarnum,
                                                                          gf,
                                                                          fno))