annotate bin/plinks.py @ 166:afd7879181c9

old style
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 18 Jul 2022 19:15:20 +0100
parents 3bc1d24363a1
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
127
3bc1d24363a1 bits and pieces
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 11
diff changeset
1 #!/usr/bin/env python3
7
25ca3505b4d7 more logging
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 6
diff changeset
2 import sys,pdfx,traceback,os
25ca3505b4d7 more logging
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 6
diff changeset
3 from datetime import datetime
1
a4b0359456bc switch to file loop inside python, assume file index integer in pipe as well as filename, check /dev/shm/stopJob
Henry Thompson <ht@markup.co.uk>
parents: 0
diff changeset
4 from os import path
a4b0359456bc switch to file loop inside python, assume file index integer in pipe as well as filename, check /dev/shm/stopJob
Henry Thompson <ht@markup.co.uk>
parents: 0
diff changeset
5
a4b0359456bc switch to file loop inside python, assume file index integer in pipe as well as filename, check /dev/shm/stopJob
Henry Thompson <ht@markup.co.uk>
parents: 0
diff changeset
def run(file):
    """Extract the references found in a single PDF.

    Opens ``file`` with ``pdfx.PDFx`` and leaves the resulting object in
    the module-level name ``pdf``.

    Returns a 2-tuple ``(references, limited)``: ``references`` is whatever
    ``get_references_as_dict()`` yields for the document, and ``limited`` is
    always ``False`` here, because the time-limit support (the ``limit=60``
    argument and the ``pdf.limited`` attribute) is currently switched off.
    """
    global pdf
    pdf = pdfx.PDFx(file)  # limit=60 is deliberately not passed at present
    references = pdf.get_references_as_dict()
    limited = False  # pdf.limited is not consulted while the limit is off
    return (references, limited)
0
65a56c0d1c1f bolting the barn door...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12
6
0f494c76a887 refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 5
diff changeset
# Driver: read "<index> <pdf path>" lines from stdin, run() each PDF and, when
# it yields links, write them one per line to links_<tarnum>_<index>.
# Problems are logged to badpdfs_<tarnum>.  If a file named 'stopJob' exists
# in the working directory after a PDF has been handled, the job exits with
# status 1.
tarnum = sys.argv[1]
print(tarnum, sys.argv, os.getcwd(), file=sys.stderr)
gf = 0   # number of PDFs for which a links file was written
fno = 0  # index from the latest input line; pre-set so that the final
         # summary does not raise NameError when stdin is empty
with open('badpdfs_%s' % tarnum, 'w') as bf:
    for line in sys.stdin:
        # Split off the index only, so a path containing whitespace is kept
        # whole instead of raising ValueError and aborting the whole run.
        fields = line.strip().split(None, 1)
        if len(fields) != 2:
            print("%s:\t%s\tMalformed input line: %r" % (
                datetime.now().isoformat(), tarnum, line), file=bf)
            bf.flush()
            continue
        fno, f = fields
        try:
            (links, limited) = run(f)
            if limited:
                print("%s\t%s\tProcessing limited after timeout" % (
                    datetime.now().isoformat(), fno), file=bf)
                bf.flush()
            # Only create a links file when there is something under the
            # 'scrape' or 'annot' keys, to avoid writing empty files.
            if links and (links.get('scrape', False) or
                          links.get('annot', False)):
                gf += 1
                with open('links_%s_%s' % (tarnum, fno), 'w') as of:
                    for kind, targets in links.items():
                        for target in targets:
                            print("%s\t%s" % (kind, target), file=of)
        except Exception as e:
            # One bad PDF must not stop the batch: record it and carry on.
            if str(e) == 'Unexpected EOF':
                # This failure gets a one-line entry rather than a traceback.
                print("%s:\t%s\t%s\t%s" % (datetime.now().isoformat(),
                                           tarnum, fno, e), file=bf)
            else:
                print("%s: " % (datetime.now().isoformat()), end='', file=bf)
                traceback.print_exc(file=bf)
            bf.flush()
        if path.exists('stopJob'):
            print("%s: Quiting early: %s %s" % (datetime.now().isoformat(),
                                                tarnum, fno),
                  file=sys.stderr)
            sys.stderr.flush()
            # sys.exit rather than the site-provided exit(), which is meant
            # for interactive sessions and is missing under `python -S`.
            sys.exit(1)
now = datetime.now().isoformat()
print('%s: exiting from %s having found %s files with links out of %s' % (
    now, tarnum, gf, fno))