annotate plinks_jto.py @ 69:157f012ffab7 default tip

from local
author Henry S Thompson <ht@inf.ed.ac.uk>
date Fri, 17 Jan 2025 15:45:26 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
69
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/python3
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 # Needs PYTHONPATH=/group/ltg/projects/lcontrib_sl7/usr/lib/python3.4/site-packages
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 import sys,pdfx,traceback
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 import types
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Give pdfx's Reference class an ordering when it does not already define
# __lt__ as a Python function (presumably so that the sort=True reference
# queries used by run() can compare References -- TODO confirm).
if not isinstance(getattr(pdfx.backends.Reference, '__lt__'),
                  types.FunctionType):
    def __lt__(self, other):
        """Order two References by their ``ref`` attribute.

        Returns NotImplemented when ``other`` is not a Reference, so Python
        can try the reflected comparison or raise TypeError itself.  This
        replaces an ``assert``, which is stripped under ``python -O``.
        """
        if not isinstance(other, pdfx.backends.Reference):
            return NotImplemented
        return self.ref < other.ref

    pdfx.backends.Reference.__lt__ = __lt__

# Printed as the trailing item of the "Failed:" message in run(); it is
# only ever None in this file.
E = None
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
14
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def run():
    """Extract the references from the PDF named on the command line.

    Reads what is left of ``sys.argv`` after the module-level option
    handling: an optional leading ``-f`` followed by the PDF to process.
    With ``-f`` each reference is printed on a line of its own; otherwise
    one ``key<TAB>reference`` line is printed for every entry of the
    mapping returned by ``get_references_as_dict``.

    The module-level ``limited`` dict is passed to ``pdfx.PDFx`` as keyword
    arguments, and the resulting object is stored in the global ``pdf``.

    Exits with status 1, after printing the traceback and a ``Failed:``
    line on stderr, if the PDF cannot be opened or queried; exits with
    status 2 if no PDF argument was given.
    """
    global pdf
    # Parse a copy of the arguments: run() is called repeatedly under
    # timeit ('-x'), and consuming '-f' from sys.argv itself would make
    # every call after the first silently use the non-flattened output.
    args = sys.argv[1:]
    flatten = bool(args) and args[0] == '-f'
    if flatten:
        del args[0]
    if not args:
        print("Missing PDF argument", file=sys.stderr)
        sys.exit(2)
    source = args[0]
    try:
        pdf = pdfx.PDFx(source, **limited)
        if flatten:
            links = pdf.get_references(sort=True)
        else:
            links = pdf.get_references_as_dict(sort=True)
    except Exception:
        # Exception rather than a bare except, so Ctrl-C and SystemExit
        # are not reported as a failure of this particular PDF.
        traceback.print_exc()
        print("\nFailed: %s" % source, E, file=sys.stderr)
        sys.exit(1)
    if pdf.limited:
        print("Timed out, no text or scraping", file=sys.stderr)
    if flatten:
        for link in links:
            print(link)
    else:
        for key, refs in links.items():
            for link in refs:
                print("%s\t%s" % (key, link))
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
42
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Keyword arguments handed to pdfx.PDFx by run(); filled in by -r / -t.
limited = {}

# Options are recognised only in this fixed order: -v, -r SECS, -t SECS,
# -x COUNT.  Each is removed from sys.argv once handled, so run() sees
# only what follows them.
if sys.argv[1] == '-v':
    # verbose: log level debug
    del sys.argv[1]
    import logging
    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p',
                        level=logging.DEBUG)
    for logger_name in ('stopit', 'pdfx'):
        logging.getLogger(logger_name).addHandler(
            logging.StreamHandler(sys.stderr))
    logging.getLogger('pdfminer').setLevel(logging.WARN)

# -r: timeout for reading; -t: timeout for text recovery.  Checked in
# that order, each taking one float argument.
for option, keyword in (('-r', 'readTimeout'), ('-t', 'textTimeout')):
    if sys.argv[1] == option:
        del sys.argv[1]
        limited[keyword] = float(sys.argv.pop(1))

if sys.argv[1] != '-x':
    run()
else:
    # -x COUNT: call run() COUNT times under timeit and report the
    # elapsed time on stderr.
    import timeit
    del sys.argv[1]
    repetitions = sys.argv.pop(1)
    print(timeit.timeit("run()", setup="from __main__ import run",
                        number=int(repetitions)),
          file=sys.stderr)