annotate lib/python/plinks.py @ 230:a0e2473deb33

post-processing
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 29 Feb 2024 15:01:02 +0000
parents 1d1bd22124c0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
16
04464ee31d66 toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/env python3
22
38bab758e469 accept filenames on stdin,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 16
diff changeset
2 import sys,pdfx,traceback,os
16
04464ee31d66 toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 from datetime import datetime
04464ee31d66 toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4
04464ee31d66 toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def run(file):
    """Print the references pdfx extracts from the PDF *file*.

    On success, if the reference dict is non-empty and has a non-empty
    'scrape' or 'annot' entry, every entry of the dict is written to stdout
    as one line per reference: the dict key, a tab, then the reference.
    Otherwise the single line "None" is written to stdout.

    Errors never propagate to the caller:

    * an exception whose message is exactly 'Unexpected EOF' is logged to
      stderr with a timestamp and reported on stdout as "badpdf";
    * any other exception gets a timestamped traceback on stderr and
      produces no stdout output at all.

    Args:
        file: passed unchanged to ``pdfx.PDFx``.
    """
    try:
        pdf = pdfx.PDFx(file)
        links = pdf.get_references_as_dict()
        # Only report when at least one of the 'scrape' / 'annot' groups is
        # non-empty, but then report every group in the dict.
        if links and (links.get('scrape') or links.get('annot')):
            for kind, refs in links.items():
                for ref in refs:
                    print("%s\t%s" % (kind, ref))
        else:
            print("None")
    except Exception as e:
        stamp = datetime.now().isoformat()
        # NOTE(review): matching on the message text is fragile; presumably
        # this is what the PDF parser raises for a cut-off file -- confirm.
        if str(e) == 'Unexpected EOF':
            print("%s:\t%s" % (stamp, e), file=sys.stderr)
            print("badpdf")
        else:
            # Timestamp and traceback share one stderr line prefix.
            print("%s: " % stamp, end='', file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
16
04464ee31d66 toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
23
22
38bab758e469 accept filenames on stdin,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 16
diff changeset
# Command-line driver.
#   plinks.py -      read one filename per line from stdin, process each,
#                    then delete it
#   plinks.py FILE   process the single file FILE (it is not deleted)
if len(sys.argv) < 2:
    # Without this guard a missing argument surfaced as a bare IndexError.
    sys.exit("usage: %s (-|FILE)" % sys.argv[0])

if sys.argv[1] == '-':
    for i, line in enumerate(sys.stdin):
        # Progress counter on stderr: 0-based index of the current input line.
        print(i, file=sys.stderr)
        f = line.rstrip()
        # A size of exactly 1 MiB is treated as a truncated file and is not
        # parsed.  NOTE(review): presumably whatever produced these files
        # caps them at 1048576 bytes -- confirm.
        if os.path.getsize(f) == 1048576:  # truncated
            print("truncated", file=sys.stderr)
            print("truncated")
        else:
            run(f)
        # NOTE(review): the indentation of this statement was not recoverable
        # from the annotated listing; it is assumed to run for every input
        # file (truncated or not) -- confirm against the repository.
        os.unlink(f)
else:
    run(sys.argv[1])