annotate bin/plinks.py @ 16:04464ee31d66

toward link extractions from pdf
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sun, 07 Aug 2022 13:56:49 +0100
parents
children 38bab758e469
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
16
04464ee31d66 toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/env python3
04464ee31d66 toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 import sys,pdfx,traceback
04464ee31d66 toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 from datetime import datetime
04464ee31d66 toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4
04464ee31d66 toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def run(file):
    """Open *file* with pdfx and return its references as a dict.

    The ``pdfx.PDFx`` object built for *file* is also stored in the
    module-level global ``pdf``, so it stays reachable after the call
    (presumably for interactive inspection -- confirm before relying on it).

    Returns whatever ``PDFx.get_references_as_dict()`` returns for *file*.
    Any exception raised by pdfx propagates to the caller.
    """
    global pdf
    pdf = pdfx.PDFx(file)
    references = pdf.get_references_as_dict()
    return references
04464ee31d66 toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9
04464ee31d66 toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# The PDF to process is given as the first command-line argument.
f = sys.argv[1]

try:
    links = run(f)
    # Report only when the result is non-empty and has something under
    # 'scrape' or 'annot'; in that case every key's entries are printed,
    # one "key<TAB>entry" line each, on stdout.
    if links and (links.get('scrape', False) or links.get('annot', False)):
        for kind, targets in links.items():
            for target in targets:
                print("%s\t%s" % (kind, target))
except Exception as e:
    # Failures are logged to stderr with a timestamp and are not re-raised.
    stamp = datetime.now().isoformat()
    if str(e) == 'Unexpected EOF':
        # This specific failure gets a one-line report, no traceback.
        print("%s:\t%s" % (stamp, e), file=sys.stderr)
    else:
        # Anything else: timestamp prefix, then the full traceback.
        print("%s: " % (stamp), end='', file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
04464ee31d66 toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
25