Mercurial > hg > cc > cirrus_work
annotate bin/plinks.py @ 16:04464ee31d66
toward link extractions from pdf
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sun, 07 Aug 2022 13:56:49 +0100 |
parents | |
children | 38bab758e469 |
rev | line source |
---|---|
16
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/env python3 |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 import sys,pdfx,traceback |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 from datetime import datetime |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 def run(file): |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 global pdf |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 pdf=pdfx.PDFx(file) |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 return pdf.get_references_as_dict() |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 f=sys.argv[1] |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 try: |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 links=run(f) |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 if bool(links) and (links.get('scrape',False) or |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
15 links.get('annot',False)): |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
16 for k in links.keys(): |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
17 for l in links[k]: |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
18 print("%s\t%s"%(k,l)) |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
19 except Exception as e: |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
20 if str(e)=='Unexpected EOF': |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
21 print("%s:\t%s"%(datetime.now().isoformat(),e),file=sys.stderr) |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
22 else: |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
23 print("%s: "%(datetime.now().isoformat()),end='',file=sys.stderr) |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
24 traceback.print_exc(file=sys.stderr) |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
25 |