Mercurial > hg > cc > cirrus_work
annotate bin/plinks.py @ 93:25bd398a8035
improve reordering, still failing on cdx-00004
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 06 Sep 2023 18:51:21 +0100 |
parents | 38bab758e469 |
children |
rev | line source |
---|---|
16
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/env python3 |
22
38bab758e469
accept filenames on stdin,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
16
diff
changeset
|
2 import sys,pdfx,traceback,os |
16
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 from datetime import datetime |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 |
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def run(file):
    """Print the references pdfx extracts from one PDF, one per line.

    The file is opened with ``pdfx.PDFx`` and its
    ``get_references_as_dict()`` result is written to stdout as
    ``<key>\\t<reference>`` lines, one line per reference under every key
    of the returned dict.  That only happens when the dict has a non-empty
    'scrape' or 'annot' entry; otherwise the single line ``None`` is
    written instead.

    Nothing is raised to the caller for ordinary failures:

    * an exception whose message is exactly 'Unexpected EOF' is logged to
      stderr with a timestamp and reported on stdout as ``badpdf``;
    * any other ``Exception`` has its traceback written to stderr after a
      timestamp, and produces no stdout output at all.

    Args:
        file: the PDF to process, passed straight through to ``pdfx.PDFx``.
    """
    try:
        links = pdfx.PDFx(file).get_references_as_dict()
        if links and (links.get('scrape') or links.get('annot')):
            # Once either 'scrape' or 'annot' has content, every key of
            # the dict is dumped, not just those two.
            for kind, refs in links.items():
                for ref in refs:
                    print("%s\t%s" % (kind, ref))
        else:
            print("None")
    except Exception as e:
        # The prints above are inside the try on purpose: output errors are
        # reported the same way as extraction errors.
        stamp = datetime.now().isoformat()
        if str(e) == 'Unexpected EOF':
            print("%s:\t%s" % (stamp, e), file=sys.stderr)
            print("badpdf")
        else:
            print("%s: " % stamp, end='', file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
16
04464ee31d66
toward link extractions from pdf
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
23 |
22
38bab758e469
accept filenames on stdin,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
16
diff
changeset
|
# Script driver.
#
#   plinks.py FILE   process a single PDF
#   plinks.py -      read one file name per line from stdin, process each
#                    one in turn and delete it afterwards
if len(sys.argv) < 2:
    # Without this guard a missing argument surfaces as a bare IndexError.
    sys.exit("usage: plinks.py (FILE | -)")

if sys.argv[1] == '-':
    for i, line in enumerate(sys.stdin):
        print(i, file=sys.stderr)  # running count of names read so far
        f = line.rstrip()
        if os.path.getsize(f) == 1048576:  # truncated
            # A file of exactly 1048576 bytes is reported as truncated and
            # is not handed to run().
            print("truncated", file=sys.stderr)
            print("truncated")
        else:
            run(f)
        # NOTE(review): the nesting of this unlink could not be recovered
        # from the flattened listing; it is placed at loop level here so
        # that every file named on stdin is removed once handled, truncated
        # or not.  Confirm against the original indentation.
        os.unlink(f)
else:
    run(sys.argv[1])