comparison bin/plinks.py @ 22:38bab758e469

Accept filenames on stdin (when the argument is '-'), treat a file of exactly 1 MiB (1048576 bytes) as truncated, and always produce some output even if no links are found
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 29 Sep 2022 16:36:52 +0100
parents 04464ee31d66
children
comparison
equal deleted inserted replaced
21:cbac7dfe2f24 22:38bab758e469
1 #!/usr/bin/env python3 1 #!/usr/bin/env python3
2 import sys,pdfx,traceback 2 import sys,pdfx,traceback,os
3 from datetime import datetime 3 from datetime import datetime
4 4
5 def run(file): 5 def run(file):
6 global pdf 6 try:
7 pdf=pdfx.PDFx(file) 7 pdf=pdfx.PDFx(file)
8 return pdf.get_references_as_dict() 8 links=pdf.get_references_as_dict()
9 if bool(links) and (links.get('scrape',False) or
10 links.get('annot',False)):
11 for k in links.keys():
12 for l in links[k]:
13 print("%s\t%s"%(k,l))
14 else:
15 print("None")
16 except Exception as e:
17 if str(e)=='Unexpected EOF':
18 print("%s:\t%s"%(datetime.now().isoformat(),e),file=sys.stderr)
19 print("badpdf")
20 else:
21 print("%s: "%(datetime.now().isoformat()),end='',file=sys.stderr)
22 traceback.print_exc(file=sys.stderr)
9 23
10 f=sys.argv[1] 24 if sys.argv[1]=='-':
11 25 i=0
12 try: 26 for l in sys.stdin:
13 links=run(f) 27 print(i,file=sys.stderr)
14 if bool(links) and (links.get('scrape',False) or 28 i+=1
15 links.get('annot',False)): 29 f=l.rstrip()
16 for k in links.keys(): 30 if os.path.getsize(f)==1048576: # truncated
17 for l in links[k]: 31 print("truncated",file=sys.stderr)
18 print("%s\t%s"%(k,l)) 32 print("truncated")
19 except Exception as e: 33 else:
20 if str(e)=='Unexpected EOF': 34 run(f)
21 print("%s:\t%s"%(datetime.now().isoformat(),e),file=sys.stderr) 35 os.unlink(f)
22 else: 36 else:
23 print("%s: "%(datetime.now().isoformat()),end='',file=sys.stderr) 37 run(sys.argv[1])
24 traceback.print_exc(file=sys.stderr)
25