Mercurial > hg > cc > cirrus_work
changeset 16:04464ee31d66
toward link extractions from pdf
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sun, 07 Aug 2022 13:56:49 +0100 |
parents | a9763cd18949 |
children | 75e0d0013da0 |
files | bin/plinks.py |
diffstat | 1 files changed, 25 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
```diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/plinks.py Sun Aug 07 13:56:49 2022 +0100
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+import sys,pdfx,traceback
+from datetime import datetime
+
+def run(file):
+    global pdf
+    pdf=pdfx.PDFx(file)
+    return pdf.get_references_as_dict()
+
+f=sys.argv[1]
+
+try:
+    links=run(f)
+    if bool(links) and (links.get('scrape',False) or
+                        links.get('annot',False)):
+        for k in links.keys():
+            for l in links[k]:
+                print("%s\t%s"%(k,l))
+except Exception as e:
+    if str(e)=='Unexpected EOF':
+        print("%s:\t%s"%(datetime.now().isoformat(),e),file=sys.stderr)
+    else:
+        print("%s: "%(datetime.now().isoformat()),end='',file=sys.stderr)
+        traceback.print_exc(file=sys.stderr)
+
```