changeset 10:a33db8e3f51c

Bigger run (inputs 013..112, parallel -j 30), longer PDFx limit (30 -> 60); report files whose processing was limited
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 25 Feb 2020 14:56:36 +0000
parents 7a93e190c74d
children b0d9fe66ce8a
files bin/plinks.py bin/plinks.sh
diffstat 2 files changed, 8 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/bin/plinks.py	Tue Feb 25 10:34:41 2020 +0000
+++ b/bin/plinks.py	Tue Feb 25 14:56:36 2020 +0000
@@ -5,8 +5,8 @@
 
 def run(file):
   global pdf
-  pdf=pdfx.PDFx(file,limit=30)
-  return pdf.get_references_as_dict()
+  pdf=pdfx.PDFx(file,limit=60)
+  return (pdf.get_references_as_dict(),pdf.limited)
 
 tarnum=sys.argv[1]
 print(tarnum, sys.argv, os.getcwd(),file=sys.stderr)
@@ -15,7 +15,10 @@
   for l in sys.stdin:
     (fno,f)=l.split()
     try:
-      links=run(f)
+      (links,limited)=run(f)
+      if limited:
+        print("%s\t%s\tProcessing limited after timeout"%(
+          datetime.now().isoformat(),fno),file=bf)
       if bool(links) and (links.get('scrape',False) or
                           links.get('annot',False)):
         gf+=1
--- a/bin/plinks.sh	Tue Feb 25 10:34:41 2020 +0000
+++ b/bin/plinks.sh	Tue Feb 25 14:56:36 2020 +0000
@@ -3,6 +3,6 @@
 echo $(date) $(hostname)
 h=$(hostname)
 hn=${h##*n}
-if [ $hn -eq 0 ]; then echo {005..008}; else echo {009..012}; fi |\
-tr ' ' '\012' |parallel --will-cite -n 1 doPlinks.sh ${hn} '{#}' '{}'
+if [ $hn -eq 0 ]; then echo {013..062}; else echo {063..112}; fi |\
+tr ' ' '\012' |parallel --will-cite -j 30 -N 1 doPlinks.sh ${hn} '{#}' '{}'
 echo $(date) $(hostname) $?