changeset 9:7a93e190c74d

logging tweaks, preparing for timeout on problem pdfs
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 25 Feb 2020 10:34:41 +0000
parents 3b56c2c9d0ee
children a33db8e3f51c
files bin/doPlinks.sh bin/plinks.py bin/plinks.sh
diffstat 3 files changed, 3 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/bin/doPlinks.sh	Mon Feb 24 12:16:10 2020 +0000
+++ b/bin/doPlinks.sh	Tue Feb 25 10:34:41 2020 +0000
@@ -11,8 +11,7 @@
 plinks.py $tfn || { echo $(date) $hn aborted job $jn for $tfn \[remember to clean up\] ; rm -f /dev/shm/stopJob ; exit 1 ; }
 echo $(date) $hn tarring $(ls badpdfs_*|wc -l)/$(ls links_*_*|wc -l) results from job $jn for $tfn in $(pwd) 1>&2
 tar -cf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/links/${tfn}.tar badpdfs_${tfn} links_${tfn}_*
-echo $(date) $(pwd) rm $(ls -lt badpdfs_*)
-echo . . .  $(ls -lt links_*_* | tee >(tail -1 1>&2) | head -1) 1>&2
+echo $(date) $(pwd) rm $(ls -lt badpdfs_*) 1>&2
 rm *
 cd ..
 echo $(date) $(pwd) rmdir ${tfn} 1>&2
--- a/bin/plinks.py	Mon Feb 24 12:16:10 2020 +0000
+++ b/bin/plinks.py	Tue Feb 25 10:34:41 2020 +0000
@@ -5,7 +5,7 @@
 
 def run(file):
   global pdf
-  pdf=pdfx.PDFx(file)
+  pdf=pdfx.PDFx(file,limit=30)
   return pdf.get_references_as_dict()
 
 tarnum=sys.argv[1]
--- a/bin/plinks.sh	Mon Feb 24 12:16:10 2020 +0000
+++ b/bin/plinks.sh	Tue Feb 25 10:34:41 2020 +0000
@@ -4,5 +4,5 @@
 h=$(hostname)
 hn=${h##*n}
 if [ $hn -eq 0 ]; then echo {005..008}; else echo {009..012}; fi |\
-tr ' ' '\012' |parallel --will-cite -j 4 -N 1 doPlinks.sh ${hn} '{#}' '{}'
+tr ' ' '\012' |parallel --will-cite -n 1 doPlinks.sh ${hn} '{#}' '{}'
 echo $(date) $(hostname) $?