# HG changeset patch # User Henry S. Thompson # Date 1582626881 0 # Node ID 7a93e190c74df3b37ba7f736e92f2690633c2e49 # Parent 3b56c2c9d0eed03ac94f39fae334fb10197d908a logging tweaks, preparing for timeout on problem pdfs diff -r 3b56c2c9d0ee -r 7a93e190c74d bin/doPlinks.sh --- a/bin/doPlinks.sh Mon Feb 24 12:16:10 2020 +0000 +++ b/bin/doPlinks.sh Tue Feb 25 10:34:41 2020 +0000 @@ -11,8 +11,7 @@ plinks.py $tfn || { echo $(date) $hn aborted job $jn for $tfn \[remember to clean up\] ; rm -f /dev/shm/stopJob ; exit 1 ; } echo $(date) $hn tarring $(ls badpdfs_*|wc -l)/$(ls links_*_*|wc -l) results from job $jn for $tfn in $(pwd) 1>&2 tar -cf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/links/${tfn}.tar badpdfs_${tfn} links_${tfn}_* -echo $(date) $(pwd) rm $(ls -lt badpdfs_*) -echo . . . $(ls -lt links_*_* | tee >(tail -1 1>&2) | head -1) 1>&2 +echo $(date) $(pwd) rm $(ls -lt badpdfs_*) 1>&2 rm * cd .. echo $(date) $(pwd) rmdir ${tfn} 1>&2 diff -r 3b56c2c9d0ee -r 7a93e190c74d bin/plinks.py --- a/bin/plinks.py Mon Feb 24 12:16:10 2020 +0000 +++ b/bin/plinks.py Tue Feb 25 10:34:41 2020 +0000 @@ -5,7 +5,7 @@ def run(file): global pdf - pdf=pdfx.PDFx(file) + pdf=pdfx.PDFx(file,limit=30) return pdf.get_references_as_dict() tarnum=sys.argv[1] diff -r 3b56c2c9d0ee -r 7a93e190c74d bin/plinks.sh --- a/bin/plinks.sh Mon Feb 24 12:16:10 2020 +0000 +++ b/bin/plinks.sh Tue Feb 25 10:34:41 2020 +0000 @@ -4,5 +4,5 @@ h=$(hostname) hn=${h##*n} if [ $hn -eq 0 ]; then echo {005..008}; else echo {009..012}; fi |\ -tr ' ' '\012' |parallel --will-cite -j 4 -N 1 doPlinks.sh ${hn} '{#}' '{}' +tr ' ' '\012' |parallel --will-cite -n 1 doPlinks.sh ${hn} '{#}' '{}' echo $(date) $(hostname) $?