Mercurial > hg > cc > cirrus_home
changeset 9:7a93e190c74d
logging tweaks, preparing for timeout on problem pdfs
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 25 Feb 2020 10:34:41 +0000 |
parents | 3b56c2c9d0ee |
children | a33db8e3f51c |
files | bin/doPlinks.sh bin/plinks.py bin/plinks.sh |
diffstat | 3 files changed, 3 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/doPlinks.sh Mon Feb 24 12:16:10 2020 +0000
+++ b/bin/doPlinks.sh Tue Feb 25 10:34:41 2020 +0000
@@ -11,8 +11,7 @@
 plinks.py $tfn || { echo $(date) $hn aborted job $jn for $tfn \[remember to clean up\] ; rm -f /dev/shm/stopJob ; exit 1 ; }
 echo $(date) $hn tarring $(ls badpdfs_*|wc -l)/$(ls links_*_*|wc -l) results from job $jn for $tfn in $(pwd) 1>&2
 tar -cf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/links/${tfn}.tar badpdfs_${tfn} links_${tfn}_*
-echo $(date) $(pwd) rm $(ls -lt badpdfs_*)
-echo . . . $(ls -lt links_*_* | tee >(tail -1 1>&2) | head -1) 1>&2
+echo $(date) $(pwd) rm $(ls -lt badpdfs_*) 1>&2
 rm *
 cd ..
 echo $(date) $(pwd) rmdir ${tfn} 1>&2
--- a/bin/plinks.py Mon Feb 24 12:16:10 2020 +0000
+++ b/bin/plinks.py Tue Feb 25 10:34:41 2020 +0000
@@ -5,7 +5,7 @@
 
 def run(file):
   global pdf
-  pdf=pdfx.PDFx(file)
+  pdf=pdfx.PDFx(file,limit=30)
   return pdf.get_references_as_dict()
 
 tarnum=sys.argv[1]
--- a/bin/plinks.sh Mon Feb 24 12:16:10 2020 +0000
+++ b/bin/plinks.sh Tue Feb 25 10:34:41 2020 +0000
@@ -4,5 +4,5 @@
 h=$(hostname)
 hn=${h##*n}
 if [ $hn -eq 0 ]; then echo {005..008}; else echo {009..012}; fi |\
-tr ' ' '\012' |parallel --will-cite -j 4 -N 1 doPlinks.sh ${hn} '{#}' '{}'
+tr ' ' '\012' |parallel --will-cite -n 1 doPlinks.sh ${hn} '{#}' '{}'
 echo $(date) $(hostname) $?