Mercurial > hg > cc > cirrus_home
changeset 6:0f494c76a887
refactor to address tarred-up pdfs
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sun, 23 Feb 2020 16:48:34 +0000 |
parents | a28d731977da |
children | 25ca3505b4d7 |
files | bin/doPlinks.sh bin/plinks.py bin/plinks.sh |
diffstat | 3 files changed, 28 insertions(+), 21 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/doPlinks.sh Wed Feb 19 10:41:59 2020 +0000
+++ b/bin/doPlinks.sh Sun Feb 23 16:48:34 2020 +0000
@@ -1,14 +1,17 @@
 #!/usr/bin/bash
-mkdir -p /dev/shm/x
-plinks.py $1
-# while read f
-# do
-# if plinks.py $f > /dev/shm/x/links_${me}_${mine} 2>/dev/null
-# then
-# ((mine+=1))
-# else
-# echo $f >> /dev/shm/x/badpdfs_$me
-# rm -f /dev/shm/x/links_${me}_${mine}
-# fi
-# done
-rsync -a /dev/shm/x/ links
+hn=$1
+jn=$2
+tfn=$3
+
+mkdir -p /dev/shm/x$hn/${tfn}
+cd /dev/shm/x$hn/${tfn}
+tar -xf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/${tfn}.tar '*.pdf'
+echo $(date) $hn $(pwd) untarred ${tfn}.tar for job $jn
+ls *.pdf | sort --field-separator=_ -k1,1 -k2n,2 | cat -n |\
+plinks.py $tfn || { echo $(date) $hn aborted job $jn for $tfn \[remember to clean up\] ; rm -f /dev/shm/stopJob ; exit 1 ; }
+echo $(date) $hn tarring results from job $jn for $tfn in $(pwd)
+tar -cf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/links/${tfn}.tar badpdfs_${tfn} links_${tfn}_*
+rm *
+cd ..
+rmdir ${tfn}
+echo $(date) $hn finished job ${jn} for ${tfn}
```
--- a/bin/plinks.py Wed Feb 19 10:41:59 2020 +0000 +++ b/bin/plinks.py Sun Feb 23 16:48:34 2020 +0000 @@ -7,20 +7,24 @@ pdf=pdfx.PDFx(file) return pdf.get_references_as_dict() -me=sys.argv[1] -with open('/dev/shm/x/badpdfs_%s'%me,'w') as bf: +tarnum=sys.argv[1] +with open('badpdfs_%s'%tarnum,'w') as bf: for l in sys.stdin: - (fno,f)=l.rstrip().split() + (fno,f)=l.split() try: links=run(f) if bool(links) and (links.get('scrape',False) or links.get('annot',False)): - with open('/dev/shm/x/links_%s'%fno,'w') as of: + with open('links_%s_%s'%(tarnum,fno),'w') as of: for k in links.keys(): for l in links[k]: print("%s\t%s"%(k,l),file=of) except Exception as e: - print("%s\t%s"%(fno,e),file=bf) + if str(e)=='Unexpected EOF': + print("%s\t%s\t%s"%(tarnum,fno,e),file=bf) + else: + traceback.print_exc(file=bf) + if (path.exists('/dev/shm/stopJob')): - print("Quiting early: %s %s"%(me,fno),file=sys.stderr) + print("Quiting early: %s %s"%(tarnum,fno),file=sys.stderr) exit(1)
```diff
--- a/bin/plinks.sh Wed Feb 19 10:41:59 2020 +0000
+++ b/bin/plinks.sh Sun Feb 23 16:48:34 2020 +0000
@@ -1,8 +1,8 @@
 #!/usr/bin/bash
 mkdir -p $TMPDIR
 echo $(date) $(hostname)
-cd /beegfs/common_crawl/CC-MAIN-2019-35/pdfs
 h=$(hostname)
 hn=${h##*n}
-parallel --will-cite -j 71 --pipepart -a lm/$(hostname)_pdfFilesWithLM doPlinks.sh ${hn}_'{#}'
+if [ $hn -eq 0 ]; then echo {001..002}; else echo {003..004}; fi |\
+tr ' ' '\012' |parallel --will-cite -j 71 -N 1 doPlinks.sh ${hn} '{#}' '{}'
 echo $(date) $(hostname)
```