comparison bin/doPlinks.sh @ 6:0f494c76a887

refactor to address tarred-up pdfs
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sun, 23 Feb 2020 16:48:34 +0000
parents a4b0359456bc
children 25ca3505b4d7
comparison
legend: equal | deleted | inserted | replaced
left: 5:a28d731977da | right: 6:0f494c76a887
1 #!/usr/bin/bash 1 #!/usr/bin/bash
2 mkdir -p /dev/shm/x 2 hn=$1
3 plinks.py $1 3 jn=$2
4 # while read f 4 tfn=$3
5 # do 5
6 # if plinks.py $f > /dev/shm/x/links_${me}_${mine} 2>/dev/null 6 mkdir -p /dev/shm/x$hn/${tfn}
7 # then 7 cd /dev/shm/x$hn/${tfn}
8 # ((mine+=1)) 8 tar -xf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/${tfn}.tar '*.pdf'
9 # else 9 echo $(date) $hn $(pwd) untarred ${tfn}.tar for job $jn
10 # echo $f >> /dev/shm/x/badpdfs_$me 10 ls *.pdf | sort --field-separator=_ -k1,1 -k2n,2 | cat -n |\
11 # rm -f /dev/shm/x/links_${me}_${mine} 11 plinks.py $tfn || { echo $(date) $hn aborted job $jn for $tfn \[remember to clean up\] ; rm -f /dev/shm/stopJob ; exit 1 ; }
12 # fi 12 echo $(date) $hn tarring results from job $jn for $tfn in $(pwd)
13 # done 13 tar -cf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/links/${tfn}.tar badpdfs_${tfn} links_${tfn}_*
14 rsync -a /dev/shm/x/ links 14 rm *
15 cd ..
16 rmdir ${tfn}
17 echo $(date) $hn finished job ${jn} for ${tfn}