Mercurial > hg > cc > cirrus_home
diff bin/reExtract.sh @ 42:8ff97ea0ba2c
refactored, not tested
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 03 Apr 2020 19:04:06 +0100 |
parents | 64227ff87e4e |
children | 4b574613200c |
line wrap: on
line diff
--- a/bin/reExtract.sh Fri Apr 03 17:35:17 2020 +0100
+++ b/bin/reExtract.sh Fri Apr 03 19:04:06 2020 +0100
@@ -26,12 +26,18 @@
   cd $s
   # Sigh, shouldn't have used this in the extraction ...
   jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
-   cut -f 2 -d / cut -f 1 -d _)
-  egrep "\b$s\b" in.txt |cut -f 1,4 | \
-   while read e i
-   do
-    id=${p}-00$i
-    lf=logs/${jobid}_${i}_log
-    unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
-   done
+   cut -f 2 -d / |cut -f 1 -d _)
+  egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u | \
+   while read e # this could be parallel
+   do
+    egrep "^$e\b$s\b" ../in.txt|cut -f 4 | \
+     while read i # this could be parallel
+     do
+      id=${p#CC-MAIN-*}-00$i
+      lf=logs/${jobid}_${i}_log
+      unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
+     done
+    # now compare (logs? ls vs. tar -tvf ?)
+   done
+  cd ..
 done