# HG changeset patch
# User Henry S. Thompson
# Date 1585931717 -3600
# Node ID 64227ff87e4ebaa58b95de235639172201f52b6e
# Parent  0d1f6c971d5e89669241a0ffe7575346fc94d798
done through re-extraction, fixing tars still to come

diff -r 0d1f6c971d5e -r 64227ff87e4e bin/reExtract.sh
--- a/bin/reExtract.sh	Thu Apr 02 19:21:21 2020 +0100
+++ b/bin/reExtract.sh	Fri Apr 03 17:35:17 2020 +0100
@@ -19,12 +19,22 @@
 do
   printf "%s\t%s\t%s\t%s\n" $(edex $i) $s $p $i
 done > in.txt
-for s in $(cut -f 2 in.txt| sort -u)
-do
-  mkdir -p $s
+cut -f 2,3 in.txt| sort -u | \
+ while read s p
+ do
+  mkdir -p $s/logs
   cd $s
-  for e in $(egrep "\b$s\b" in.txt |cut -f 1 | sort -u)
+  # Sigh, shouldn't have used this in the extraction ...
+  # jobid: take the first logs/ entry listed in extract_0.tar, keep its
+  #  second '/'-separated component, and cut that at the first '_'
+  jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
+		cut -f 2 -d / | cut -f 1 -d _)
+  # e, i: columns 1 and 4 of the in.txt rows matching $s; warc.sh stderr is appended to a per-$i log
+  egrep "\b$s\b" in.txt |cut -f 1,4 | \
+  while read e i
   do
-    tar -xf $h/$s/extract_$e.tar [files for p i in $(egrep "^$e\s$s\b" in.txt | cut -f 3,4)
-
-
+   id=${p}-00$i
+   lf=logs/${jobid}_${i}_log
+   unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
+  done
+ done