Mercurial > hg > cc > cirrus_home
comparison bin/reExtract.sh @ 41:64227ff87e4e
done through re-extraction, fixing tars still to come
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 03 Apr 2020 17:35:17 +0100 |
parents | 0d1f6c971d5e |
children | 8ff97ea0ba2c |
comparison
Legend: equal, deleted, inserted, replaced
40:0d1f6c971d5e | 41:64227ff87e4e |
---|---|
17 sed 's/\// /;s/-00/ /;s/.warc.gz//' | \ | 17 sed 's/\// /;s/-00/ /;s/.warc.gz//' | \ |
18 while read s p i | 18 while read s p i |
19 do | 19 do |
20 printf "%s\t%s\t%s\t%s\n" $(edex $i) $s $p $i | 20 printf "%s\t%s\t%s\t%s\n" $(edex $i) $s $p $i |
21 done > in.txt | 21 done > in.txt |
22 for s in $(cut -f 2 in.txt| sort -u) | 22 cut -f 2,3 in.txt| sort -u | \ |
23 do | 23 while read s p |
24 mkdir -p $s | 24 do |
| | 25 mkdir -p $s/logs |
25 cd $s | 26 cd $s |
26 for e in $(egrep "\b$s\b" in.txt |cut -f 1 | sort -u) | 27 # Sigh, shouldn't have used this in the extraction ... |
| | 28 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \ |
| | 29 cut -f 2 -d / cut -f 1 -d _) |
| | 30 egrep "\b$s\b" in.txt |cut -f 1,4 | \ |
| | 31 while read e i |
27 do | 32 do |
28 tar -xf $h/$s/extract_$e.tar [files for p i in $(egrep "^$e\s$s\b" in.txt | cut -f 3,4) | 33 id=${p}-00$i |
29 | 34 lf=logs/${jobid}_${i}_log |
30 | 35 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf |
| | 36 done |
| | 37 done |