comparison bin/reExtract.sh @ 42:8ff97ea0ba2c

refactored, not tested
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 03 Apr 2020 19:04:06 +0100
parents 64227ff87e4e
children 4b574613200c
comparison
legend: equal | deleted | inserted | replaced
left (old): 41:64227ff87e4e | right (new): 42:8ff97ea0ba2c
24 do 24 do
25 mkdir -p $s/logs 25 mkdir -p $s/logs
26 cd $s 26 cd $s
27 # Sigh, shouldn't have used this in the extraction ... 27 # Sigh, shouldn't have used this in the extraction ...
28 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \ 28 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
29 cut -f 2 -d / cut -f 1 -d _) 29 cut -f 2 -d / |cut -f 1 -d _)
30 egrep "\b$s\b" in.txt |cut -f 1,4 | \ 30 egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u | \
31 while read e i 31 while read e # this could be parallel
32 do 32 do
33 id=${p}-00$i 33 egrep "^$e\b$s\b" ../in.txt|cut -f 4 | \
34 lf=logs/${jobid}_${i}_log 34 while read i # this could be parallel
35 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf 35 do
36 done 36 id=${p#CC-MAIN-*}-00$i
37 lf=logs/${jobid}_${i}_log
38 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
39 done
40 # now compare (logs? ls vs. tar -tvf ?)
41 done
42 cd ..
37 done 43 done