Mercurial > hg > cc > cirrus_home
comparison bin/reExtract.sh @ 42:8ff97ea0ba2c
refactored, not tested
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 03 Apr 2020 19:04:06 +0100 |
parents | 64227ff87e4e |
children | 4b574613200c |
comparison
equal
deleted
inserted
replaced
41:64227ff87e4e | 42:8ff97ea0ba2c |
---|---|
24 do | 24 do |
25 mkdir -p $s/logs | 25 mkdir -p $s/logs |
26 cd $s | 26 cd $s |
27 # Sigh, shouldn't have used this in the extraction ... | 27 # Sigh, shouldn't have used this in the extraction ... |
28 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \ | 28 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \ |
29 cut -f 2 -d / cut -f 1 -d _) | 29 cut -f 2 -d / |cut -f 1 -d _) |
30 egrep "\b$s\b" in.txt |cut -f 1,4 | \ | 30 egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u | \ |
31 while read e i | 31 while read e # this could be parallel |
32 do | 32 do |
33 id=${p}-00$i | 33 egrep "^$e\b$s\b" ../in.txt|cut -f 4 | \ |
34 lf=logs/${jobid}_${i}_log | 34 while read i # this could be parallel |
35 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf | 35 do |
36 done | 36 id=${p#CC-MAIN-*}-00$i |
| | 37 lf=logs/${jobid}_${i}_log |
| | 38 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf |
| | 39 done |
| | 40 # now compare (logs? ls vs. tar -tvf ?) |
| | 41 done |
| | 42 cd .. |
37 done | 43 done |