Mercurial > hg > cc > cirrus_home
annotate bin/reExtract.sh @ 42:8ff97ea0ba2c
refactored, not tested
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 03 Apr 2020 19:04:06 +0100 |
parents | 64227ff87e4e |
children | 4b574613200c |
rev | line source |
---|---|
39
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/bash |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 # Usage: reExtract.sh 20..-.. < files... |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 # Input is a list of relative paths to warc files |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 # under /beegfs/common_crawl/CC-MAIN-$1 |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 function edex () { |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 echo "$1" | \ |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 cut -f 5 -d - | cut -f 1 -d .|sed 's/^00//' | \ |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 echo $(($(fgrep -n -f - ~/by11n.txt | cut -f 1 -d :) - 1)) |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 } |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 h=/beegfs/common_crawl/CC-MAIN-$1 |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 mkdir -p /dev/shm/rex |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
15 cd /dev/shm/rex |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
16 |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
17 sed 's/\// /;s/-00/ /;s/.warc.gz//' | \ |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
18 while read s p i |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
19 do |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
20 printf "%s\t%s\t%s\t%s\n" $(edex $i) $s $p $i |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
21 done > in.txt |
41
64227ff87e4e
done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
22 cut -f 2,3 in.txt| sort -u | \ |
64227ff87e4e
done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
23 while read s p |
64227ff87e4e
done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
24 do |
64227ff87e4e
done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
25 mkdir -p $s/logs |
39
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
26 cd $s |
41
64227ff87e4e
done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
27 # Sigh, shouldn't have used this in the extraction ... |
64227ff87e4e
done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
28 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \ |
42 | 29 cut -f 2 -d / |cut -f 1 -d _) |
30 egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u | \ | |
31 while read e # one pass per distinct column-1 value that in.txt lists for segment $s; this could be parallel | |
32 do | |
33 egrep "^$e[[:space:]]$s\b" ../in.txt|cut -f 4 | \ | |
34 while read i # in.txt columns are tab-separated, so a tab (not \b) sits between $e and $s above; this could be parallel | |
35 do | |
36 id=${p#CC-MAIN-*}-00$i | |
37 lf=logs/${jobid}_${i}_log # stderr of the warc.sh run for file $i is appended to this log | |
38 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf | |
39 done | |
40 # now compare (logs? ls vs. tar -tvf ?) | |
41 done | |
42 cd .. | |
41
64227ff87e4e
done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
40
diff
changeset
|
43 done |