annotate bin/reExtract.sh @ 41:64227ff87e4e

done through re-extraction, fixing tars still to come
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 03 Apr 2020 17:35:17 +0100
parents 0d1f6c971d5e
children 8ff97ea0ba2c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
39
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/bash
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 # Usage: reExtract.sh 20..-.. < files...
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 # Input is a list of relative paths to warc files
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 # under /beegfs/common_crawl/CC-MAIN-$1
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 function edex () {
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 echo "$1" | \
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 cut -f 5 -d - | cut -f 1 -d .|sed 's/^00//' | \
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9 echo $(($(fgrep -n -f - ~/by11n.txt | cut -f 1 -d :) - 1))
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10 }
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12 h=/beegfs/common_crawl/CC-MAIN-$1
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
13
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
14 mkdir -p /dev/shm/rex
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
15 cd /dev/shm/rex
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
16
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
17 sed 's/\// /;s/-00/ /;s/.warc.gz//' | \
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
18 while read s p i
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
19 do
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
20 printf "%s\t%s\t%s\t%s\n" $(edex $i) $s $p $i
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
21 done > in.txt
41
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
22 cut -f 2,3 in.txt| sort -u | \
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
23 while read s p
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
24 do
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
25 mkdir -p $s/logs
39
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
26 cd $s
41
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
27 # Sigh, shouldn't have used this in the extraction ...
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
28 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
29 cut -f 2 -d / | cut -f 1 -d _)
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
30 egrep "\b$s\b" in.txt |cut -f 1,4 | \
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
31 while read e i
39
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
32 do
41
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
33 id=${p}-00$i
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
34 lf=logs/${jobid}_${i}_log
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
35 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
36 done
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
37 done