Mercurial > hg > cc > cirrus_home
annotate bin/reExtract.sh @ 43:4b574613200c
added computation of required additions to tar file, but not actually added
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sat, 04 Apr 2020 15:31:58 +0100 |
parents | 8ff97ea0ba2c |
children | abc1b05996c9 |
rev | line source |
---|---|
39
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/bash |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 # Usage: reExtract.sh 20..-.. < files... |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 # Input is a list of relative paths of warc files |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 # under /beegfs/common_crawl/CC-MAIN-$1 |
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 |
43
4b574613200c
added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
42
diff
changeset
|
# sus [sort-args-or-files...]
# Frequency table ("sort | uniq -c | sort"): sorts the named files (or stdin
# when none are given), counts identical lines, and prints "COUNT LINE" rows
# ordered by descending count.
sus() {
  sort "$@" \
    | uniq -c \
    | sort -k1,1nr
}
4b574613200c
added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
42
diff
changeset
|
10 |
39
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# edex NAME
# Print (line number - 1) of the line in ~/by11n.txt that contains the warc
# file index taken from NAME.
#
# NAME may be a full warc file name (the index is the 5th '-'-separated
# field, up to the first '.') or a bare index.  A five-digit index such as
# 00123 is reduced to its last three digits; a three-digit index such as 005
# is used unchanged, so that it is not shortened to "5" and matched against
# unrelated lines.
#
# NOTE(review): presumably ~/by11n.txt holds one three-digit index per line
# and the result is the number N of the matching extract_N.tar -- confirm.
# Prints -1 when nothing matches; more than one matching line makes the
# arithmetic expansion fail, as before.
edex() {
  local idx
  idx=$(printf '%s\n' "$1" \
    | cut -f 5 -d - \
    | cut -f 1 -d . \
    | sed 's/^00\([0-9]\{3\}\)$/\1/')
  # grep -n prefixes every hit with "LINE:"; keep only the line number.
  # grep -F replaces the obsolescent fgrep alias.
  echo $(($(grep -n -F -e "$idx" ~/by11n.txt | cut -f 1 -d :) - 1))
}
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
16 |
43
4b574613200c
added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
42
diff
changeset
|
# join_by DELIM [ITEM...]
# Print the ITEMs on stdout joined by DELIM, with no trailing newline.
# Prints nothing when no ITEM is given.
# Adapted from https://stackoverflow.com/a/17841619/2595465
join_by() {
  local d=${1-} item
  # Nothing to join: only a delimiter (or nothing at all) was passed.
  (( $# > 1 )) || return 0
  shift
  # printf rather than echo -n, so a first item such as "-n" or "-e" is
  # printed literally instead of being taken as an echo option.
  printf '%s' "$1"
  shift
  # Explicit loop rather than "${@/#/$d}": from bash 5.2 an unquoted '&' in
  # the replacement stands for the matched text, which would silently drop
  # '&' from the delimiter.
  for item in "$@"; do
    printf '%s%s' "$d" "$item"
  done
}
4b574613200c
added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
42
diff
changeset
|
25 |
39
822cfbf134d3
towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# ---- main script ----------------------------------------------------------
# $1 selects the crawl directory; the list of warc paths is read from stdin.
: "${1:?Usage: reExtract.sh 20..-.. < files...}"
h=/beegfs/common_crawl/CC-MAIN-$1

# All scratch output goes under /dev/shm/rex.  Stop if we cannot get there,
# otherwise in.txt and the per-segment directories land in the caller's cwd.
mkdir -p /dev/shm/rex || exit 1
cd /dev/shm/rex || exit 1

# Stage 1: split each input path "SEG/PREFIX-00III.warc.gz" into segment,
# name prefix and index, and write in.txt with tab-separated columns:
#   <edex result> <segment> <prefix> <index>
sed 's/\// /;s/-00/ /;s/.warc.gz//' \
  | while read -r s p i; do
    printf "%s\t%s\t%s\t%s\n" "$(edex "$i")" "$s" "$p" "$i"
  done > in.txt

# Stage 2: once per distinct (segment, prefix) pair, re-run the extraction
# and work out which of the resulting files are missing from the tar files.
cut -f 2,3 in.txt | sort -u \
  | while read -r s p; do
    mkdir -p "$s/logs"
    # Skip the segment rather than run the extraction in the wrong
    # directory (the "cd .." below would then also leave the work area).
    cd "$s" || {
      echo "cannot cd to $s" 1>&2
      continue
    }
    # Sigh, shouldn't have used this in the extraction ...
    # Log members are named logs/<jobid>_...; recover <jobid> from the
    # first logs/ entry of extract_0.tar.
    jobid=$(tar -tf "${h}/${s}/extract_0.tar" logs/ | head -1 \
      | cut -f 2 -d / | cut -f 1 -d _)
    grep -E "\b$s\b" ../in.txt | cut -f 1 | sort -u \
      | while read -r e; do # this could be parallel
        lsf=lsl${e}.txt
        rm -f "$lsf"
        # Globbing off: the patterns collected into ii must stay literal
        # when the command substitution is word-split into the array.
        set -f
        ii=($(grep -E "^$e\\s$s\\b" ../in.txt | cut -f 4 \
          | while read -r i; do # this could be parallel
            # globbing back on in this subshell for the ls below
            set +f
            id=${p#CC-MAIN-*}-00$i
            lf=logs/${jobid}_${i}_log
            unpigz -dp 1 -c "${h}/${s}/CC-MAIN-${id}.warc.gz" \
              | "$HOME/lib/valhalla/bin/warc.sh" "${id}" application/pdf 2>> "$lf"
            # record size and name of each file matching this id
            ls -l "${id}"_* | tr -s ' ' '\011' | cut -f 5,9 >> "$lsf"
            set -f
            echo "*-00${i}_*"
          done))
        # now compare ls vs. tar
        # NOTE(review): the members passed to tar are glob patterns; GNU tar
        # documents --no-wildcards as the default for member arguments, so
        # confirm the tar in use (or TAR_OPTIONS) matches them as patterns.
        tar -tvf "${h}/${s}/extract_${e}.tar" "${ii[@]}" \
          | tr -s ' ' '\011' | cut -f 3,6 | sort -k2.1,2.36 -k2.38,2n \
          | diff -bw - <(sort -k2.1,2.36 -k2.38,2n "$lsf") > "${e}_diff.txt"
        # Tally the first token of every diff line (hunk headers and markers).
        cut -f 1 -d ' ' "${e}_diff.txt" | sus > "${e}_check.txt"
        ni=${#ii[@]}
        # Exactly ni + 1 distinct tokens are accepted ...
        if (( $(wc -l < "${e}_check.txt") != ni + 1 )); then
          echo "extra lines in ${e}_check.txt" 1>&2
          break
        fi
        # ... and exactly ni of them may contain an "a".
        if (( $(grep -F -c a "${e}_check.txt") != ni )); then
          echo "non-addition lines in ${e}_check.txt" 1>&2
          break
        fi
        # Names (second tab field) of the lines only present in the ls output.
        grep -E '^> ' "${e}_diff.txt" | cut -f 2 > "${e}_new.txt"
        set +f
      done
    cd ..
  done