annotate bin/reExtract.sh @ 43:4b574613200c

added computation of required additions to tar file, but not actually added
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sat, 04 Apr 2020 15:31:58 +0100
parents 8ff97ea0ba2c
children abc1b05996c9
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
39
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/bash
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 # Usage: reExtract.sh 20..-.. < files...
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 # Input (stdin) is a list of warc file paths, given relative to their location
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 # under /beegfs/common_crawl/CC-MAIN-$1
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5
43
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
# sus: frequency table ("sort | uniq -c | sort -nr") of input lines.
#
# Arguments: passed unchanged to the first sort(1), so they may be files
#            and/or sort options; with no file argument stdin is read.
# Outputs:   one "<count> <line>" row per distinct line, highest count first.
sus() {
  sort "$@" \
    | uniq -c \
    | sort -k1nr,1
}
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
10
39
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# edex: print the zero-based index of the index-file line that mentions a
# WARC file's number (i.e. the matching line number minus one).
#
# Arguments:
#   $1 - a WARC number ("123" or "00123") or a whole WARC file name of the
#        form CC-MAIN-<a>-<b>-00123.warc.gz
#   $2 - optional index file to search (default: ~/by11n.txt)
# Outputs:
#   the index on stdout; "-1" when no line matches (as before)
# Returns:
#   0 on exactly one matching line, 1 otherwise (diagnostic on stderr)
edex() {
  local index=~/by11n.txt
  if [[ -n ${2-} ]]; then
    index=$2
  fi

  # Reduce the argument to the lookup key: 5th '-'-separated field (the
  # whole string when there is no '-'), up to the first '.'.  The leading
  # "00" is dropped only when at least three characters follow it, so a
  # value that is already short ("007") keeps its zeros instead of
  # collapsing to "7" and matching many lines.
  # NOTE(review): assumes the index file holds 3-digit, zero-padded
  # numbers -- confirm against ~/by11n.txt.
  local key
  key=$(printf '%s\n' "$1" \
    | cut -f 5 -d - \
    | cut -f 1 -d . \
    | sed 's/^00\(...\)/\1/')

  # Line numbers of every index line containing the key as a fixed string.
  local -a hits
  mapfile -t hits < <(grep -F -n -e "$key" -- "$index" | cut -f 1 -d :)

  case ${#hits[@]} in
    0)
      printf 'edex: "%s" not found in %s\n' "$key" "$index" >&2
      # Keep the historical "-1" on stdout so column-based callers
      # still receive a value; the exit status now signals the miss.
      printf '%s\n' -1
      return 1
      ;;
    1)
      printf '%s\n' "$((hits[0] - 1))"
      ;;
    *)
      printf 'edex: "%s" matches %d lines of %s\n' \
        "$key" "${#hits[@]}" "$index" >&2
      return 1
      ;;
  esac
}
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
16
43
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
# join_by: join the remaining arguments with a delimiter.
# Adapted from https://stackoverflow.com/a/17841619/2595465
#
# Arguments:
#   $1   - delimiter (any string, may be empty or multi-character)
#   $2.. - elements to join
# Outputs: the joined string on stdout, without a trailing newline;
#          nothing when no elements are given.
join_by() {
  local d=${1-}
  # Nothing to join: only a delimiter (or nothing at all) was supplied.
  if (( $# < 2 )); then
    return 0
  fi
  shift

  # printf rather than "echo -n": an element such as "-n" or "-e" must be
  # printed, not taken as an echo option.
  printf '%s' "$1"
  shift

  # Emit "<delimiter><element>" explicitly instead of ${@/#/$d}, whose
  # replacement text gives '&' a special meaning on newer bash releases.
  local item
  for item in "$@"; do
    printf '%s%s' "$d" "$item"
  done
}
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
25
39
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
26 h=/beegfs/common_crawl/CC-MAIN-$1
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
27
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
28 mkdir -p /dev/shm/rex
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
29 cd /dev/shm/rex
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
30
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
31 sed 's/\// /;s/-00/ /;s/.warc.gz//' | \
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
32 while read s p i
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
33 do
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
34 printf "%s\t%s\t%s\t%s\n" $(edex $i) $s $p $i
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
35 done > in.txt
41
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
36 cut -f 2,3 in.txt| sort -u | \
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
37 while read s p
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
38 do
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
39 mkdir -p $s/logs
39
822cfbf134d3 towards re-running extraction in part
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
40 cd $s
41
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
41 # Sigh, shouldn't have used this in the extraction ...
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
42 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
42
8ff97ea0ba2c refactored, not tested
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 41
diff changeset
43 cut -f 2 -d / |cut -f 1 -d _)
8ff97ea0ba2c refactored, not tested
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 41
diff changeset
44 egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u | \
8ff97ea0ba2c refactored, not tested
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 41
diff changeset
45 while read e # this could be parallel
8ff97ea0ba2c refactored, not tested
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 41
diff changeset
46 do
43
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
47 lsf=lsl${e}.txt
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
48 rm -f $lsf
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
49 set -f
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
50 ii=($(egrep "^$e\\s$s\\b" ../in.txt|cut -f 4 | \
42
8ff97ea0ba2c refactored, not tested
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 41
diff changeset
51 while read i # this could be parallel
8ff97ea0ba2c refactored, not tested
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 41
diff changeset
52 do
43
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
53 set +f
42
8ff97ea0ba2c refactored, not tested
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 41
diff changeset
54 id=${p#CC-MAIN-*}-00$i
8ff97ea0ba2c refactored, not tested
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 41
diff changeset
55 lf=logs/${jobid}_${i}_log
8ff97ea0ba2c refactored, not tested
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 41
diff changeset
56 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
43
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
57 ls -l ${id}_* | tr -s ' ' '\011' |cut -f 5,9 >> $lsf
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
58 set -f
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
59 echo "*-00${i}_*"
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
60 done))
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
61 # now compare ls vs. tar
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
62 tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
63 tr -s ' ' '\011' |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
64 - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
65 cut -f 1 -d ' ' ${e}_diff.txt | sus > ${e}_check.txt
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
66 ni=${#ii[@]}
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
67 if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
68 then
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
69 echo "extra lines in ${e}_check.txt" 1>&2
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
70 break
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
71 fi
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
72 if [ $(fgrep -c a ${e}_check.txt) -ne $ni ]
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
73 then
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
74 echo "non-addition lines in ${e}_check.txt" 1>&2
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
75 break
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
76 fi
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
77 egrep '^> ' ${e}_diff.txt | cut -f 2 > ${e}_new.txt
4b574613200c added computation of required additions to tar file, but not actually added
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 42
diff changeset
78 set +f
42
8ff97ea0ba2c refactored, not tested
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 41
diff changeset
79 done
8ff97ea0ba2c refactored, not tested
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 41
diff changeset
80 cd ..
41
64227ff87e4e done through re-extraction, fixing tars still to come
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 40
diff changeset
81 done