annotate bin/preExtract.sh @ 49:18f8bcc779e8

as running, modulo 1 log output wrong
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 10 Apr 2020 18:42:08 +0100
parents 307e0c44925a
children 5de261eb0deb
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
45
bd0010ac88ce parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/bash
bd0010ac88ce parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 # Usage: reExtract.sh 20..-.. < files...
bd0010ac88ce parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 # Input is list of paths to relative path of warc files
bd0010ac88ce parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 # under /beegfs/common_crawl/CC-MAIN-$1
bd0010ac88ce parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5
bd0010ac88ce parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# sus: Sort | Uniq -c | Sort.
# Prints a frequency table of the lines of the named files (or of stdin
# when no file is given): one "count line" row per distinct line, with
# the highest counts first.
# All arguments are handed to the first sort unchanged.
sus() {
  sort "$@" |
    uniq -c |
    sort -k1nr,1
}
bd0010ac88ce parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10
bd0010ac88ce parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# edex KEY
# Print the zero-based line number of the line of ~/by11n.txt that
# contains KEY.  KEY is matched as a fixed string anywhere in the line
# (grep -F), not as a regular expression and not as a whole field.
#   - no matching line:   prints -1
#   - several matches:    the arithmetic expansion fails with a bash
#                         error (unchanged from the previous version;
#                         presumably keys are unique in by11n.txt --
#                         TODO confirm)
# KEY is quoted and preceded by -- so that whitespace, glob characters
# or a leading dash cannot turn it into extra grep options or operands.
function edex () {
  local n
  n=$(grep -F -n -- "$1" ~/by11n.txt | cut -f 1 -d :)
  echo $((n - 1))
}
bd0010ac88ce parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
14
bd0010ac88ce parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# join_by DELIM [ITEM...]
# Print the ITEMs on one line, separated by DELIM, with no trailing
# newline.  DELIM may be any string, not just a single character.
# With no ITEM at all nothing is printed.
# Originally based on https://stackoverflow.com/a/17841619/2595465
#
# printf is used throughout instead of "echo -n" so that an item such
# as -n or -e is printed literally, and the delimiter is emitted by an
# explicit loop so that characters special to pattern substitution
# (e.g. & in recent bash) are never reinterpreted.
function join_by () {
  local d=${1-} item
  if [ "$#" -ge 2 ]
  then
    shift
    printf '%s' "$1"
    shift
    for item in "$@"
    do
      printf '%s%s' "$d" "$item"
    done
  fi
}
bd0010ac88ce parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
23
bd0010ac88ce parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# ----------------------------------------------------------------------
# Main script
#
# $1     crawl id (the 20..-.. part of CC-MAIN-20..-..)
# stdin  one line per warc file; after the sed below each line must
#        split into three whitespace-separated fields: segment, warc
#        name prefix and file index (the text after the first "-00").
#
# Stage 1 writes /dev/shm/rex/in.txt, a tab-separated table with the
# columns  <edex index> <segment> <prefix> <file index>.
# Stage 2 runs one GNU parallel job (16 at a time) per distinct
# <segment, prefix> pair.  Each job re-extracts the warc files whose
# per-file log is missing or empty, compares what is on disk with what
# is in the corresponding extract_N.tar, and updates the tar when the
# differences look like pure additions.
# ----------------------------------------------------------------------

if [ -z "$1" ]
then
  echo "Usage: $0 20..-.. < files..." >&2
  exit 2
fi
h=/beegfs/common_crawl/CC-MAIN-$1

# Work in shared memory.  Stop if we cannot get there: everything below
# uses relative paths.
mkdir -p /dev/shm/rex
cd /dev/shm/rex || exit 1

sed 's/-00/ /;s/.warc.gz//' | \
while read s p i
do
  printf "%s\t%s\t%s\t%s\n" $(edex $i) $s $p $i
done > in.txt

# The job body is a single-quoted string: comments inside it must not
# contain single quotes or GNU parallel replacement strings.  $h is
# spliced in from this shell; every other variable is expanded by the
# shell that runs the job.
cut -f 2,3 in.txt| sort -u | tr '\011' '\012' |\
parallel --will-cite -j 16 -N 2 h="$h"'
  # redefined here: functions of the calling shell are not visible in the job
  function sus () { sort "$@" | uniq -c | sort -k1nr,1 ; }
  # s: segment, p: warc name prefix (first and second argument of the job)
  s={1}
  p={2}
  mkdir -p $s/logs
  cd $s || exit 1
  echo $(date) starting $s/$p > log
  # Sigh, should not have used this in the extraction ...
  # jobid is the second path component of the first logs/ member of
  # extract_0.tar, cut at its first underscore
  jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
          cut -f 2 -d / |cut -f 1 -d _)
  # e ranges over the distinct first-column values of the in.txt rows
  # that mention this segment
  for e in $(grep -E "\b$s\b" ../in.txt |cut -f 1 | sort -u)
  # this could be parallel
  do
    echo $(date) begin extract: $e >> log
    lsf=lsl${e}.txt   # size+name listing of the files on disk for this extract
    rm -f $lsf
    lff=()            # log files belonging to this extract
    ii=()             # tar member patterns, one per file index
    for i in $(grep -E "^$e\\s$s\\b" ../in.txt|cut -f 4)
    # this could be parallel
    do
      id=${p#CC-MAIN-*}-00$i
      echo " " "$id" >> log
      lf=logs/${jobid}_${i}_log
      lff+=("${lf}") # accumulate list of log files
      if [ -s $lf ]
      then
        echo " " $lf not empty, skipping extraction >> log
      else
        echo " " extracting from $id >> log
        echo starting ${id} $(date) > $lf
        unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
        echo finished ${id} $(date) >> $lf
      fi
      # fields 5 and 9 of ls -l: presumably size and name -- TODO confirm
      ls -l ${id}_* | tr -s " " "\t" |cut -f 5,9 >> $lsf
      ii+=("*-00${i}_*")
      echo " " "$i" "${ii[@]}" ${#ii[@]} >> log
    done
    echo " " extractions done
    # now compare ls vs. tar
    echo " " "${ii[@]}" ${#ii[@]} >> log
    echo " " lff "${lff[@]}" >> log
    # both listings are sorted on characters 1-36 of field 2, then
    # numerically on field 2 from character 38 on, before being diffed
    tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \
      tr -s " " "\t" |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \
      - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt
    if [ -s ${e}_diff.txt ]
    then
      echo " " checking...
      # frequency table of the first space-separated token of each diff line
      cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt
      ni=${#ii[@]}
      if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
      then
        # Unexpected number of rows: tolerate only the one shape matched
        # below (apparently a single changed line plus a single added
        # range -- TODO confirm)
        if [[ "$(tr -s "\n\t " " " < ${e}_check.txt)" =~ \
              ^" "[0-9]*" > 1 < 1 --- 1 "[0-9]*c[0-9]*" 1 "[0-9]*"a"[0-9,]*" "$ ]]
        then
          :
        else
          echo " " "extra lines in ${e}_check.txt" >> log
          # Stay in the segment directory: the next extract still
          # needs ./log and ../in.txt to resolve from here
          continue
        fi
      elif [ $(grep -F -c a ${e}_check.txt) -ne $ni ]
      then
        echo " " "non-addition lines in ${e}_check.txt" >> log
        # as above, do not leave the segment directory before continuing
        continue
      fi
      echo " " starting tar update
      # names from the diff lines marked ">" are appended to the tar;
      # the log files are deleted from it first and then appended again
      grep -E "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt
      tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}"
      tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}"
    else
      echo "no diff, no update" $e >> log
    fi
    echo end extract: $e >> log
  done
  cd ..
'