Mercurial > hg > cc > cirrus_home
annotate bin/preExtract.sh @ 49:18f8bcc779e8
as running, modulo 1 log output wrong
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 10 Apr 2020 18:42:08 +0100 |
parents | 307e0c44925a |
children | 5de261eb0deb |
rev | line source |
---|---|
45
bd0010ac88ce
parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/bash |
bd0010ac88ce
parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 # Usage: preExtract.sh 20..-.. < files... |
bd0010ac88ce
parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 # Input is list of paths to relative path of warc files |
bd0010ac88ce
parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 # under /beegfs/common_crawl/CC-MAIN-$1 |
bd0010ac88ce
parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 |
bd0010ac88ce
parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# sus: "sort | uniq -c | sort" helper.
# Arguments: passed straight through to the first sort (files and/or sort
#            options); with no arguments the first sort reads stdin.
# Outputs:   one line per distinct input line, prefixed by its occurrence
#            count (uniq -c format), highest count first.
function sus () {
  sort "$@" \
    | uniq -c \
    | sort -k1nr,1
}
bd0010ac88ce
parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 |
bd0010ac88ce
parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# edex: print the zero-based position (grep line number minus one) of the
# line of ~/by11n.txt that contains $1 as a fixed string.
# Arguments: $1 - fixed string to look for
# Outputs:   the position on stdout; -1 when no line matches
# Returns:   1 (with a message on stderr and nothing on stdout) when more
#            than one line matches, since no single position can be computed
function edex () {
  local hits
  # -F: fixed string (replaces the obsolescent fgrep); -- and the quotes keep
  # an empty or dash-leading argument from being taken as an option or from
  # vanishing, which would make grep read stdin instead of the file.
  hits=$(grep -F -n -- "$1" ~/by11n.txt | cut -f 1 -d :)
  if [[ "$hits" == *$'\n'* ]]; then
    printf 'edex: "%s" matches more than one line of ~/by11n.txt\n' "$1" >&2
    return 1
  fi
  # No match leaves hits empty; default to 0 so the result is -1.
  echo $(( ${hits:-0} - 1 ))
}
bd0010ac88ce
parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 |
bd0010ac88ce
parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# join_by: print the remaining arguments joined by a delimiter.
# Adapted from https://stackoverflow.com/a/17841619/2595465
# Arguments: $1 - delimiter (any string); $2... - items to join
# Outputs:   the joined string on stdout, without a trailing newline;
#            nothing when there are no items
function join_by () {
  local d=$1 item
  shift
  (( $# )) || return 0
  # printf rather than echo -n, so a first item such as -n or -e is printed
  # instead of being taken as an echo option.
  printf '%s' "$1"
  shift
  # Explicit loop rather than a pattern substitution on "$@", so that a
  # delimiter containing & or a backslash is emitted literally.
  for item in "$@"; do
    printf '%s%s' "$d" "$item"
  done
}
bd0010ac88ce
parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
23 |
bd0010ac88ce
parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Root directory of the crawl being processed; $1 is the crawl id given on
# the command line.
h=/beegfs/common_crawl/CC-MAIN-$1

# All working files (in.txt and the per-segment directories) live under
# /dev/shm/rex.  Stop if we cannot get there, so nothing is written elsewhere.
mkdir -p /dev/shm/rex
cd /dev/shm/rex || exit 1

# Build in.txt from stdin.  sed turns the first "-00" of each line into a
# space and removes the ".warc.gz" suffix; read then splits the line into
# s, p and i (i takes whatever remains).  Each output row is tab-separated:
#   <edex of i> <s> <p> <i>
# Quoting keeps an empty edex result from shifting the columns, and edex is
# given /dev/null as stdin so it can never consume the lines meant for read.
sed 's/-00/ /;s/\.warc\.gz//' | \
  while read -r s p i
  do
    printf "%s\t%s\t%s\t%s\n" "$(edex "$i" < /dev/null)" "$s" "$p" "$i"
  done > in.txt
bd0010ac88ce
parallelised version of reExtract.sh
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Fan out over the distinct (column 2, column 3) pairs of in.txt: the pairs
# are flattened to one value per line and handed to GNU parallel two at a
# time (-N 2), at most 16 jobs at once.  Inside the job text {1} is the
# column-2 value (s) and {2} the column-3 value (p); h is interpolated by the
# outer shell, everything between the single quotes runs in the job shell.
#
# Each job works in ./$s and, for every distinct column-1 value e on the
# in.txt rows that mention s:
#   1. for every column-4 value i on the rows for (e, s), runs the warc.sh
#      extraction unless its log file logs/<jobid>_<i>_log is already
#      non-empty, and records size and name of the ${id}_* files in lsl<e>.txt;
#   2. diffs the matching member listing of extract_<e>.tar against
#      lsl<e>.txt;
#   3. if the summarised diff has the expected shape, deletes the old log
#      members from the tar and appends the new files plus the new logs.
#
# NOTE: the job text is single-quoted, so it must not contain an apostrophe,
# not even inside a comment.
cut -f 2,3 in.txt | sort -u | tr '\011' '\012' |\
  parallel --will-cite -j 16 -N 2 h="$h"'
  function sus () { sort "$@" | uniq -c | sort -k1nr,1 ; }
  s={1}
  p={2}
  mkdir -p $s/logs
  # Give up on this job if the working directory is unavailable, rather than
  # writing logs and extracted files into the shared parent directory.
  cd $s || exit 1
  echo $(date) starting $s/$p > log
  # Sigh, should not have used this in the extraction ...
  jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
            cut -f 2 -d / |cut -f 1 -d _)
  for e in $(grep -E "\b$s\b" ../in.txt |cut -f 1 | sort -u)
    # this could be parallel
  do
    echo $(date) begin extract: $e >> log
    lsf=lsl${e}.txt
    rm -f $lsf
    lff=()
    ii=()
    for i in $(grep -E "^$e\\s$s\\b" ../in.txt|cut -f 4)
      # this could be parallel
    do
      id=${p#CC-MAIN-*}-00$i
      echo " " "$id" >> log
      lf=logs/${jobid}_${i}_log
      lff+=("${lf}") # accumulate list of log files
      if [ -s $lf ]
      then
        echo " " $lf not empty, skipping extraction >> log
      else
        echo " " extracting from $id >> log
        echo starting ${id} $(date) > $lf
        unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
        echo finished ${id} $(date) >> $lf
      fi
      ls -l ${id}_* | tr -s " " "\t" |cut -f 5,9 >> $lsf
      ii+=("*-00${i}_*")
      echo " " "$i" "${ii[@]}" ${#ii[@]} >> log
    done
    # Progress messages go to the per-segment log like all the others.
    echo " " extractions done >> log
    # now compare ls vs. tar
    echo " " "${ii[@]}" ${#ii[@]} >> log
    echo " " lff "${lff[@]}" >> log
    tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \
      tr -s " " "\t" |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \
      - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt
    if [ -s ${e}_diff.txt ]
    then
      echo " " checking... >> log
      # Summarise the diff: count each distinct first field, i.e. the
      # ">", "<" and "---" markers and the hunk headers.
      cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt
      ni=${#ii[@]}
      if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
      then
        # Not the plain case of one ">" line plus one hunk header per input.
        # The only other summary accepted is the exact shape matched below
        # (presumably one change hunk plus one addition hunk; confirm
        # against real diffs).
        if [[ "$(tr -s "\n\t " " " < ${e}_check.txt)" =~ \
              ^" "[0-9]*" > 1 < 1 --- 1 "[0-9]*c[0-9]*" 1 "[0-9]*"a"[0-9,]*" "$ ]]
        then
          :
        else
          echo " " "extra lines in ${e}_check.txt" >> log
          # Skip this extract but stay in the segment directory: the next
          # pass of this loop still needs ./log and ../in.txt.
          continue
        fi
      elif [ $(grep -F -c a ${e}_check.txt) -ne $ni ]
      then
        echo " " "non-addition lines in ${e}_check.txt" >> log
        # As above: skip this extract without leaving the segment directory.
        continue
      fi
      echo " " starting tar update >> log
      # Names of the files that exist on disk but are missing from the tar.
      grep -E "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt
      # Replace the old log members and add the newly extracted files.
      tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}"
      tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}"
    else
      echo "no diff, no update" $e >> log
    fi
    echo end extract: $e >> log
  done
  cd ..
'