#!/bin/bash
# Test script to split CC WAT files across worker processes
# to tabulate http vs. https by last-modified date.
#
# Usage: [echo file file_id] | timedWhich.sh id home [-t] numWorkerProcesses
#   id                  used as the suffix of the result directory
#                       /var/data/res<id>, and as the start-up delay in
#                       seconds when -t is given
#   home                ssh destination the results are shipped to on exit
#   -t                  no random wait, just <id> seconds
#   numWorkerProcesses  number of parallel worker jobs
#
# remove >>errs once tested
#set -e -o pipefail

# Refuse to start without the two mandatory leading arguments; otherwise
# "shift 2" fails silently and the rest of the script runs with empty values.
# This check comes before the pid file is written so a rejected invocation
# leaves nothing behind.
if [ "$#" -lt 2 ]; then
  echo "Usage: ${0##*/} id home [-t] numWorkerProcesses" 1>&2
  exit 2
fi

# Record our pid (the file is removed again by the exit handler's "rm *.pid").
echo $$ > test1.pid
proc=$1
res=/var/data/res$proc
home=$2
shift 2
#######################################
# Cheap little random number generator: print an integer in 1..$1.
# Arguments: $1 - upper bound (positive integer)
# Outputs:   the random number on stdout; a diagnostic on stderr on bad input
# Returns:   0 on success, 1 if the bound is missing or not a positive integer
#######################################
lrand() {
  local bound=$1 raw
  if ! [[ $bound =~ ^[0-9]+$ ]] || (( 10#$bound < 1 )); then
    echo "lrand: upper bound must be a positive integer, got '$bound'" 1>&2
    return 1
  fi
  # Four random bytes rendered as one unsigned decimal. -An drops od's offset
  # column, so no head/tr/cut post-processing is needed.
  raw=$(openssl rand 4 2>/dev/null | od -An -tu4 | tr -d ' \n')
  # If openssl is unavailable or produced nothing, fall back to bash's own
  # generator (two 15-bit draws combined) instead of failing in the arithmetic.
  [[ $raw =~ ^[0-9]+$ ]] || raw=$(( (RANDOM << 15) | RANDOM ))
  echo $(( 1 + 10#$raw % 10#$bound ))
}
# Start-up delay: with -t it is exactly <id> seconds (deterministic, for
# testing); otherwise a random 1..60 seconds so that the processes do not
# all start at once.
if [ "$1" = "-t" ]; then
  shift
  pause=$proc
else
  pause=$(lrand 60)
fi

# Number of worker processes. It is later used as a divisor and as the
# job count for parallel, so reject anything but a positive integer here
# rather than failing halfway through the run.
wp=$1
if ! [[ $wp =~ ^[0-9]+$ ]] || (( 10#$wp < 1 )); then
  echo "numWorkerProcesses must be a positive integer, got '$wp'" 1>&2
  exit 2
fi

# Marker file that exists for as long as the main work is in progress
# (removed at the end of the script).
touch .running
#######################################
# Read "URL OUTPUT_PATH" pairs from stdin, one per line. For each pair,
# fetch URL with curl, decompress it with zcat and feed it to
# _timedWhich.py, whose stdout is written to OUTPUT_PATH. A failed attempt
# is retried after a short random sleep, at most 5 attempts per URL.
# Arguments: none used (any argument is ignored)
# Outputs:   progress and retry diagnostics on stderr
# Returns:   0 when every pair was processed; 1 as soon as one URL has
#            failed 5 times (remaining input is then left unread)
# Note:      pipefail is switched on around each fetch and switched off
#            again afterwards, including on the give-up path.
#######################################
tryread() {
  local u o m
  while read -r u o; do
    m=0
    # Any failing stage (curl, zcat or _timedWhich.py) must count as a
    # failed attempt, not just the last one.
    set -o pipefail
    until
      if [ $((m += 1)) -gt 5 ]; then
        echo " tried 5 times w/o success, giving up" 1>&2
        set +o pipefail
        return 1
      fi &&
        echo -n \# $(date) "reading $u ..." 1>&2 &&
        curl -s -S --max-time 60 --insecure -o - "$u" |
        (
          # Report the time once decompression has actually finished,
          # while still handing zcat's exit status to the pipeline.
          zcat
          zstatus=$?
          echo "done at " $(date) 1>&2
          exit "$zstatus"
        ) |
        _timedWhich.py > "$o"
    do
      # Must stay the first command here: PIPESTATUS still holds the
      # per-stage exit codes of the failed fetch pipeline.
      echo \# ${PIPESTATUS[@]} 1>&2
      # try to avoid lockstep retries
      sleep $(lrand 10)
      echo \# $(date) retry number $m 1>&2
    done
    set +o pipefail
  done
}
#######################################
# Exit handler: ship everything matching CC* and res* under /var/data to
# the remote host as a gzip'ed tar stream (unpacked there into data/which),
# then delete the local copies and the bookkeeping files.
# Globals:   home (read) - ssh destination
# Returns:   aborts at the first failing step (set -e -o pipefail), so
#            local data is only deleted after a successful transfer.
#######################################
ship_results_and_cleanup() {
  set -e -o pipefail
  cd /var/data
  # -h: archive the files that symlinks point to rather than the links.
  # The remote steps are chained with && so that a failed "cd data" or
  # "mkdir" makes ssh (and, via pipefail, this pipeline) fail instead of
  # unpacking the archive into the wrong directory.
  tar -czhf - CC* res* |
    ssh -o StrictHostKeyChecking=no -q "$home" \
      '{ cd data && mkdir -p which && cd which && tar -xzf - ; } 2>>errs'
  rm -rf -- res* CC*
  # Back to the home directory for the bookkeeping files.
  cd
  rm ifile.txt *.pid
  # Remove nohup.cc a little later, in the background, after this shell
  # has exited.
  ( sleep 5 ; rm nohup.cc ) &
}
trap ship_results_and_cleanup EXIT
# Per-process result directory and log file.
mkdir -p "$res"
log=$res/log
# Don't all start at once
sleep "$pause"
echo \# $(date) > "$log"
pRes=0

# Number of input paths, and how many of them each worker job receives.
# Round up so that the input is split into at most $wp chunks, and never
# ask parallel for 0 records per job (which plain N / wp does whenever
# there are fewer input lines than workers).
N=$(wc -l < ifile.txt)
perJob=$(( (N + wp - 1) / wp ))
if (( perJob < 1 )); then
  perJob=1
fi

# The worker jobs run in fresh "bash -c" shells, so the functions they
# need must be exported.
export -f tryread lrand

# Turn every path in ifile.txt into a "URL OUTPUT_PATH" line and let
# parallel hand those lines, $perJob at a time, to tryread on stdin.
# {#} is parallel's job sequence number, giving each job its own errs file.
while read -r s; do
  url="https://commoncrawl.s3.amazonaws.com/$s"
  # Output name: fields 3, 4, 8 and 13 of the path after splitting it on
  # '/' and '-', re-joined with '-'.
  cci=$(printf '%s\n' "$s" | tr '/-' ' ' | awk '{print $3,$4,$8,$13}' | tr ' ' \-)
  printf '%s %s\n' "$url" "/var/data/$cci"
done < ifile.txt 2>> "$res/errs" |
  parallel --pipe -N"$perJob" -j "$wp" "bash -c \"tryread 2>>$res/errs{#}\"" 2>> "$res/errs" || pRes=$?

echo \# $(date) main loop exit code=$pRes >> "$log"
rm .running