Mercurial > hg > cc > azure
comparison workers/bin/test1.sh @ 8:5db6015689a2
slightly updated, 4 W only, versions of old test4.sh and subcount.sh
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Mon, 01 Oct 2018 18:29:39 +0000 |
parents | |
children | 2fbefb8d1a9e |
comparison
equal
deleted
inserted
replaced
7:a7637c994964 | 8:5db6015689a2 |
---|---|
1 #!/bin/bash | |
2 # Test script to split CC WAT files across threads | |
3 # to count http: vs. https: | |
4 # Usage: [echo file file_id] | test1.sh id home | |
5 # remove >>errs once tested | |
6 #set -e -o pipefail | |
# Record this shell's process id in test1.pid.
printf '%s\n' "$$" > test1.pid

# Positional arguments (see the Usage line above): worker id, then the
# ssh destination used when the results are shipped.
proc="$1"
home="$2"

# Name of this worker's results directory.
res="res${proc}"

# Marker file; it is removed again as the last step of a complete run.
touch .running
# lrand N
# Cheap, low-quality random number generator: prints one integer in 1..N.
# Arguments: $1 - upper bound, a positive decimal integer
# Outputs:   the number on stdout; a diagnostic on stderr for a bad argument
# Returns:   0 on success, 1 if $1 is not a positive integer
# The value comes from a single random byte (0..255), so results are not
# uniform and never exceed 256 unless the $RANDOM fallback is used.
function lrand {
  local limit=$1 byte
  if ! [[ $limit =~ ^[1-9][0-9]*$ ]]
  then
    echo "lrand: expected a positive integer, got '$limit'" 1>&2
    return 1
  fi
  # One byte from openssl, rendered by od as a bare unsigned decimal
  # (-An drops the offset column).  openssl's own stderr is discarded on
  # purpose: a missing or failing openssl is handled by the fallback below.
  byte=$(openssl rand 1 2>/dev/null | od -An -tu1 | tr -d ' \n')
  # Without this fallback a failed openssl would make every call return
  # the same value, which defeats the purpose of randomising delays.
  [[ $byte =~ ^[0-9]+$ ]] || byte=$RANDOM
  echo $(( 1 + byte % limit ))
}
# tryRead URL OUTFILE
# Fetch the gzipped file at URL and write to OUTFILE only those lines that
# contain msgtype=response and a "WARC-Target-URI" starting http: or https:
# (matched case-insensitively).  Makes up to 5 attempts.
# Arguments: $1 - URL to download
#            $2 - file to (over)write with the selected lines
# Outputs:   per-attempt pipeline statuses and retry notices on stderr
# Returns:   0 once an attempt succeeds; 1 after 5 failed attempts, in
#            which case OUTFILE is left empty
# Uses lrand (defined elsewhere in this script) for the retry delay.
function tryRead {
  local url=$1 out=$2 attempt=0
  # pipefail is needed so a failing curl/zcat is noticed; remember the
  # caller's setting so it can be put back on every exit path.
  local restore='set +o pipefail'
  [[ -o pipefail ]] && restore='set -o pipefail'
  set -o pipefail
  while [ $((attempt+=1)) -le 5 ]
  do
    # A file with no msgtype=response line at all counts as a failed
    # attempt (grep -F exits 1); having no matching URI lines is tolerated
    # via the '|| :'.
    if curl -s --insecure -o - "$url" | zcat | grep -F msgtype=response | \
         { grep -E -i '"WARC-Target-URI":"https?:' || : ;} > "$out"
    then
      $restore
      return 0
    else
      # Must be the first command here so PIPESTATUS still describes
      # the pipeline above.
      echo ${PIPESTATUS[@]} 1>&2
    fi
    # try to avoid lockstep retries
    sleep $(lrand 10)
    echo $(date) "$out" retry number $attempt 1>&2
  done
  echo "tried to read $url 5 times w/o success, giving up" 1>&2
  # The last failed attempt may have written part of the data; empty the
  # file so a caller testing it for content does not use a partial result.
  : > "$out"
  $restore
  return 1
}
# shipResults: EXIT handler.
# Packs the contents of the results directory $res (plus nohup.cc, pulled
# in through a symlink because tar -h follows links) and unpacks them on
# host $home under data/jobs/test1.$proc; remote stderr is appended to
# errs.1 there.  Afterwards changes to the home directory, removes $res and,
# after a 5 second delay in the background, nohup.cc.
# Globals: res, home, proc (read)
function shipResults {
  #set -e -o pipefail
  if cd "$res"
  then
    ln -s ../nohup.cc .
    # The remote steps are chained with && so a failed cd/mkdir cannot
    # cause the archive to be unpacked in the wrong directory.
    tar -czhf - * | \
      ssh -o StrictHostKeyChecking=no -q "$home" "{ cd data/jobs/ &&
 mkdir -p test1.$proc &&
 cd test1.$proc &&
 tar -xzf - ; } 2>>errs.1"
  else
    # Without this guard tar would pack whatever directory we happen to
    # be in.
    echo "cannot cd to $res, nothing shipped" 1>&2
  fi
  cd
  rm -rf "$res"
  ( sleep 5 ; rm nohup.cc ) &
}
trap shipResults EXIT
# Main part: for every "<path> <id>" line of ifile.txt, fetch the
# corresponding file, spread its selected lines over 4 parallel count1.sh
# jobs, then merge the per-job output into $res/tots.
# Uses lrand and tryRead, which are defined earlier in this script.
mkdir -p "$res"
log=$res/log
# Don't all start at once
sleep $(lrand 60)
echo \# $(date) > "$log"
pRes=0
# -r keeps any backslash in the input line literal.
while read -r s id
do
  url="https://commoncrawl.s3.amazonaws.com/$s"
  export ID=$id
  echo $(date) "running |$*|$id|" >> "$log"
  # Experimental retry loop
  tryRead "$url" "crawl$id"
  # Only a non-empty download is worth counting.
  if [ -s "crawl$id" ]
  then
    echo \# "$id" $(wc -l "crawl$id") >> "$log"
    # Each parallel job appends to its own output and error file, named
    # after parallel's {#} replacement string.
    parallel --round-robin --pipe -j 4 "count1.sh >> $res/{#} 2>>$res/errs{#}" < "crawl$id" || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
  else
    echo "crawl$id empty" 1>&2
  fi
  rm "crawl$id"
done < ifile.txt 2>> "$res/errs" || pRes=$?
# Merge the per-job files (names starting with a digit 1-9), dropping any
# line that contains '#', and count identical lines.
( cd "$res" && grep -F -h -v \# [1-9]* ) | sort | uniq -c | tr -d \" > "$res/tots"
echo \# $(date) main loop exit code=$pRes >> "$log"
rm .running