annotate workers/bin/timedWhich.sh @ 33:4c117ee8ed75

fixDates, _fixAndMerge, _doFetch: towards rework of date fixup; share.sh, old_invoke.sh: recover the old approach to sharing, which works
author Henry S. Thompson <ht@markup.co.uk>
date Tue, 20 Nov 2018 14:49:07 +0000
parents 60d4042dab26
children c2b72d29a3ee
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
17
2a2c1fb03c54 first cut at http/https real trial, with month and year last-modified info too
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
#!/bin/bash
# timedWhich.sh: test script that splits Common Crawl WAT files across
# worker processes to tabulate http vs. https by last-modified date.
#
# Usage: [echo file file_id] | timedWhich.sh id home [-t] numWorkerProcesses
#   id                  instance id: names the result directory
#                       /var/data/res<id> and, with -t, is the start-up
#                       delay in seconds
#   home                ssh destination the results are sent to on exit
#   -t                  no random wait, just id seconds
#   numWorkerProcesses  job count handed to parallel
# NOTE(review): the usage line shows the file list arriving on stdin, but
# the main loop reads it from ifile.txt -- confirm which is current.
# TODO: remove >>errs once tested
# Strict mode is currently disabled:
#set -e -o pipefail

# Leave our process id behind in test1.pid
printf '%s\n' "$$" > test1.pid

# First two positional arguments: instance id and ssh destination
proc=$1
home=$2
res=/var/data/res$proc
# Drop them so that the optional -t (or the worker count) is now $1
shift 2
2a2c1fb03c54 first cut at http/https real trial, with month and year last-modified info too
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
# lrand N
# Cheap, low-quality random number generator: prints one integer in 1..N.
# A single random byte (0-255) is reduced modulo N, so no value above 256 is
# ever produced and the distribution is slightly uneven unless N divides 256.
#   N  upper bound, a positive decimal integer
# Returns 2, printing nothing on stdout, if N is missing or not a positive
# integer; otherwise returns 0.
function lrand {
  local limit=$1
  local byte
  if ! [[ $limit =~ ^[0-9]+$ ]] || (( 10#$limit < 1 )); then
    echo "lrand: expected a positive integer, got '$limit'" 1>&2
    return 2
  fi
  # od -An -tu1 prints the byte as a bare unsigned decimal: no address
  # column to cut away and no dependence on the machine's byte order.
  byte=$(openssl rand 1 2>/dev/null | od -An -tu1 | tr -d ' \n')
  # Without a usable openssl, fall back to the kernel RNG, then to bash's own
  [[ -n $byte ]] || byte=$(od -An -N1 -tu1 /dev/urandom 2>/dev/null | tr -d ' \n')
  [[ -n $byte ]] || byte=$(( RANDOM % 256 ))
  # 10# keeps a zero-padded N such as 08 from being read as octal
  echo $(( 1 + byte % 10#$limit ))
}
2a2c1fb03c54 first cut at http/https real trial, with month and year last-modified info too
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
# Start-up delay: with -t it is simply this instance's id in seconds,
# otherwise whatever lrand 60 hands back.
case "$1" in
  -t)
    shift
    pause=$proc
    ;;
  *)
    pause=$(lrand 60)
    ;;
esac

# Whatever argument is first now is the number of worker processes
wp=$1

# Marker file; it is removed again at the very end of the script
touch .running
2a2c1fb03c54 first cut at http/https real trial, with month and year last-modified info too
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
# tryRead URL
# Stream the gzipped file at URL through the filter chain and into doit,
# making at most 5 attempts.
#   URL  address handed to curl
# Only lines that contain msgtype=response and also match
# "WARC-Target-URI":"http: or "WARC-Target-URI":"https: (case-insensitively)
# reach doit.
# pipefail is on while the attempts run, so a failure in any stage (download,
# decompression, not a single msgtype=response line, doit) makes the attempt
# count as failed.  It is switched off again on every way out, including the
# give-up path.
# After each failed attempt the per-stage exit codes go to stderr, followed
# by a sleep of `lrand 10` seconds and a retry notice.
# Returns 0 as soon as one attempt succeeds, 1 after 5 failed attempts.
function tryRead {
  local url=$1
  local attempt
  local -a stages
  set -o pipefail
  for (( attempt = 1; attempt <= 5; attempt++ )); do
    if curl -s --insecure -o - "$url" | zcat | grep -F msgtype=response |
      { grep -E -i '"WARC-Target-URI":"https?:' || : ; } | doit
    then
      set +o pipefail
      return 0
    else
      # Has to be read first thing: any later command overwrites PIPESTATUS
      stages=("${PIPESTATUS[@]}")
    fi
    echo "${stages[*]}" 1>&2
    # try to avoid lockstep retries
    sleep "$(lrand 10)"
    echo $(date) retry number $attempt 1>&2
  done
  echo "tried to read $url 5 times w/o success, giving up" 1>&2
  set +o pipefail
  return 1
}
2a2c1fb03c54 first cut at http/https real trial, with month and year last-modified info too
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
# EXIT handler: send everything collected under /var/data to $home over ssh,
# then clear up locally.
# set -e -o pipefail is switched on first, so a failing step (the cd, the
# tar | ssh transfer, either rm) ends the handler there; in particular
# nothing is deleted unless the transfer reported success.
# On the remote side the archive is unpacked into data/which, with stderr
# appended to errs in data.
_shipAndTidy () {
  set -e -o pipefail
  cd /var/data
  tar -czhf - CC* res* |
    ssh -o StrictHostKeyChecking=no -q $home $'{ cd data\nmkdir -p which\ncd which\ntar -xzf - ; } 2>>errs'
  rm -rf res* CC*
  # Back to the home directory for the remaining files
  cd
  rm ifile.txt *.pid
  # nohup.cc is removed from a background subshell after a 5 second delay
  ( sleep 5 ; rm nohup.cc ) &
}
trap _shipAndTidy EXIT
2a2c1fb03c54 first cut at http/https real trial, with month and year last-modified info too
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
# Exit status of the main loop, written to the log once the loop is over
pRes=0

# Per-instance result directory and the log file kept inside it
log="$res/log"
mkdir -p "$res"

# Don't all start at once
sleep $pause

# Begin a fresh log with a timestamp line
echo \# $(date) > "$log"
21
a214fd3a8001 use /var/data, don't store but include subproc _inside_ tryread
Henry S. Thompson <ht@markup.co.uk>
parents: 18
diff changeset
# doit
# Final stage of tryRead's pipeline: fans the lines arriving on stdin out to
# $wp parallel jobs.
# Reads the globals log, id, wp, res and cci.
# First appends "# <date> <id> " to $log without a newline; the wc -l fed by
# tee then appends the number of input lines (presumably to complete that
# same log line).
# parallel deals the input out round-robin in blocks of about 2M; {#} is
# parallel's job sequence number, so job k runs _timedWhich.sh k, writing
# stdout to $res/$cci.k and appending stderr to $res/errsk.
# NOTE(review): when the pipeline fails, a "ppfailed" line goes to stderr and
# the function still returns 0, so the caller does not see the failure --
# confirm that this is intended.
doit () {
  printf '%s' "# $(date) $id " >> $log
  tee >(wc -l >> $log) |
    parallel --round-robin --pipe --block-size 2M -j $wp \
      "_timedWhich.sh {#} > $res/$cci.{#} 2>>$res/errs{#}" ||
    echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
}
17
2a2c1fb03c54 first cut at http/https real trial, with month and year last-modified info too
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
# _cciName PATH
# Print the short result name for one input path: PATH is split on '/' and
# '-', and pieces 3, 4, 8 and 13 are joined with '-'.
_cciName () {
  printf '%s\n' "$1" | tr '/-' '  ' | awk -v OFS=- '{print $3, $4, $8, $13}'
}

# Main loop: ifile.txt holds one "path id" pair per line.  For each pair the
# file is fetched and processed by tryRead, then the per-worker outputs
# $res/<name>.* are concatenated into /var/data/<name>.
# stderr of the whole loop is appended to $res/errs.
# pRes ends up holding the most recent non-zero status from tryRead, from the
# concatenation, or from the loop itself (e.g. ifile.txt cannot be opened),
# so a failure on any input shows up in the log line written afterwards.
while read -r s id || [[ -n $s ]]; do
  # A blank line names no file
  [[ -n $s ]] || continue
  url="https://commoncrawl.s3.amazonaws.com/$s"
  cci=$(_cciName "$s")
  # Exported for child processes -- presumably read by _timedWhich.sh
  export ID=$id
  echo \# $(date) "running |$proc|$home|$pause|$wp|$id|" >> "$log"
  # Experimental retry loop
  tryRead "$url" || pRes=$?
  # Still run after a failed tryRead, so whatever partial per-worker output
  # exists is kept; the failure itself is recorded in pRes
  cat "$res/$cci".* > "/var/data/$cci" || pRes=$?
done < ifile.txt 2>> "$res/errs" || pRes=$?
echo \# $(date) main loop exit code=$pRes >> "$log"
rm .running
2a2c1fb03c54 first cut at http/https real trial, with month and year last-modified info too
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
77
2a2c1fb03c54 first cut at http/https real trial, with month and year last-modified info too
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
78