Mercurial > hg > cc > azure
annotate workers/bin/findPDFs.sh @ 13:bba589cab837
shrinkJSON.sh: minimise "jq ." output
test1.sh: fix lrand regression
[_]findPDFs.sh: extract responses with application/pdf Content-Type
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Fri, 12 Oct 2018 08:51:50 +0000 |
parents | be1034183e03 |
children |
rev | line source |
---|---|
12
be1034183e03
scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
#!/bin/bash
# Test script: split Common Crawl WAT files across worker processes
#  and find the application/pdf responses in them.
# Usage: findPDFs.sh id home [-t] numWorkerProcesses
#   The "file file_id" pairs to process are read, one per line, from
#   ifile.txt in the current directory (the original usage note showed
#   them being echoed into a pipe; this script itself only reads ifile.txt).
#   If -t, no random wait, just id seconds
# remove >>errs once tested
#set -e -o pipefail

# Record our process id; the exit handler removes *.pid again
printf '%s\n' "$$" > test1.pid

# First two positional arguments: worker id and ssh destination for results
proc="$1"
home="$2"
# Per-worker results directory name, e.g. res3 for id 3
res="res${proc}"
shift 2
13
bba589cab837
shrinkJSON.sh: minimise "jq ." output
Henry S. Thompson <ht@markup.co.uk>
parents:
12
diff
changeset
|
function lrand {
  # cheap bad little random number generator
  # Usage: lrand N
  # Prints an integer in the range 1..N.  Normally a single random byte
  # from openssl is reduced modulo N, so N should not exceed 256 and the
  # result is slightly biased unless N divides 256.
  local byte
  # od -An -tu1 prints the byte as an unsigned decimal with no offset
  # column; tr strips the padding so an empty result is detectable.
  byte=$(openssl rand 1 | od -An -tu1 | tr -d ' \n')
  # If openssl is missing or fails nothing is produced.  Fall back to
  # bash's own generator instead of returning the same value every time,
  # which would put all the workers back in lockstep.
  [ -n "$byte" ] || byte=$RANDOM
  echo $(( 1 + (byte % $1) ))
}
12
be1034183e03
scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
# Decide how long to wait before starting work:
#  with -t (test mode) the wait is deterministic, the worker id in seconds;
#  otherwise it is a random 1-60 seconds.
case "$1" in
  -t)
    shift
    pause=$proc
    ;;
  *)
    pause=$(lrand 60)
    ;;
esac
# Next argument: number of parallel worker processes
wp=$1
# Marker file, present for as long as the main loop is running
touch .running
be1034183e03
scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
# tryRead URL OUTFILE
# Fetch the gzipped file at URL, keep the lines that contain
# msgtype=response and whose "WARC-Target-URI" is an http(s) URL, and write
# them to OUTFILE (overwritten on every attempt).
# An attempt fails if any stage of the pipeline fails (pipefail); finding
# no matching target URIs is not a failure.  Up to 5 attempts are made,
# each failure being followed by a random 1-10 second pause.
# Returns 0 on success, 1 after the fifth failed attempt.  In the latter
# case OUTFILE holds whatever the last attempt managed to write.
# pipefail is switched off again on every return path.
function tryRead {
  local url=$1 out=$2
  local m=0 rc=1
  set -o pipefail
  while [ $((m+=1)) -le 5 ]
  do
    if curl -s --insecure -o - "$url" | zcat | grep -F msgtype=response | \
         { grep -E -i '"WARC-Target-URI":"https?:' || : ;} > "$out"
    then
      rc=0
      break
    else
      # Still inside the if, so PIPESTATUS is that of the pipeline above
      echo ${PIPESTATUS[@]} 1>&2
      # try to avoid lockstep retries
      sleep $(lrand 10)
      echo $(date) $out retry number $m 1>&2
    fi
  done
  # Restore the default before returning, whether or not we succeeded
  set +o pipefail
  if [ $rc -ne 0 ]
  then
    echo "tried to read $url 5 times w/o success, giving up" 1>&2
  fi
  return $rc
}
be1034183e03
scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
# Exit handler: send the results home over ssh, then clean up the work area.
# Written as a function rather than a double-quoted trap string, so that
# $res and $home are expanded when the handler runs and the remote command
# needs no escaped quotes.
function shipResults {
  local st
  #set -e -o pipefail
  # NOTE(review): nohup.cc is neither matched by CC* nor located under
  # $res, so it does not look as if it ends up in the archive -- confirm
  # what this link is for.
  ln -s ../nohup.cc .
  # tar -h archives what symlinks point to rather than the links.  The
  # remote side unpacks into data/pdf/wat, appending its stderr to errs.
  tar -czhf - CC* "$res" | \
    ssh -o StrictHostKeyChecking=no -q "$home" "{ cd data
      mkdir -p pdf/wat
      cd pdf/wat
      tar -xzf - ; } 2>>errs"
  st=("${PIPESTATUS[@]}")
  # Report a failed transfer instead of discarding the results silently.
  # The cleanup below still runs, as it always did.
  if [ "${st[0]}" -ne 0 ] || [ "${st[1]}" -ne 0 ]
  then
    echo "upload to $home failed (tar=${st[0]} ssh=${st[1]})" 1>&2
  fi
  rm -rf "$res" CC* ifile.txt *.pid
  # Remove the nohup.cc link a little later, in the background
  ( sleep 5 ; rm nohup.cc ) &
}
trap shipResults EXIT
be1034183e03
scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
# Main loop: for every "path id" line of ifile.txt, fetch that WAT file,
# filter it with tryRead, and fan the filtered lines out over $wp copies of
# _findPDFs.sh using GNU parallel.  Each job appends to $res/<name>.<job#>;
# the pieces are then concatenated into a file called <name> in the current
# directory.  Progress lines go to $res/log, stderr of the loop to $res/errs.
mkdir -p "$res"
log=$res/log
# Don't all start at once
sleep "$pause"
echo \# $(date) > "$log"
pRes=0
# -r: take the lines literally, without backslash processing
while read -r s id
do
  url="https://commoncrawl.s3.amazonaws.com/$s"
  # File name: everything after the last /wat/ ...
  ccm=${s##*/wat/}
  # ... cut at the first -ip, if there is one
  cci=${ccm%%-ip*}
  export ID=$id
  echo \# $(date) "running |$proc|$home|$pause|$wp|$id|" >> "$log"
  # Experimental retry loop
  if ! tryRead "$url" "crawl$id"
  then
    # tryRead gave up: whatever is in crawl$id is the output of a failed
    # attempt, so drop it rather than process truncated data.  No output
    # file is produced for this input.
    echo "crawl$id unreadable, skipping" 1>&2
    rm -f "crawl$id"
    continue
  fi
  if [ -s "crawl$id" ]
  then
    echo \# $(date) $id $(wc -l crawl$id) >> "$log"
    parallel --round-robin --pipe --block-size 2M -j $wp "_findPDFs.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" < "crawl$id" || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
  else
    echo "crawl$id empty" 1>&2
  fi
  rm -f "crawl$id"
  cat "$res/$cci".* > "$cci"
done < ifile.txt 2>> "$res/errs" || pRes=$?
echo \# $(date) main loop exit code=$pRes >> "$log"
rm .running
be1034183e03
scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
78 |