annotate workers/bin/findPDFs.sh @ 13:bba589cab837

shrinkJSON.sh: minimise "jq ." output; test1.sh: fix lrand regression; [_]findPDFs.sh: extract responses with application/pdf Content-Type
author Henry S. Thompson <ht@markup.co.uk>
date Fri, 12 Oct 2018 08:51:50 +0000
parents be1034183e03
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
12
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
1 #!/bin/bash
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
2 # Test script to split CC WAT files across threads
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
3 # to find application/pdf responses:
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
4 # Usage: [echo file file_id] | findPDFs id home [-t] numWorkerProcesses   (NB: the main loop reads its "file file_id" lines from ifile.txt, not directly from stdin)
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
5 # If -t, no random wait before starting: sleep for id seconds instead of a random 1-60 seconds
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
6 # remove >>errs once tested
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
7 #set -e -o pipefail
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
8 echo $$ > test1.pid
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
9 proc=$1
13
bba589cab837 shrinkJSON.sh: minimise "jq ." output
Henry S. Thompson <ht@markup.co.uk>
parents: 12
diff changeset
10 res=res$proc
12
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
11 home=$2
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
12 shift 2
13
bba589cab837 shrinkJSON.sh: minimise "jq ." output
Henry S. Thompson <ht@markup.co.uk>
parents: 12
diff changeset
13 function lrand {
bba589cab837 shrinkJSON.sh: minimise "jq ." output
Henry S. Thompson <ht@markup.co.uk>
parents: 12
diff changeset
14 # cheap bad little random number generator: prints 1 + (one random byte from openssl, mod $1)
bba589cab837 shrinkJSON.sh: minimise "jq ." output
Henry S. Thompson <ht@markup.co.uk>
parents: 12
diff changeset
15 echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1)))
bba589cab837 shrinkJSON.sh: minimise "jq ." output
Henry S. Thompson <ht@markup.co.uk>
parents: 12
diff changeset
16 }
12
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
17 if [ "$1" = "-t" ]
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
18 then
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
19 shift
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
20 pause=$proc
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
21 else
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
22 pause=$(lrand 60)
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
23 fi
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
24 wp=$1
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
25 touch .running
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
26 function tryRead {
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
27 m=0
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
28 set -o pipefail
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
29 until if [ $((m+=1)) -gt 5 ]; then echo "tried to read $1 5 times w/o success, giving up" 1>&2; return 1; fi && \
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
30 curl -s --insecure -o - "$1"| zcat | fgrep msgtype=response | \
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
31 { egrep -i '"WARC-Target-URI":"https?:' || : ;} > $2
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
32 do
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
33 # try to avoid lockstep retries: sleep a random 1-10 seconds between attempts
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
34 echo ${PIPESTATUS[@]} 1>&2
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
35 sleep $(lrand 10)
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
36 echo $(date) $2 retry number $m 1>&2
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
37 done
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
38 set +o pipefail
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
39 }
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
40 trap "{
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
41 #set -e -o pipefail
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
42 ln -s ../nohup.cc .
13
bba589cab837 shrinkJSON.sh: minimise "jq ." output
Henry S. Thompson <ht@markup.co.uk>
parents: 12
diff changeset
43 tar -czhf - CC* $res | \
12
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
44 ssh -o StrictHostKeyChecking=no -q $home \"{ cd data
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
45 mkdir -p pdf/wat
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
46 cd pdf/wat
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
47 tar -xzf - ; } 2>>errs\"
13
bba589cab837 shrinkJSON.sh: minimise "jq ." output
Henry S. Thompson <ht@markup.co.uk>
parents: 12
diff changeset
48 rm -rf $res CC* ifile.txt *.pid
12
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
49 ( sleep 5 ; rm nohup.cc ) &
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
50 }" EXIT
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
51 mkdir -p $res
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
52 log=$res/log
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
53 # Don't all start at once
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
54 sleep $pause
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
55 echo \# $(date) > $log
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
56 pRes=0
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
57 while read s id
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
58 do
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
59 url="https://commoncrawl.s3.amazonaws.com/$s"
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
60 ccm=${s##*/wat/}
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
61 cci=${ccm%%-ip*}
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
62 export ID=$id
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
63 echo \# $(date) "running |$proc|$home|$pause|$wp|$id|" >> $log
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
64 # Experimental retry loop: tryRead gives up after 5 failed attempts
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
65 tryRead "$url" crawl$id
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
66 if [ -s crawl$id ]
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
67 then
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
68 echo \# $(date) $id $(wc -l crawl$id) >> $log
13
bba589cab837 shrinkJSON.sh: minimise "jq ." output
Henry S. Thompson <ht@markup.co.uk>
parents: 12
diff changeset
69 parallel --round-robin --pipe --block-size 2M -j $wp "_findPDFs.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
12
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
70 else
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
71 echo "crawl$id empty" 1>&2
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
72 fi
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
73 rm crawl$id
13
bba589cab837 shrinkJSON.sh: minimise "jq ." output
Henry S. Thompson <ht@markup.co.uk>
parents: 12
diff changeset
74 cat $res/$cci.* > $cci
12
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
75 done < ifile.txt 2>> $res/errs || pRes=$?
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
76 echo \# $(date) main loop exit code=$pRes >> $log
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
77 rm .running
be1034183e03 scan WAT files for application/pdf responses
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
78