Mercurial > hg > cc > azure
comparison workers/bin/findPDFs.sh @ 12:be1034183e03
scan WAT files for application/pdf responses
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Wed, 10 Oct 2018 11:28:21 +0000 |
parents | |
children | bba589cab837 |
comparison
equal
deleted
inserted
replaced
11:36b5d379909a | 12:be1034183e03 |
---|---|
#!/bin/bash
# Test script to split CC WAT files across threads
#  and find application/pdf responses.
# Usage: findPDFs id home [-t] numWorkerProcesses
#   id                  identifier of this worker; with -t it is also the
#                       start-up delay in seconds
#   home                ssh destination the results are shipped to on exit
#   -t                  no random wait, just id seconds
#   numWorkerProcesses  job count handed to parallel -j
# Input: lines of the form "file file_id".
# NOTE(review): the usage line used to show these lines arriving on stdin
#  ([echo file file_id] | findPDFs ...), but the main loop reads ifile.txt --
#  confirm which is intended.
# remove >>errs once tested
#set -e -o pipefail
echo $$ > test1.pid
proc=$1
res=res
home=$2
shift 2

# lrand N: print a pseudo-random integer in the range 1..N.
# Cheap bad little random number generator: one byte from openssl (0..255)
#  reduced modulo N.  Falls back to bash's $RANDOM if openssl yields nothing.
# Must be defined before the start-up delay below is computed: bash resolves
#  function names at call time, so a call placed above the definition fails
#  with "command not found" and leaves the delay empty.
function lrand {
  local byte
  byte=$(openssl rand 1 2>/dev/null | od -An -tu1 | tr -d ' \n')
  [ -n "$byte" ] || byte=$RANDOM
  echo $(( 1 + byte % $1 ))
}

if [ "$1" = "-t" ]
then
  shift
  # Deterministic start-up delay for testing: id seconds
  pause=$proc
else
  # Random start-up delay of 1..60 seconds
  pause=$(lrand 60)
fi
wp=$1
# Marker file; removed again when the main loop has finished
touch .running
# tryRead URL OUTFILE
# Fetch the gzipped file at URL, keep only the lines containing
#  msgtype=response, and of those only the ones whose WARC-Target-URI is an
#  http/https URI; write the result to OUTFILE.
# pipefail is switched on for the duration, so a failure in curl or zcat, or
#  grep -F finding no response lines at all, counts as a failed attempt.
#  (The last grep is allowed to match nothing.)
# Up to 5 attempts are made, with a random 1..10 second sleep after each
#  failure.  Diagnostics go to stderr.
# Returns 0 on success, 1 after 5 failed attempts.  pipefail is switched off
#  again on both paths.
# Uses the lrand function defined elsewhere in this script.
function tryRead {
  local url=$1 out=$2
  local m=0 rc=0
  set -o pipefail
  while :
  do
    if [ $((m+=1)) -gt 5 ]
    then
      echo "tried to read $url 5 times w/o success, giving up" 1>&2
      rc=1
      break
    fi
    if curl -s --insecure -o - "$url" | zcat | grep -F msgtype=response | \
        { grep -E -i '"WARC-Target-URI":"https?:' || : ;} > "$out"
    then
      break
    else
      # Per-stage exit codes of the pipeline that just failed
      echo "${PIPESTATUS[@]}" 1>&2
    fi
    # try to avoid lockstep retries
    sleep "$(lrand 10)"
    echo "$(date) $out retry number $m" 1>&2
  done
  # Reached on success and on giving up, so pipefail never leaks to the caller
  set +o pipefail
  return $rc
}
# Exit handler: ship everything collected under $res to $home, then tidy up.
#  - links ../nohup.cc into the results directory so that it is archived too
#    (tar -h follows the link)
#  - streams a gzipped tar of the results over ssh and unpacks it on the
#    remote side in data/pdf/wat, appending remote stderr to errs there
#  - changes to the home directory, removes the results directory and, after
#    a 5 second delay, nohup.cc
# Does nothing if $res is empty or cannot be entered, so that a wrong
#  directory is never archived or removed.
# NOTE(review): the bare cd assumes the script was started from the home
#  directory -- confirm.
function shipResults {
  [ -n "$res" ] || return
  cd "$res" || return
  ln -s ../nohup.cc .
  tar -czhf - * | \
    ssh -o StrictHostKeyChecking=no -q "$home" "{ cd data
mkdir -p pdf/wat
cd pdf/wat
tar -xzf - ; } 2>>errs"
  cd
  rm -rf "$res"
  ( sleep 5 ; rm nohup.cc ) &
}
trap shipResults EXIT
# Main loop.  For every "file file_id" line in ifile.txt: fetch the file with
#  tryRead into crawl<id>, split its lines round-robin over $wp parallel
#  _findPDFs.sh jobs, and append their output to $res/<name>, where <name> is
#  the part of the path after /wat/ up to the first "-ip".  Progress is
#  written to $res/log and stderr of the whole loop is appended to $res/errs.
mkdir -p "$res"
log=$res/log
# Don't all start at once (an empty pause means no delay rather than an error)
sleep "${pause:-0}"
# The echo arguments below are left unquoted on purpose: word splitting
#  collapses any padding in the output of date and wc.
echo \# $(date) > "$log"
pRes=0
while read -r s id
do
  url="https://commoncrawl.s3.amazonaws.com/$s"
  # Strip everything up to and including /wat/, then everything from -ip on
  ccm=${s##*/wat/}
  cci=${ccm%%-ip*}
  # Exported so that the child processes can see it
  export ID=$id
  echo \# $(date) "running |$proc|$home|$pause|$wp|$id|" >> "$log"
  # Experimental retry loop
  tryRead "$url" "crawl$id"
  if [ -s "crawl$id" ]
  then
    echo \# $(date) $id $(wc -l "crawl$id") >> "$log"
    # {#} is parallel's job sequence number: one errs file per job
    parallel --round-robin --pipe -j "$wp" "_findPDFs.sh {#} >> $res/$cci 2>>$res/errs{#}" < "crawl$id" || echo "ppfailed $? ${PIPESTATUS[*]}" 1>&2
  else
    echo "crawl$id empty" 1>&2
  fi
  rm "crawl$id"
done < ifile.txt 2>> "$res/errs" || pRes=$?
echo \# $(date) main loop exit code=$pRes >> "$log"
rm .running
79 |