comparison workers/bin/findPDFs.sh @ 12:be1034183e03

scan WAT files for application/pdf responses
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 10 Oct 2018 11:28:21 +0000
parents
children bba589cab837
comparison
equal deleted inserted replaced
11:36b5d379909a 12:be1034183e03
#!/bin/bash
# Test script to split CC WAT files across threads
# to find application/pdf responses:
# Usage: [echo file file_id] | findPDFs id home [-t] numWorkerProcesses
#   If -t, no random wait, just id seconds
# remove >>errs once tested
#set -e -o pipefail

# Print a pseudo-random integer in the range 1..$1.
# Cheap, bad little random number generator: one byte from openssl,
# reduced modulo $1.  Falls back to bash's $RANDOM if openssl yields
# nothing, so callers never receive an empty value.
# Must be defined before the start-up code below, which calls it.
function lrand {
  local byte
  byte=$(openssl rand 1 2>/dev/null | od -An -tu1 | tr -d ' \n')
  echo $(( 1 + (${byte:-$RANDOM} % $1) ))
}

# Record our pid so this worker can be found (and signalled) later
echo $$ > test1.pid
proc=$1
res=res
home=$2
shift 2
if [ "$1" = "-t" ]
then
  # Test mode: deterministic start-up delay of <id> seconds
  shift
  pause=$proc
else
  pause=$(lrand 60)
fi
wp=$1
# Marker file, removed again when the main loop has finished
touch .running
# Fetch the gzipped file at URL $1 and write to file $2 those lines that
# contain msgtype=response and an http(s) "WARC-Target-URI".
# The download/filter pipeline is attempted up to 5 times, with a short
# random sleep after each failure.
# Returns 0 on success, 1 after the 5th failure.
# pipefail is on only while the function runs; it is switched off again
# on both exit paths.
function tryRead {
  local m=0
  set -o pipefail
  until if [ $((m+=1)) -gt 5 ]
        then
          echo "tried to read $1 5 times w/o success, giving up" 1>&2
          set +o pipefail
          return 1
        fi && \
        curl -s --insecure -o - "$1" | zcat | grep -F msgtype=response | \
          { grep -E -i '"WARC-Target-URI":"https?:' || : ;} > "$2"
        # the "|| :" above keeps "no matching URI" from counting as a failure
  do
    # per-stage exit codes of the failed pipeline
    echo "${PIPESTATUS[@]}" 1>&2
    # try to avoid lockstep retries
    sleep "$(lrand 10)"
    # $(date) left unquoted so the message format stays as it was
    echo $(date) "$2" retry number $m 1>&2
  done
  set +o pipefail
}
# EXIT handler: pack everything in the results directory ($res), plus
# nohup.cc via a symlink that tar -h follows, and unpack it under
# data/pdf/wat on $home over ssh; then delete the local results.
# Nothing is shipped or deleted unless we really are inside $res, and
# the remote side only unpacks if its directory changes succeeded.
# NOTE(review): the bare "cd" followed by relative removals looks like it
# assumes the script was started in the home directory -- confirm.
function shipResults {
  #set -e -o pipefail
  if cd "$res"
  then
    ln -s ../nohup.cc .
    tar -czhf - * | \
      ssh -o StrictHostKeyChecking=no -q "$home" "{ cd data &&
        mkdir -p pdf/wat &&
        cd pdf/wat &&
        tar -xzf - ; } 2>>errs"
    cd
    rm -rf "$res"
  else
    echo "cannot cd to $res, results not shipped" 1>&2
  fi
  # nohup.cc is presumably still being written while we exit, so its
  # removal is delayed and done in the background -- TODO confirm
  ( sleep 5 ; rm nohup.cc ) &
}
trap shipResults EXIT
# Main loop: for each "<path> <id>" line of ifile.txt, download and
# pre-filter the WAT file with tryRead, then fan the surviving lines out
# to $wp parallel _findPDFs.sh workers.  Progress goes to $res/log,
# stderr of the whole loop to $res/errs.
# NOTE(review): the usage header describes the file list as piped in on
# stdin, but the loop reads ifile.txt -- confirm which is intended.
mkdir -p "$res"
log=$res/log
# Don't all start at once
sleep "$pause"
# $(date) is left unquoted throughout so the log format stays as it was
echo \# $(date) > "$log"
pRes=0
while read -r s id
do
  url="https://commoncrawl.s3.amazonaws.com/$s"
  # Output file name: the part of the path after /wat/, up to "-ip"
  ccm=${s##*/wat/}
  cci=${ccm%%-ip*}
  # Exported for the worker processes started by parallel below
  export ID=$id
  echo \# $(date) "running |$proc|$home|$pause|$wp|$id|" >> "$log"
  # Experimental retry loop
  tryRead "$url" "crawl$id"
  if [ -s "crawl$id" ]
  then
    echo \# $(date) $id $(wc -l crawl$id) >> "$log"
    # "$wp" is quoted so that an empty worker count cannot make -j
    # swallow the command template
    parallel --round-robin --pipe -j "$wp" "_findPDFs.sh {#} >> $res/$cci 2>>$res/errs{#}" < "crawl$id" || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
  else
    echo "crawl$id empty" 1>&2
  fi
  rm "crawl$id"
done < ifile.txt 2>> "$res/errs" || pRes=$?
echo \# $(date) main loop exit code=$pRes >> "$log"
rm .running
79