changeset 17:2a2c1fb03c54

first cut at http/https real trial, with month and year last-modified info too
author Henry S. Thompson <ht@markup.co.uk>
date Fri, 19 Oct 2018 11:36:31 +0000
parents c3e9ad8a42cf
children 9631fca89cc6
files workers/bin/_timedWhich.sh workers/bin/timedWhich.sh
diffstat 2 files changed, 86 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/workers/bin/_timedWhich.sh	Fri Oct 19 11:36:31 2018 +0000
@@ -0,0 +1,7 @@
+#!/bin/bash
+# stdin: WAT JSON lines; stdout: "scheme month year count" tallies of http/https by Last-Modified (no Last-Modified: key stays "http:"/"https:")
+grep -E -o '("WARC-Target-URI":"https?:|"Last-Modified":"[^"]*")' | grep -E -o '(https?:|:".*"$)' |\
+ tr '\012' \# | sed 's/:#:/ /g'|tr \# '\012' | tr -d \"|\
+ sed 's/ [[:digit:]][[:digit:]]\?:[[:digit:]][[:digit:]]:[[:digit:]][[:digit:]] / /;s/\(https\? \)\(: \)\?[MTWFSa-z]..\.\?, \?/\1/;s/ \([-+][[:digit:]]\{4\}\|[[:upper:]]\{2,3\}\)$//;s/ [[:digit:]]\{1,2\} / /;s/\/[[:digit:]]\{1,2\}\/\([[:digit:]]\{4\}\)$/ \1/'|\
+awk '{c[$0]+=1} END {for (k in c) {print k, c[k]}}'
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/workers/bin/timedWhich.sh	Fri Oct 19 11:36:31 2018 +0000
@@ -0,0 +1,79 @@
+#!/bin/bash
+# Worker script: download Common Crawl WAT files and fan each one out to
+#   parallel jobs that tabulate http vs. https by last-modified date.
+# Usage: [echo file file_id] | timedWhich.sh id home [-t] numWorkerProcesses
+#   If -t, no random wait, just id seconds
+#   NOTE(review): the main loop reads its "file file_id" lines from ifile.txt, not stdin.
+# TODO: remove the >>errs redirections once tested
+#set -e -o pipefail
+printf '%s\n' "$$" > test1.pid  # record this worker's PID
+proc="$1"; res="res${proc}"  # worker id, and the per-worker results directory name
+home="$2"  # ssh destination the results are shipped to on exit
+shift 2
+function lrand {
+# Cheap, crude random number: prints 1 + (one random byte from openssl, as printed by od) modulo $1
+local byte=$(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' '); echo $(( 1 + ($byte % $1) ))
+}
+# Startup delay in seconds: exactly $proc when -t is given, otherwise random 1..60
+case "$1" in
+ -t) shift; pause=$proc ;;
+ *) pause=$(lrand 60) ;;
+esac
+# Remaining argument: number of parallel worker processes
+wp=$1
+# Marker file; removed again once the main loop has finished
+touch .running
+function tryRead {
+# Download gzipped WAT $1 into file $2, keeping msgtype=response lines with an http(s) WARC-Target-URI; up to 5 tries, returns 1 (pipefail off) on giving up
+local m=0  # attempt counter
+set -o pipefail  # a failure in any pipeline stage fails the whole attempt
+until if [ $((m+=1)) -gt 5 ]; then echo "tried to read $1 5 times w/o success, giving up" 1>&2; set +o pipefail; return 1; fi && \
+  curl -s --insecure -o - "$1"| zcat | grep -F msgtype=response | \
+{ grep -E -i '"WARC-Target-URI":"https?:' || : ;} > "$2"  # "|| :" so that zero matches is not treated as a failure
+ do
+  echo ${PIPESTATUS[@]} 1>&2  # per-stage exit codes of the failed attempt
+  sleep $(lrand 10)  # random pause, to try to avoid lockstep retries
+  echo $(date) $2 retry number $m 1>&2
+done
+set +o pipefail
+}
+trap "{ 
+  #set -e -o pipefail
+  ln -s ../nohup.cc .
+  tar -czhf - CC* $res | \
+   ssh -o StrictHostKeyChecking=no -q $home \"{ cd data
+                    mkdir -p pdf/wat
+                    cd pdf/wat
+                    tar -xzf - ; } 2>>errs\"
+  rm -rf $res CC* ifile.txt *.pid
+  ( sleep 5 ; rm nohup.cc ) &
+  }" EXIT # on exit: tar CC* and $res over ssh into data/pdf/wat on $home, then delete local results, ifile.txt and pid files; $res and $home are expanded here, when the trap is set
+mkdir -p $res
+log=$res/log
+# Stagger start-up so that the workers don't all start at once
+sleep $pause
+echo \# $(date) >  $log
+pRes=0
+while read s id  # one "path id" pair per line of ifile.txt
+do
+ url="https://commoncrawl.s3.amazonaws.com/$s"
+ ccm=${s##*/wat/}  # what follows the last "/wat/" in the path
+ cci=${ccm%%-ip*}  # ... cut off at the first "-ip"
+ export ID=$id  # exported for child processes
+ echo \# $(date) "running |$proc|$home|$pause|$wp|$id|" >> $log
+ # Download and pre-filter into crawl$id (tryRead is the experimental retry loop)
+ tryRead "$url" crawl$id
+ if [ -s crawl$id ]  # got some data
+ then
+  echo \# $(date) $id $(wc -l crawl$id) >> $log
+  parallel --round-robin --pipe --block-size 2M -j $wp "_timedWhich.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
+ else
+  echo "crawl$id empty" 1>&2
+ fi
+ rm crawl$id
+ cat $res/$cci.* > $cci  # merge the per-job tallies into one file named $cci
+done < ifile.txt 2>> $res/errs || pRes=$?  # stderr of the whole loop is appended to $res/errs
+echo \# $(date) main loop exit code=$pRes >> $log
+rm .running
+
+