changeset 46:7a4e49689935

finally got logging sorted
author Henry S. Thompson <ht@markup.co.uk>
date Mon, 03 Dec 2018 21:10:02 +0000
parents 21152d241e1a
children 2a0dab424418
files workers/bin/_timedWhich.py workers/bin/ptimedWhich.sh
diffstat 2 files changed, 106 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/workers/bin/_timedWhich.py	Sat Dec 01 16:25:04 2018 +0000
+++ b/workers/bin/_timedWhich.py	Mon Dec 03 21:10:02 2018 +0000
@@ -2,7 +2,7 @@
 import re,sys,io
 
 uin=io.TextIOWrapper(sys.stdin.buffer,encoding='latin1')
-p1=re.compile('"WARC-Target-URI":"(https?):.*msgtype=response')
+p1=re.compile('"WARC-Target-URI":"(\w*):.*msgtype=response')
 p2=re.compile('"Last-Modified":"([^"]*)"')
 sep=re.compile('\.?[, \t]+')
 losers=re.compile('(mon|fri|sun)(day)?|tue(sday)?|wed(nesday)?|thu(rsday)?|sat(urday)?|gmt([+-][\d:]+)?|[ap]m|\d\d?:\d\d:(\d\d(\.\d*)?\w*|rd)|\{ts|[-+]\d\d\d\d|\d\d?|:',re.I)
@@ -11,11 +11,27 @@
 HTTPS=1
 tab=[{},{}]
 nd=[0,0] # no date
-sn=['http','https']
+sn={'http':HTTP,'https':HTTPS}
+i=j=0
 for l in uin:
+  i+=1
   m=p1.search(l)
   if m:
-    k=HTTP if m.group(1)=='http' else HTTPS
+    j+=1
+    scheme=m.group(1)
+    if scheme=='http':
+      k=HTTP
+    elif scheme=='https':
+      k=HTTPS
+    else:
+      scheme=scheme.lower()
+      try:
+        k=sn[scheme]
+      except KeyError:
+        k=len(sn)+1
+        sn[scheme]=k
+        tab.append(dict())
+        nd.append(0)
     m=p2.search(l,m.end())
     if m is None:
       nd[k]+=1
@@ -33,9 +49,17 @@
           lmc.pop()
         r=' '.join(c for c in lmc if not losers.fullmatch(c))
       t[r]=t.get(r,0)+1
-for h in (HTTP,HTTPS):
-  print("%s\t\t%s"%(sn[h],nd[h]))
+for l,h in sn.items():
+  if nd[h]>0:
+    print("%s\t\t%s"%(l,nd[h]))
   for (k,v) in tab[h].items():
-    print("%s\t%s\t%s"%(sn[h],k,v))
+    print("%s\t%s\t%s"%(l,k,v))
+print("# %s lines, %s responses"%(i,j),file=sys.stderr)
 
 
+
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/workers/bin/ptimedWhich.sh	Mon Dec 03 21:10:02 2018 +0000
@@ -0,0 +1,76 @@
+#!/bin/bash
+# Test script to split CC WAT files across threads
+#   to tabulate http vs. https by last-modified date:
+# Usage: [echo file file_id] | ptimedWhich.sh id home [-t] numWorkerProcesses
+#   If -t, no random wait, just id seconds
+# remove >>errs once tested
+#set -e -o pipefail
+echo $$ > test1.pid
+proc=$1
+res=/var/data/res$proc
+home=$2
+shift 2
+function lrand {
+# cheap bad little random number generator
+echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1)))
+}
+if [ "$1" = "-t" ]
+then
+ shift
+ pause=$proc
+else
+ pause=$(lrand 60)
+fi
+wp=$1
+touch .running
+function tryread {
+n=$1 # NOTE(review): n is not referenced again inside this function
+while read u o # each stdin line: URL to fetch, then output path
+do
+  m=0 # attempt counter for this URL
+  set -o pipefail # a failure in any stage (curl, zcat, _timedWhich.py) fails the whole pipeline
+  until if [ $((m+=1)) -gt 5 ]; then echo " tried 5 times w/o success, giving up" 1>&2; return 1; fi && echo -n \# $(date) "reading $u ..." 1>&2 && \
+   curl -s -S --max-time 60 --insecure -o - "$u" | \
+        { echo "done at " $(date) 1>&2 ; zcat ; } |\
+      _timedWhich.py > "$o" # tabulation for this URL is written to its output path
+   do
+    # try to avoid lockstep retries
+    echo \# ${PIPESTATUS[@]} 1>&2
+    sleep $(lrand 10)
+    echo \# $(date) retry number $m 1>&2
+  done
+  set +o pipefail # not reached on the give-up path: return 1 leaves the function, skipping any remaining input lines
+done
+}
+trap "{ 
+  set -e -o pipefail
+  cd /var/data
+  tar -czhf - CC* res* | \
+   ssh -o StrictHostKeyChecking=no -q $home \"{ cd data
+                    mkdir -p which
+                    cd which
+                    tar -xzf - ; } 2>>errs\"
+  rm -rf res* CC*
+  cd
+  rm ifile.txt *.pid
+  ( sleep 5 ; rm nohup.cc ) &
+  }" EXIT
+mkdir -p $res
+log=$res/log
+# Don't all start at once
+sleep $pause
+echo \# $(date) >  $log
+pRes=0 # stays 0 unless the pipeline below fails
+N=$(wc -l< ifile.txt) # number of input lines, used to size the per-worker chunks
+export -f tryread lrand # make both functions visible to the bash -c children started by parallel
+while read s
+do
+ url="https://commoncrawl.s3.amazonaws.com/$s"
+ cci=$(echo $s | tr '/-' ' ' | awk '{print $3,$4,$8,$13}' |tr ' ' \-) # id built from fields 3,4,8,13 of the path split on / and -, rejoined with -
+ echo $url /var/data/$cci
+done < ifile.txt 2>> $res/errs | \
+ parallel --pipe -N$((N / wp)) -j $wp "bash -c \"tryread 2>>$res/errs{#}\"" 2>>$res/errs || pRes=$? # NOTE(review): N / wp is integer division, 0 when ifile.txt has fewer lines than wp - confirm how parallel treats -N0
+echo \# $(date) main loop exit code=$pRes >> $log
+rm .running
+
+