changeset 40:4cf6bc21f683

start work on python version of tW.sh
author Henry S. Thompson <ht@markup.co.uk>
date Fri, 30 Nov 2018 13:43:36 +0000
parents bb09db2afe6b
children 3313edbab3b0
files workers/bin/_timedWhich.py
diffstat 1 files changed, 18 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/workers/bin/_timedWhich.py	Fri Nov 30 13:43:36 2018 +0000
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+import re,sys,io
+# Per URI scheme, count stdin lines matching p1, split by whether p2 matches after the p1 match.
+uin=io.TextIOWrapper(sys.stdin.buffer,encoding='latin1')  # re-wrap raw stdin bytes, decoded as latin1
+p1=re.compile('"WARC-Target-URI":"(https?):.*msgtype=response')  # group 1 is the scheme: http or https
+p2=re.compile('"Last-Modified":"([^"]*)"')  # group 1 (the quoted value) is captured but not used below
+w={}  # scheme -> number of p1-matching lines on which p2 also matched
+wo={}  # scheme -> number of p1-matching lines on which p2 did not match
+for l in uin:
+  m=p1.search(l)
+  if m:  # lines that do not match p1 are ignored entirely
+    k=m.group(1)
+    m=p2.search(l,m.end())  # start at the end of the p1 match, so only later text is searched
+    if m is None:
+      wo[k]=wo.get(k,0)+1
+    else:
+      w[k]=w.get(k,0)+1
+print("with %s\nw/o %s"%(w,wo))  # two output lines: the 'with' dict, then the 'w/o' dict