changeset 42:1d776e96c16a

works on one file
author Henry S. Thompson <ht@markup.co.uk>
date Fri, 30 Nov 2018 15:41:02 +0000
parents 3313edbab3b0
children c2b72d29a3ee
files workers/bin/_timedWhich.py
diffstat 1 files changed, 26 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/workers/bin/_timedWhich.py	Fri Nov 30 13:44:50 2018 +0000
+++ b/workers/bin/_timedWhich.py	Fri Nov 30 15:41:02 2018 +0000
@@ -4,15 +4,35 @@
 uin=io.TextIOWrapper(sys.stdin.buffer,encoding='latin1')
 p1=re.compile('"WARC-Target-URI":"(https?):.*msgtype=response')
 p2=re.compile('"Last-Modified":"([^"]*)"')
-w={}
-wo={}
+sep=re.compile(r'\.?[, \t]+') # Last-Modified field separator: optional '.', then commas/spaces/tabs (raw string: no invalid-escape warning)
+losers=re.compile(r'(mon|tue|wed|thu|fri|sat|sun)(day)?|gmt|[ap]m|\d\d?:\d\d:\d\d(\.\d*)?\w*|[-+]\d\d\d\d|\d\d?|:',re.I) # fields dropped from the key: day names, gmt, am/pm, clock times, +/-NNNN offsets, 1-2 digit numbers, ':'
+oddz=re.compile(r'[A-Z]\w+/[A-Z]\w+') # trailing Word/Word field, popped before filtering (presumably an Area/City zone name -- confirm)
+HTTP=0
+HTTPS=1
+tab=[{},{}]
+nd=[0,0] # no date
+sn=['http','https']
 for l in uin:
   m=p1.search(l)
   if m:
-    k=m.group(1)
+    k=HTTP if m.group(1)=='http' else HTTPS
     m=p2.search(l,m.end())
     if m is None:
-      wo[k]=wo.get(k,0)+1
+      nd[k]+=1
     else:
-      w[k]=w.get(k,0)+1
-print("with %s\nw/o %s"%(w,wo))
+      t=tab[k]
+      lm=m.group(1)
+      lmc=sep.split(lm)
+      if len(lmc)==1 and lmc[0].startswith('serve-proxy-cache:'):
+        r='serve-proxy-cache:'
+      else:
+        if oddz.fullmatch(lmc[-1]):
+          lmc.pop()
+        r=' '.join(c for c in lmc if not losers.fullmatch(c))
+      t[r]=t.get(r,0)+1
+for h in (HTTP,HTTPS):
+  print("%s\t\t%s"%(sn[h],nd[h]))
+  for (k,v) in tab[h].items():
+    print("%s\t%s\t%s"%(sn[h],k,v))
+
+