view workers/bin/_timedWhich.py @ 42:1d776e96c16a

works on one file
author Henry S. Thompson <ht@markup.co.uk>
date Fri, 30 Nov 2018 15:41:02 +0000
parents 4cf6bc21f683
children 1342f6669352
line wrap: on
line source

#!/usr/bin/env python3
import re,sys,io

# Tally what is left of "Last-Modified" values once their routine date/time
# parts are removed, separately for http and https target URIs.
#
# Input (stdin): text lines.  A line is counted only if it contains
#   "WARC-Target-URI":"http..." (or https) followed later by msgtype=response;
#   a "Last-Modified":"..." field is looked for only after that point.
# Output (stdout), for http and then https:
#   <scheme>\t\t<n>            n = counted lines with no Last-Modified found
#   <scheme>\t<residue>\t<n>   one line per distinct residue
# NOTE(review): an empty residue prints as "<scheme>\t\t<n>", i.e. in the same
# shape as the no-date line -- confirm downstream consumers can tell them apart.

# All patterns are raw strings: \. \d \w are not valid *string* escapes and
# draw a SyntaxWarning on recent Pythons when written in ordinary literals.

# A response line; group 1 is the URI scheme (http or https)
p1=re.compile(r'"WARC-Target-URI":"(https?):.*msgtype=response')
# The Last-Modified field; group 1 is its (possibly empty) value
p2=re.compile(r'"Last-Modified":"([^"]*)"')
# Token separator: a run of commas/spaces/tabs, optionally preceded by a '.'
sep=re.compile(r'\.?[, \t]+')
# Tokens that are dropped when they match in full (case-insensitively):
# day names (short or long), gmt, am/pm, hh:mm:ss[.frac][suffix] times,
# +hhmm/-hhmm offsets, 1- or 2-digit numbers, and a lone ':'.
# The day-name alternative accepts tues/wednes/thurs/satur stems so that
# Tuesday, Wednesday, Thursday and Saturday are dropped just like
# Monday, Friday and Sunday.
losers=re.compile(r'(mon|tues?|wed(nes)?|thu(rs?)?|fri|sat(ur)?|sun)(day)?'
                  r'|gmt|[ap]m|\d\d?:\d\d:\d\d(\.\d*)?\w*|[-+]\d\d\d\d|\d\d?|:',
                  re.I)
# A final token of the form Xxx/Yyy (presumably a zone name such as
# Europe/London -- TODO confirm against real data)
oddz=re.compile(r'[A-Z]\w+/[A-Z]\w+')
HTTP=0
HTTPS=1
sn=['http','https'] # scheme names, indexed by HTTP/HTTPS

def residue(lm):
  """Return the part of Last-Modified value lm that survives filtering.

  lm is split into tokens on sep.  A value that is a single token starting
  with 'serve-proxy-cache:' is reduced to exactly that prefix.  Otherwise a
  final token matching oddz is discarded, every token fully matching losers
  is discarded, and the remaining tokens are joined with single spaces
  (giving '' when nothing remains).
  """
  lmc=sep.split(lm)
  if len(lmc)==1 and lmc[0].startswith('serve-proxy-cache:'):
    return 'serve-proxy-cache:'
  # sep.split always yields at least one token, so lmc[-1] is safe
  if oddz.fullmatch(lmc[-1]):
    lmc.pop()
  return ' '.join(c for c in lmc if not losers.fullmatch(c))

def tally(lines):
  """Count Last-Modified residues over an iterable of text lines.

  Returns (tab, nd), both indexed by HTTP/HTTPS:
    tab[k] maps each residue string to the number of times it was seen;
    nd[k]  counts matching lines in which no Last-Modified was found.
  Lines not matching p1 are ignored.
  """
  tab=[{},{}]
  nd=[0,0] # no date
  for l in lines:
    m=p1.search(l)
    if m is None:
      continue
    k=HTTP if m.group(1)=='http' else HTTPS
    # Only look for Last-Modified after the end of the p1 match
    m=p2.search(l,m.end())
    if m is None:
      nd[k]+=1
    else:
      r=residue(m.group(1))
      t=tab[k]
      t[r]=t.get(r,0)+1
  return tab,nd

def main():
  """Read stdin, tally it and print the per-scheme tables to stdout."""
  # latin1 maps every byte to a character, so decoding cannot fail
  uin=io.TextIOWrapper(sys.stdin.buffer,encoding='latin1')
  tab,nd=tally(uin)
  for h in (HTTP,HTTPS):
    print("%s\t\t%s"%(sn[h],nd[h]))
    for (k,v) in tab[h].items():
      print("%s\t%s\t%s"%(sn[h],k,v))

if __name__=='__main__':
  main()