view workers/bin/_timedWhich.py @ 46:7a4e49689935

finally got logging sorted
author Henry S. Thompson <ht@markup.co.uk>
date Mon, 03 Dec 2018 21:10:02 +0000
parents 21152d241e1a
children
line wrap: on
line source

#!/usr/bin/env python3
import re,sys,io

# Summarise the Last-Modified values of response records read from stdin,
# broken down by the URI scheme of the record's WARC-Target-URI.
#
# Output (stdout), one tab-separated line per bucket:
#   <scheme> TAB TAB <count>            responses with no Last-Modified at all
#   <scheme> TAB <residue> TAB <count>  responses whose cleaned value is <residue>
# A one-line summary of lines read / responses seen goes to stderr.

# A response record: captures the scheme of the target URI.
p1 = re.compile(r'"WARC-Target-URI":"(\w*):.*msgtype=response')
# The Last-Modified value, searched for only after the end of the p1 match.
p2 = re.compile(r'"Last-Modified":"([^"]*)"')
# Component separator: runs of comma/space/tab, optionally preceded by a dot.
sep = re.compile(r'\.?[, \t]+')
# Components that are discarded from the residue: day names, GMT (with
# optional offset), am/pm, hh:mm:ss-style times, '{ts', numeric zone
# offsets, one- or two-digit numbers and a bare colon.
losers = re.compile(r'(mon|fri|sun)(day)?|tue(sday)?|wed(nesday)?|thu(rsday)?|sat(urday)?|gmt([+-][\d:]+)?|[ap]m|\d\d?:\d\d:(\d\d(\.\d*)?\w*|rd)|\{ts|[-+]\d\d\d\d|\d\d?|:', re.I)
# A trailing component that is dropped outright when it is the last one.
oddlast = re.compile(r'\d\w+[A-Z]{3,4}|[A-Z]\w+/[A-Z]\w+')

HTTP = 0
HTTPS = 1


def clean_last_modified(lm):
    """Return the residue of a raw Last-Modified value *lm*.

    The value is split on ``sep``; components matching ``losers`` are
    dropped and the survivors are re-joined with single spaces.  A value
    consisting of one component starting with ``serve-proxy-cache:`` is
    collapsed to exactly that prefix.
    """
    lmc = sep.split(lm)
    if len(lmc) == 1 and lmc[0].startswith('serve-proxy-cache:'):
        return 'serve-proxy-cache:'
    if len(lmc) > 14 and lmc[-2].endswith(')'):
        # e.g. Sun, 23 Apr 2017 11:10(02017Sun, 23 Apr 2017 11:10:29 +0300Sun, 23 Apr 2017 11:10:29 +030017) GMT
        # Keep only the leading components, then filter them like any
        # other value (previously this branch never produced a residue, so
        # the record was counted under a stale key or raised NameError).
        lmc = lmc[:-12]
    elif oddlast.fullmatch(lmc[-1]):
        lmc.pop()
    return ' '.join(c for c in lmc if not losers.fullmatch(c))


def tabulate(lines):
    """Count Last-Modified residues per URI scheme over *lines*.

    Returns ``(sn, tab, nd, i, j)`` where ``sn`` maps lower-cased scheme
    name to an index, ``tab[index]`` maps residue to count, ``nd[index]``
    is the number of responses without a Last-Modified value, ``i`` is the
    number of lines read and ``j`` the number of lines matching ``p1``.
    """
    sn = {'http': HTTP, 'https': HTTPS}
    tab = [{}, {}]
    nd = [0, 0]  # no date
    i = j = 0
    for l in lines:
        i += 1
        m = p1.search(l)
        if not m:
            continue
        j += 1
        scheme = m.group(1).lower()
        k = sn.get(scheme)
        if k is None:
            # Next free slot: must equal the position the appends below
            # create (len(sn)+1 pointed one past the end of tab/nd).
            k = len(sn)
            sn[scheme] = k
            tab.append({})
            nd.append(0)
        m = p2.search(l, m.end())
        if m is None:
            nd[k] += 1
        else:
            t = tab[k]
            r = clean_last_modified(m.group(1))
            t[r] = t.get(r, 0) + 1
    return sn, tab, nd, i, j


def format_report(sn, tab, nd):
    """Return the stdout report lines for the tables built by tabulate()."""
    out = []
    for l, h in sn.items():
        if nd[h] > 0:
            out.append("%s\t\t%s" % (l, nd[h]))
        for (k, v) in tab[h].items():
            out.append("%s\t%s\t%s" % (l, k, v))
    return out


def main():
    """Read records from stdin, print the report and the stderr summary."""
    # latin1 maps every byte to a character, so decoding cannot fail.
    uin = io.TextIOWrapper(sys.stdin.buffer, encoding='latin1')
    sn, tab, nd, i, j = tabulate(uin)
    for line in format_report(sn, tab, nd):
        print(line)
    print("# %s lines, %s responses" % (i, j), file=sys.stderr)


if __name__ == '__main__':
    main()