view master/src/wecu/sac_schemes.py @ 62:892e1c0240e1

added more robust (I hope) error handling, got reducer working with support for choosing dict or tsv output
author Henry S. Thompson <ht@markup.co.uk>
date Tue, 02 Jun 2020 17:35:07 +0000
parents cfaf5223b071
children d46c8b12fc04
line wrap: on
line source

#!/usr/bin/python3
'''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary'''

import sys, json, regex
from collections.abc import Iterable

# Keys followed, in order, from a decoded input record down to the
#  dict that holds the regions named in PATHS
META_PATH=[
  'Envelope',
  'Payload-Metadata',
  'HTTP-Response-Metadata',
]

# Region name -> keys followed, in order, from the META_PATH dict to
#  the part of the record that is walked for that region
PATHS=dict(
  hdr=['Headers'],
  head=['HTML-Metadata','Head'],
  body=['HTML-Metadata','Links'],
)

# Group 1 is a leading scheme name (optionally preceded by '<'), which
#  must be followed by ':'
SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
# Case-insensitive variant for urn: values, where group 1 also takes in
#  the name that follows 'urn:'
URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.IGNORECASE)

def walk(o,f,r,path=None):
  '''Apply f to every key+leaf of a json object in region r

  o is a decoded JSON value.  For each entry (k,v) of a dict reached:
   - if v is a dict, it is walked with k added to the path;
   - if v is a non-string iterable containing dicts, each of those dicts
     is walked with k added to the path (non-dict members are skipped,
     with a warning on stderr if dicts and non-dicts are mixed);
   - otherwise f(v,k,path,r) is called.
  path is None at the top level, otherwise a nested (parent_path,key)
  pair.  A non-dict, non-string iterable o is walked member by member
  with path unchanged; anything else is ignored.'''
  if isinstance(o,dict):
    for k,v in o.items():
      if isinstance(v,dict):
        walk(v,f,r,(path,k))
      elif isinstance(v,Iterable) and not isinstance(v,(str,bytes)):
        walked=False
        for idx,i in enumerate(v):
          if isinstance(i,dict):
            if (not walked) and idx>0:
              # First dict found, but only after some non-dict members
              print('oops',path,k,i,file=sys.stderr)
            walked=True
            walk(i,f,r,(path,k))
          elif walked:
            # Non-dict member following a dict: not passed to f
            print('oops2',path,k,i,file=sys.stderr)
        if not walked:
          # No dicts inside, so the iterable as a whole is the leaf
          f(v,k,path,r)
      else:
        # Strings, numbers, booleans and None are leaves
        f(v,k,path,r)
  elif isinstance(o,Iterable) and not isinstance(o,(str,bytes)):
    # Strings are excluded: iterating a one-character string yields
    #  that same string again, so descending would never terminate
    for i in o:
      walk(i,f,r,path)

def pp(v,k,p,r):
  '''Tally the scheme prefix of leaf value v in the global res

  Does nothing unless v is a string that SCHEME matches at its start.
  The prefix counted is group 1 of the URN match if URN also matches,
  otherwise group 1 of the SCHEME match.  Counts are kept in
  res[r][p][k][prefix], with the intermediate dicts created on demand.'''
  if not isinstance(v,str):
    return
  scheme_hit=SCHEME.match(v)
  if scheme_hit is None:
    return
  urn_hit=URN.match(v)
  # A urn: match replaces the plain scheme match
  chosen=scheme_hit if urn_hit is None else urn_hit
  prefix=chosen.group(1)
  by_key=res[r].setdefault(p,dict())
  tally=by_key.setdefault(k,dict())
  tally[prefix]=tally.get(prefix,0)+1

def _region_label(r,p):
  '''Return the first output column for region r and walk path p

  p is None or a nested (parent_path,key) pair.  The result is r
  followed by '.key' for each key from outermost to innermost, e.g.
  'hdr' for None and 'head.Metas' for (None,'Metas').'''
  parts=[]
  while p is not None:
    parts.append(str(p[1]))
    p=p[0]
  parts.append(str(r))
  parts.reverse()
  return '.'.join(parts)

def main():
  '''Count scheme prefixes in the JSON records read from stdin

  Only lines that start with '{' and contain "WARC-Type":"response" are
  decoded.  Each such record is followed down META_PATH (a record
  lacking one of those keys raises KeyError), then every region in PATHS
  that is present is walked with pp, which accumulates into the global
  res.  The number of records decoded is written to stderr.  Finally one
  tab-separated line per (region+path, key, prefix) is written to stdout
  as: region[.pathkey...], key, prefix, count.'''
  global n,res # for debugging
  n=0
  res={r:dict() for r in PATHS}
  for l in sys.stdin:
    if l[0]=='{' and '"WARC-Type":"response"' in l:
      j=json.loads(l)
      n+=1
      for s in META_PATH:
        j=j[s]
      for k,v in PATHS.items():
        p=j
        try:
          for s in v:
            p=p[s]
        except KeyError:
          # This record has nothing for region k
          continue
        walk(p,pp,k)

  print(n,file=sys.stderr)

  for r,rv in res.items():
    for p,pv in rv.items():
      # Paths of any depth are rendered; the result for depth 0 and 1 is
      #  'region' and 'region.key' respectively
      label=_region_label(r,p)
      for k,v in pv.items():
        for s,c in v.items():
          print(label,k,s,c,sep='\t')

def qq(p):
  '''Write path p to stdout followed by a tab

  p is None or a (parent_path,key) pair.  For None only the tab is
  written; otherwise the keys of parent_path are written by qq1, each
  followed by '.', and then key followed by the tab.'''
  if p is not None:
    qq1(p[0])
    print(p[1],end='\t')
    return
  sys.stdout.write('\t')

def qq1(p):
  '''Print every key of path p, outermost first, each followed by '.'

  p is None or a nested (parent_path,key) pair; nothing is printed
  for None.'''
  keys=[]
  node=p
  # Unwind the nesting innermost-first, then emit in the opposite order
  while node is not None:
    keys.append(node[1])
    node=node[0]
  for key in reversed(keys):
    print(key,end='.')

# Run main() only when this file is executed as a script, not on import
if __name__=="__main__":
  main()