Mercurial > hg > cc > azure
view master/src/wecu/sac_schemes.py @ 62:892e1c0240e1
added more robust (I hope) error handling,
got reducer working with support for choosing dict or tsv output
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 02 Jun 2020 17:35:07 +0000 |
parents | cfaf5223b071 |
children | d46c8b12fc04 |
line wrap: on
line source
#!/usr/bin/python3
'''Tally the URI schemes that occur in JSON metadata records read from stdin.

Each input line that starts with '{' and contains "WARC-Type":"response" is
parsed as JSON.  The object found by following META_PATH is then searched,
region by region (see PATHS), for string values that start with a URI
scheme.  One tab-separated line is written to stdout per distinct
(region, path, key, scheme) combination:

    region[.key[.key...]] <TAB> key <TAB> scheme <TAB> count

The number of response records parsed is written to stderr.

Assumes export PYTHONIOENCODING=utf-8 has been done if necessary
'''

import json
import sys
from collections.abc import Iterable

try:
    import regex
except ImportError:
    # Both patterns below use only syntax that the standard library
    # understands, so fall back to it when the third-party package is absent.
    import re as regex

# Keys leading from the top of a record to the per-response metadata.
META_PATH = ['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata']

# Region name -> keys leading from the response metadata to that region.
PATHS = {'hdr': ['Headers'],
         'head': ['HTML-Metadata', 'Head'],
         'body': ['HTML-Metadata', 'Links']}

# A leading scheme name (optionally preceded by '<') up to the first ':'.
SCHEME = regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
# For urn: values the namespace identifier is kept as part of the scheme.
URN = regex.compile('(<?urn:[a-z][a-z0-9+.-]*):', regex.I)

# Iterable types that are nevertheless leaves, never containers to descend.
_TEXT = (str, bytes)

# Module-level so they can be inspected after a run (for debugging).
n = 0                             # number of response records parsed
res = {r: {} for r in PATHS}      # region -> path -> key -> scheme -> count


def walk(o, f, r, path=None):
    '''Apply f to every key+leaf of a json object in region r

    f is called as f(value, key, path, r) for each dict entry whose value is
    not itself a dict or a list of dicts.  path is None at the top level,
    otherwise a linked (parent_path, key) pair naming the enclosing dicts.
    '''
    if isinstance(o, dict):
        for k, v in o.items():
            here = (path, k)
            if isinstance(v, dict):
                walk(v, f, r, here)
            elif isinstance(v, Iterable) and not isinstance(v, _TEXT):
                walked = False
                for idx, item in enumerate(v):
                    if isinstance(item, dict):
                        if not walked and idx:
                            # first dict is preceded by non-dict members
                            print('oops', r, path, k, item, file=sys.stderr)
                        walked = True
                        walk(item, f, r, here)
                    elif walked:
                        # non-dict member following a dict member
                        print('oops2', r, path, k, item, file=sys.stderr)
                if not walked:
                    # no dicts inside: the whole value counts as one leaf
                    f(v, k, path, r)
            else:
                f(v, k, path, r)
    elif isinstance(o, Iterable) and not isinstance(o, _TEXT):
        # Strings are excluded above: iterating one yields one-character
        # strings, each of which is iterable again, so descent never ends.
        for item in o:
            walk(item, f, r, path)


def pp(v, k, p, r):
    '''Count the scheme of v, if it is a string that starts with one.

    The count is kept in res[r][p][k][scheme]; non-string values and strings
    without a leading scheme are ignored.
    '''
    if not isinstance(v, str):
        return
    found = SCHEME.match(v)
    if found is None:
        return
    urn = URN.match(v)
    if urn is not None:
        found = urn
    scheme = found.group(1)
    counts = res[r].setdefault(p, {}).setdefault(k, {})
    counts[scheme] = counts.get(scheme, 0) + 1


def _path_keys(p):
    '''Return the keys of linked path p as a list, outermost first.'''
    keys = []
    while p is not None:
        p, key = p
        keys.append(key)
    keys.reverse()
    return keys


def _descend(node, keys):
    '''Index node by each of keys in turn and return what is reached.'''
    for key in keys:
        node = node[key]
    return node


def _report():
    '''Write the contents of res to stdout as tab-separated lines.'''
    for region, by_path in res.items():
        for path, by_key in by_path.items():
            label = region + ''.join('.' + str(seg)
                                     for seg in _path_keys(path))
            for key, by_scheme in by_key.items():
                for scheme, count in by_scheme.items():
                    print(label, key, scheme, count, sep='\t')


def main():
    '''Read records from stdin, tally schemes, then print the tallies.

    Lines that cannot be parsed, and records without the expected response
    metadata, are reported on stderr and skipped rather than ending the run.
    '''
    global n  # for debugging
    n = 0
    # Reset in place so that existing references to res stay valid.
    res.clear()
    res.update((r, {}) for r in PATHS)
    for line in sys.stdin:
        if not (line.startswith('{') and '"WARC-Type":"response"' in line):
            continue
        try:
            record = json.loads(line)
        except ValueError as e:
            print('bad json', e, file=sys.stderr)
            continue
        n += 1
        try:
            meta = _descend(record, META_PATH)
        except (KeyError, TypeError) as e:
            print('no response metadata', repr(e), file=sys.stderr)
            continue
        for region, keys in PATHS.items():
            try:
                target = _descend(meta, keys)
            except (KeyError, TypeError):
                continue  # this record has nothing for that region
            walk(target, pp, region)

    print(n, file=sys.stderr)
    _report()


def qq(p):
    '''Print linked path p as dot-separated keys followed by a tab.'''
    if p is None:
        sys.stdout.write('\t')
    else:
        print('.'.join(str(key) for key in _path_keys(p)), end='\t')


def qq1(p):
    '''Print every key of linked path p, each followed by a dot.'''
    for key in _path_keys(p):
        print(key, end='.')


if __name__ == "__main__":
    main()