Mercurial > hg > cc > azure
view master/src/wecu/sac_schemes.py @ 63:d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Wed, 03 Jun 2020 16:40:34 +0000 |
parents | 892e1c0240e1 |
children | b04870ab3035 |
line wrap: on
line source
#!/usr/bin/python3
'''Tabulate scheme-like prefixes ("http:", "urn:isbn:", ...) of the string
values found in JSON records read from standard input.

Assumes export PYTHONIOENCODING=utf-8 has been done if necessary

Usage: uz ...wat.gz | sac_schemes.py [-d] [altStorageScheme]

where -d pretty-prints the raw result dictionary instead of writing
tab-separated rows, and altStorageScheme if present selects an
alternative approach to storing triple counts:
  [absent]: three nested dictionaries
  1: one dictionary indexed by 4-tuple
  any other value (conventionally 2): one dictionary indexed by
     SEP.join(keys), i.e. the four keys joined with a NUL character'''

import json
import sys
from collections.abc import Iterable

try:
    import regex
except ImportError:
    # Only plain pattern syntax (character classes, groups, the I flag) is
    # used below, so the standard-library module can stand in for regex.
    import re as regex

# NB the command line is inspected at import time: -d is consumed here so
# that the storage-scheme test further down only sees the remaining args.
if len(sys.argv)>1 and sys.argv[1]=='-d':
    sys.argv.pop(1)
    dictRes=True
else:
    dictRes=False

# Keys followed, in order, from the top of each record to reach the
# object that holds the regions listed in PATHS
META_PATH=['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata']

# Region name -> keys followed from the META_PATH object to that region
PATHS={'hdr':['Headers'],
       'head':['HTML-Metadata','Head'],
       'body':['HTML-Metadata','Links']}

SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I)

EMPTY=''
SEP='\x00'
DOT='.'


def walk(o,f,r,path=None):
    '''Apply f to every key+leaf of a json object in region r

    f is called as f(value, key, path, r).  path is None for keys of the
    object walk was first called on, otherwise a nested pair
    (parent_path, parent_key).

    A list-valued key whose members are dicts is descended into, each
    member being walked with the same (path, key) extension; any other
    list is passed to f whole, as a leaf.  Lists mixing dicts with
    non-dicts are reported on stderr ("oops" when the first dict is not
    the first member, "oops2" for a non-dict following a dict).'''
    if isinstance(o,dict):
        for k,v in o.items():
            if isinstance(v,dict):
                walk(v,f,r,(path,k))
            elif isinstance(v,str) or not isinstance(v,Iterable):
                # Scalar leaf.  Strings are tested for explicitly: they are
                # Iterable, but can never contain a dict, so there is no
                # point scanning them character by character.
                f(v,k,path,r)
            else:
                walked=False
                for idx,i in enumerate(v):
                    if isinstance(i,dict):
                        if (not walked) and idx!=0:
                            print('oops',path,k,i,file=sys.stderr)
                        walked=True
                        walk(i,f,r,(path,k))
                    elif walked:
                        print('oops2',path,k,i,file=sys.stderr)
                if not walked:
                    f(v,k,path,r)
    elif isinstance(o,Iterable) and not isinstance(o,str):
        # A str must not be descended into: iterating a one-character
        # string yields that same string, so recursion would never end.
        for i in o:
            walk(i,f,r,path)


def _scheme_of(v):
    '''Return the scheme prefix of v without its trailing colon, or None
    if v is not a string or does not start with one.  For urn: values the
    prefix includes the namespace identifier, e.g. "urn:isbn".'''
    if not isinstance(v,str):
        return None
    m=SCHEME.match(v)
    if m is None:
        return None
    n=URN.match(v)
    if n is not None:
        m=n
    return m.group(1)


def _path_key(p):
    '''Reduce a walk path to a single key: None stays None, (None, k)
    becomes k.

    This assumes paths are always either length 1 or length 2!!!
    by open-coding rather than using qq(p)'''
    if p is None:
        return None
    assert p[0] is None
    return p[1]


def pp(v,k,p,r):
    '''Count the scheme of v, if any, under res[r][path][k][scheme].
    Uses nested dictionaries'''
    s=_scheme_of(v)
    if s is None:
        return
    d=res[r].setdefault(_path_key(p),dict())
    d=d.setdefault(k,dict())
    d[s]=d.get(s,0)+1


def pp_tuple(v,k,p,r):
    '''Count the scheme of v, if any, under res[(r, path, k, scheme)].
    Uses one dict and 4-tuple'''
    s=_scheme_of(v)
    if s is None:
        return
    k=(r,_path_key(p),k,s)
    res[k]=res.get(k,0)+1


def pp_concat(v,k,p,r):
    '''Count the scheme of v, if any, under res[SEP.join((r, path, k,
    scheme))], an absent path being represented by the empty string.
    Uses one dict and one string'''
    s=_scheme_of(v)
    if s is None:
        return
    p=_path_key(p)
    if p is None:
        p=EMPTY
    k=SEP.join((r,p,k,s))
    res[k]=res.get(k,0)+1


def _print_row(r,p,k,s,c):
    '''Print one result row: region (followed by "." and the path key
    unless p is None), key, scheme and count, separated by tabs.'''
    if p is None:
        print(r,end='\t')
    else:
        print(r,DOT,p,sep=EMPTY,end='\t')
    print(k,s,c,sep='\t')


def dump(res):
    '''Print the counts held in the nested-dictionary layout built by pp'''
    for r,rv in res.items():
        for p,pv in rv.items():
            for k,v in pv.items():
                for s,c in v.items():
                    _print_row(r,p,k,s,c)


def dump_tuple(res):
    '''Print the counts held in the 4-tuple layout built by pp_tuple'''
    for (r,p,k,s),c in res.items():
        _print_row(r,p,k,s,c)


def dump_concat(res):
    '''Print the counts held in the joined-string layout built by
    pp_concat'''
    for ks,c in res.items():
        (r,p,k,s)=ks.split(SEP)
        # pp_concat stores an absent path as the empty string
        _print_row(r,None if p==EMPTY else p,k,s,c)


# Choose the storage layout, rebinding pp and dump to the matching pair;
# main looks both names up at call time.
if len(sys.argv)==2:
    res=dict()
    if sys.argv[1]=='1':
        pp=pp_tuple
        dump=dump_tuple
    else:
        pp=pp_concat
        dump=dump_concat
else:
    res=dict((r,dict()) for r in PATHS.keys())


def main():
    '''Read records from stdin, one JSON object per line, and count the
    schemes in each region of every response record.

    Only lines starting with "{" and containing "WARC-Type":"response"
    are parsed.  The number of such records is written to stderr, then
    the counts are written to stdout: pretty-printed as "res=..." if -d
    was given, otherwise as tab-separated rows.'''
    global n # for debugging
    n=0
    for l in sys.stdin:
        if l[0]=='{' and '"WARC-Type":"response"' in l:
            j=json.loads(l)
            n+=1
            for s in META_PATH:
                j=j[s]
            for k,v in PATHS.items():
                p=j
                try:
                    for s in v:
                        p=p[s]
                except KeyError:
                    # This record has no such region
                    continue
                walk(p,pp,k)

    print(n,file=sys.stderr)

    if dictRes:
        print('res=',end=EMPTY)
        from pprint import pprint
        pprint(res)
    else:
        dump(res)


def qq(p):
    '''Print a walk path of any length as dot-separated keys followed by
    a tab (just the tab if p is None)'''
    if p is None:
        sys.stdout.write('\t')
    else:
        qq1(p[0])
        print(p[1],end='\t')


def qq1(p):
    '''Print the keys of walk path p, outermost first, each followed by
    a dot'''
    if p is None:
        return
    else:
        qq1(p[0])
        print(p[1],end='.')


if __name__=="__main__":
    main()