Mercurial > hg > cc > azure
diff master/src/wecu/sac_schemes.py @ 63:d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Wed, 03 Jun 2020 16:40:34 +0000 |
parents | 892e1c0240e1 |
children | b04870ab3035 |
line wrap: on
line diff
--- a/master/src/wecu/sac_schemes.py Tue Jun 02 17:35:07 2020 +0000 +++ b/master/src/wecu/sac_schemes.py Wed Jun 03 16:40:34 2020 +0000 @@ -1,9 +1,22 @@ #!/usr/bin/python3 -'''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary''' +'''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary + +Usage: uz ...wat.gz | sac_schemes.py [-d] [altStorageScheme] + +where altStorageScheme if present selects an alternative approach to storing triple counts: + [absent]: three nested dictionaries + 1: one dictionary indexed by 4-tuple + 2: one dictionary indexed by ".".join(keys)''' import sys, json, regex from collections.abc import Iterable +if len(sys.argv)>1 and sys.argv[1]=='-d': + sys.argv.pop(1) + dictRes=True +else: + dictRes=False + META_PATH=['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata'] PATHS={'hdr':['Headers'], @@ -13,6 +26,8 @@ SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):') URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I) +EMPTY='' + def walk(o,f,r,path=None): '''Apply f to every key+leaf of a json object in region r''' if isinstance(o,dict): @@ -38,6 +53,45 @@ walk(i,f,r,path) def pp(v,k,p,r): + '''Uses nested dictionaries''' + if isinstance(v,str): + m=SCHEME.match(v) + if m is not None: + n=URN.match(v) + if n is not None: + m=n + s=m.group(1) + # The following assumes paths are always either length 1 or length 2!!! + # by open-coding rather than using qq(p) + if p is not None: + assert p[0] is None + p=p[1] + d=res[r].setdefault(p,dict()) + d=d.setdefault(k,dict()) + d[s]=d.get(s,0)+1 + +def pp_tuple(v,k,p,r): + '''Uses one dict and 4-tuple''' + if isinstance(v,str): + m=SCHEME.match(v) + if m is not None: + n=URN.match(v) + if n is not None: + m=n + s=m.group(1) + # The following assumes paths are always either length 1 or length 2!!! + # by open-coding rather than using qq(p) + if p is not None: + assert p[0] is None + p=p[1] + k=(r,p,k,s) + res[k]=res.get(k,0)+1 + +SEP='\x00' +DOT='.' + +def pp_concat(v,k,p,r): + '''Uses one dict and one string''' if isinstance(v,str): m=SCHEME.match(v) if m is not None: @@ -45,14 +99,70 @@ if n is not None: m=n s=m.group(1) - d=res[r].setdefault(p,dict()) - d=d.setdefault(k,dict()) - d[s]=d.get(s,0)+1 + # The following assumes paths are always either length 1 or length 2!!! + # by open-coding rather than using qq(p) + if p is None: + p=EMPTY + else: + assert p[0] is None + p=p[1] + k=SEP.join((r,p,k,s)) + res[k]=res.get(k,0)+1 + +def dump(res): + for r in res.keys(): + rv=res[r] + for p in rv.keys(): + pv=rv[p] + for k,v in pv.items(): + for s,c in v.items(): + print(r,end=EMPTY) + if p is None: + print(EMPTY,end='\t') + else: + print('.',p,sep=EMPTY,end='\t') + print(k,end='\t') + print(s,c,sep='\t') + +def dump_tuple(res): + for (r,p,k,s),c in res.items(): + print(r,end=EMPTY) + # The following assumes paths are always either length 1 or length 2!!! + # by open-coding rather than using qq(p) + if p is None: + print(EMPTY,end='\t') + else: + print(DOT,p,sep=EMPTY,end='\t') + print(k,end='\t') + print(s,c,sep='\t') + +def dump_concat(res): + for ks,c in res.items(): + (r,p,k,s)=ks.split(SEP) + print(r,end=EMPTY) + # The following assumes paths are always either length 1 or length 2!!! + # by open-coding rather than using qq(p) + if p==EMPTY: + print(EMPTY,end='\t') + else: + print('.',p,sep=EMPTY,end='\t') + print(k,end='\t') + print(s,c,sep='\t') + +if len(sys.argv)==2: + res=dict() + if sys.argv[1]=='1': + pp=pp_tuple + dump=dump_tuple + else: + pp=pp_concat + dump=dump_concat +else: + res=dict((r,dict()) for r in PATHS.keys()) def main(): - global n,res # for debugging + global n # for debugging n=0 - res=dict((r,dict()) for r in PATHS.keys()) for l in sys.stdin: if l[0]=='{' and '"WARC-Type":"response"' in l: j=json.loads(l) @@ -70,22 +180,12 @@ print(n,file=sys.stderr) - for r in res.keys(): - rv=res[r] - for p in rv.keys(): - pv=rv[p] - for k,v in pv.items(): - for s,c in v.items(): - print(r,end='') - # The following assumes paths are always either length 1 or length 2!!! - # by open-coding rather than using qq(p) - if p is None: - print('',end='\t') - else: - assert p[0] is None - print('.',p[1],sep='',end='\t') - print(k,end='\t') - print(s,c,sep='\t') + if dictRes: + print('res=',end=EMPTY) + from pprint import pprint + pprint(res) + else: + dump(res) def qq(p): if p is None: