Mercurial > hg > cc > azure
view master/src/wecu/sac_schemes.py @ 66:b04870ab3035
don't over-count duplicate URIs in multiple properties, produce composite keys instead
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Thu, 04 Jun 2020 16:10:55 +0000 |
parents | d46c8b12fc04 |
children | 13182e98a1ab |
line wrap: on
line source
#!/usr/bin/python3
'''Count URI schemes found in the response records of a Common Crawl WAT stream.

Assumes export PYTHONIOENCODING=utf-8 has been done if necessary

Usage: uz ...wat.gz | sac_schemes.py [-d] [altStorageScheme]

-d dumps the raw result dictionary with pprint instead of the tab-separated
report.

altStorageScheme, if present, selects an alternative approach to storing
the (region, path, key, scheme) counts:
  [absent]: three nested dictionaries (region -> path -> key -> scheme)
  1: one dictionary indexed by 4-tuple; the only mode in which a URI that
     occurs under several properties of one object is counted once, under
     a composite key
  anything else (conventionally 2): one dictionary indexed by
     SEP.join(keys), where SEP is the NUL character'''

import json
import sys

try:
    import regex
except ImportError:
    # Only constructs common to both engines are used below (character
    # classes, one group, the I flag), so the standard library is an
    # equivalent stand-in when the third-party module is not installed.
    import re as regex

# NOTE: argument handling is deliberately done at import time; the storage
# selection further down depends on '-d' having been removed from sys.argv.
if len(sys.argv) > 1 and sys.argv[1] == '-d':
    sys.argv.pop(1)
    dictRes = True
else:
    dictRes = False

# Keys leading from the top of a WAT record to the HTTP response metadata
META_PATH = ['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata']

# Region name -> keys leading from the response metadata to that region
PATHS = {'hdr': ['Headers'],
         'head': ['HTML-Metadata', 'Head'],
         'body': ['HTML-Metadata', 'Links']}

# A URI scheme (optionally preceded by '<') at the start of a string
SCHEME = regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
# For URNs the namespace identifier is kept as part of the "scheme"
URN = regex.compile('(<?urn:[a-z][a-z0-9+.-]*):', regex.I)

EMPTY = ''

# Pending 4-tuple keys, indexed by the URI string that produced them.
# Filled and flushed by walk(); only ever non-empty in tuple mode.
D = {}


def walk(o, f, r, path=None):
    '''Apply f to every key+leaf of a json object reached via path in region r

    o    -- the (sub)object to traverse: dict, list/tuple, or anything else
            (anything else is ignored at this level)
    f    -- leaf handler, called as f(value, key, path, region, parent) for
            scalar leaves and as f(value, key, path, region) for lists that
            contain no dicts
    r    -- region name, passed through to f
    path -- nested (parent_path, key) pairs, None at the top

    When f returns a non-None key (tuple mode) the key is not counted
    straight away: keys are collected in D per URI value, so that a URI
    occurring under several properties is counted once under a composite
    key ((k1, k2), ...).  The collected keys are added to res after all
    the items of a dict have been handled.'''
    if isinstance(o, dict):
        for k, v in o.items():
            if isinstance(v, dict):
                walk(v, f, r, (path, k))
            elif isinstance(v, (list, tuple)):
                walked = False
                for i in v:
                    if isinstance(i, dict):
                        if (not walked) and (i is not v[0]):
                            # A dict turning up after non-dict items is
                            # unexpected: report it, but carry on.
                            # (This used to print an undefined name and
                            # so died with NameError.)
                            print('oops', r, path, k, i, file=sys.stderr)
                        walked = True
                        walk(i, f, r, (path, k))
                    elif walked:
                        # A non-dict after a dict is just as unexpected
                        print('oops2', r, path, k, i, file=sys.stderr)
                if not walked:
                    f(v, k, path, r)
            else:
                kk = f(v, k, path, r, o)
                if kk is not None:
                    if v in D:
                        # Same URI already seen: fold this property name
                        # into a composite key instead of counting twice
                        (rr, flat, jj, ss) = D[v]
                        D[v] = (rr, flat, (jj, k), ss)
                    else:
                        D[v] = kk
        if D:
            # NOTE(review): D is global, so a nested dict walked above has
            # already flushed whatever was pending at that point.
            for kk in D.values():
                res[kk] = res.get(kk, 0) + 1
            D.clear()
    elif isinstance(o, (list, tuple)):
        for i in o:
            walk(i, f, r, path)


def _scheme_of(v):
    '''Return the scheme (or urn:nid) that string v starts with, else None'''
    if not isinstance(v, str):
        return None
    m = SCHEME.match(v)
    if m is None:
        return None
    n = URN.match(v)
    if n is not None:
        m = n
    return m.group(1)


def _path_key(p):
    '''Flatten a walk() path to its single key; None for the top level'''
    # The following assumes paths are always either length 1 or length 2!!!
    #  by open-coding rather than using qq(p)
    if p is None:
        return None
    assert p[0] is None
    return p[1]


def pp(v, k, p, r, parent=None):
    '''Handle a leaf value v, with key k in parent, under path p from r

    Uses nested dictionaries: res[r][path][k][scheme] is incremented.
    Always returns None.'''
    s = _scheme_of(v)
    if s is None:
        return
    by_key = res[r].setdefault(_path_key(p), {})
    by_scheme = by_key.setdefault(k, {})
    by_scheme[s] = by_scheme.get(s, 0) + 1


def pp_tuple(v, k, p, r, parent=None):
    '''Handle a leaf value v, with key k in parent, under path p from r

    Uses one dict and 4-tuple keys (r, path, k, scheme).

    With a parent, the key is returned for the caller to count (see walk).
    Without one, the count is made here and None is returned.  None is also
    returned when v is not a string starting with a scheme.'''
    s = _scheme_of(v)
    if s is None:
        return None
    kk = (r, _path_key(p), k, s)
    if parent is None:
        # This branch used to reference kk without ever defining it
        res[kk] = res.get(kk, 0) + 1
        return None
    return kk


SEP = '\x00'
DOT = '.'


def pp_concat(v, k, p, r, parent=None):
    '''Handle a leaf value v, with key k in parent, under path p from r

    Uses one dict and one string key: r, path, k and scheme joined by SEP,
    with an empty path component at the top level.  Always returns None.'''
    s = _scheme_of(v)
    if s is None:
        return
    flat = EMPTY if p is None else _path_key(p)
    key = SEP.join((r, flat, k, s))
    res[key] = res.get(key, 0) + 1


def _label(r, p, top):
    '''Return region r, followed by DOT and p unless p equals/is top'''
    if p is top or p == top:
        return str(r)
    return '%s%s%s' % (r, DOT, p)


def dump(res):
    '''Print nested-dictionary results, one tab-separated line per count:
    region[.path], key, scheme, count'''
    for r, by_path in res.items():
        for p, by_key in by_path.items():
            for k, by_scheme in by_key.items():
                for s, c in by_scheme.items():
                    print(_label(r, p, None), k, s, c, sep='\t')


def dump_tuple(res):
    '''Print 4-tuple results, one tab-separated line per count:
    region[.path], key, scheme, count

    A composite key ((k1, k2), k3) is printed innermost-last, joined by
    '&': k3&k2&k1'''
    for (r, p, k, s), c in res.items():
        names = []
        while isinstance(k, tuple):
            names.append(str(k[1]))
            k = k[0]
        names.append(str(k))
        print(_label(r, p, None), '&'.join(names), s, c, sep='\t')


def dump_concat(res):
    '''Print SEP-joined results, one tab-separated line per count:
    region[.path], key, scheme, count'''
    for ks, c in res.items():
        (r, p, k, s) = ks.split(SEP)
        print(_label(r, p, EMPTY), k, s, c, sep='\t')


# Select the storage scheme: rebinding pp and dump switches main() over
if len(sys.argv) == 2:
    res = {}
    if sys.argv[1] == '1':
        print('using tuple', file=sys.stderr)
        pp = pp_tuple
        dump = dump_tuple
    else:
        print('using concat', file=sys.stderr)
        pp = pp_concat
        dump = dump_concat
else:
    print('using nested', file=sys.stderr)
    res = {region: {} for region in PATHS}


def main():
    '''Read WAT lines from stdin, count the schemes found in every response
    record, then write the record count to stderr and the results to stdout'''
    global n  # for debugging
    n = 0
    for line in sys.stdin:
        # Cheap textual filter, so only response records get parsed
        if line.startswith('{') and '"WARC-Type":"response"' in line:
            j = json.loads(line)
            n += 1
            for s in META_PATH:
                j = j[s]
            for k, v in PATHS.items():
                p = j
                try:
                    for s in v:
                        p = p[s]
                except KeyError:
                    # This record has no such region
                    continue
                walk(p, pp, k)
    print(n, file=sys.stderr)
    if dictRes:
        print('res=', end=EMPTY)
        from pprint import pprint
        pprint(res)
    else:
        dump(res)


def qq(p):
    '''Print a walk() path of any length as dot-separated keys, then a tab'''
    if p is None:
        sys.stdout.write('\t')
    else:
        qq1(p[0])
        print(p[1], end='\t')


def qq1(p):
    '''Print the keys of path p, outermost first, each followed by a dot'''
    if p is None:
        return
    qq1(p[0])
    print(p[1], end='.')


if __name__ == "__main__":
    main()