Mercurial > hg > cc > azure
diff master/src/wecu/sac_schemes.py @ 62:892e1c0240e1
added more robust (I hope) error handling,
got reducer working with support for choosing dict or tsv output
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 02 Jun 2020 17:35:07 +0000 |
parents | cfaf5223b071 |
children | d46c8b12fc04 |
line wrap: on
line diff
```diff
--- a/master/src/wecu/sac_schemes.py Sun May 31 12:06:44 2020 +0000
+++ b/master/src/wecu/sac_schemes.py Tue Jun 02 17:35:07 2020 +0000
@@ -13,58 +13,93 @@
 SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
 URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I)
 
-def walk(o,f,path=""):
-  '''Apply f to every key+leaf of a json object'''
+def walk(o,f,r,path=None):
+  '''Apply f to every key+leaf of a json object in region r'''
   if isinstance(o,dict):
     for k,v in o.items():
       if isinstance(v,dict):
-        walk(v,f,"%s.%s"%(path,k))
+        walk(v,f,r,(path,k))
       elif isinstance(v,Iterable):
         walked=False
         for i in v:
           if isinstance(i,dict):
             if (not walked) and (i is not v[0]):
-              print('oops',path,k,i,file=sys.stderr)
+              print('oops',key,path,k,i,file=sys.stderr)
             walked=True
-            walk(i,f,"%s.%s"%(path,k))
+            walk(i,f,r,(path,k))
           elif walked:
-            print('oops2',path,k,i,file=sys.stderr)
+            print('oops2',key,path,k,i,file=sys.stderr)
         if not walked:
-          f(k,v,"%s.%s"%(path,k))
+          f(v,k,path,r)
       else:
-        f(k,v,"%s.%s"%(path,k))
+        f(v,k,path,r)
   elif isinstance(o,Iterable):
     for i in o:
-      walk(i,f,path)
+      walk(i,f,r,path)
 
-def pp(k,v,p):
+def pp(v,k,p,r):
   if isinstance(v,str):
     m=SCHEME.match(v)
     if m is not None:
-      try:
-        n=v.index('\n')
-        v=v[:n]
-      except ValueError:
-        pass
       n=URN.match(v)
       if n is not None:
         m=n
-      print(p,m.group(1),sep='\t')
+      s=m.group(1)
+      d=res[r].setdefault(p,dict())
+      d=d.setdefault(k,dict())
+      d[s]=d.get(s,0)+1
+
+def main():
+  global n,res # for debugging
+  n=0
+  res=dict((r,dict()) for r in PATHS.keys())
+  for l in sys.stdin:
+    if l[0]=='{' and '"WARC-Type":"response"' in l:
+      j=json.loads(l)
+      n+=1
+      for s in META_PATH:
+        j=j[s]
+      for k,v in PATHS.items():
+        p=j
+        try:
+          for s in v:
+            p=p[s]
+        except KeyError as e:
+          continue
+        walk(p,pp,k)
+
+  print(n,file=sys.stderr)
 
-n=0
-for l in sys.stdin:
-  n+=1
-  if n%1000==0:
-    print(int(n/1000),file=sys.stderr)
-  if l[0]=='{' and '"WARC-Type":"response"' in l:
-    j=json.loads(l)
-    for s in META_PATH:
-      j=j[s]
-    for k,v in PATHS.items():
-      p=j
-      try:
-        for s in v:
-          p=p[s]
-      except KeyError:
-        continue
-      walk(p,pp,k)
+  for r in res.keys():
+    rv=res[r]
+    for p in rv.keys():
+      pv=rv[p]
+      for k,v in pv.items():
+        for s,c in v.items():
+          print(r,end='')
+          # The following assumes paths are always either length 1 or length 2!!!
+          # by open-coding rather than using qq(p)
+          if p is None:
+            print('',end='\t')
+          else:
+            assert p[0] is None
+            print('.',p[1],sep='',end='\t')
+          print(k,end='\t')
+          print(s,c,sep='\t')
+
+def qq(p):
+  if p is None:
+    sys.stdout.write('\t')
+  else:
+    qq1(p[0])
+    print(p[1],end='\t')
+
+def qq1(p):
+  if p is None:
+    return
+  else:
+    qq1(p[0])
+    print(p[1],end='.')
+
+if __name__=="__main__":
+  main()
```