Mercurial > hg > cc > azure
view master/src/wecu/sac_schemes.py @ 61:cfaf5223b071
trying to get my own mapper working
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Sun, 31 May 2020 12:06:44 +0000 |
parents | |
children | 892e1c0240e1 |
line wrap: on
line source
#!/usr/bin/python3
'''Report the URI schemes found in the metadata of WARC response records.

Reads one JSON record per line from stdin.  For every line that starts
with "{" and contains "WARC-Type":"response", the response metadata found
under META_PATH is searched along each of the PATHS, and for every string
leaf that begins with a URI scheme a line "<location>TAB<scheme>" is
printed, where <location> is the dotted path of keys leading to the leaf.
A progress counter is written to stderr every 1000 input lines.

Assumes export PYTHONIOENCODING=utf-8 has been done if necessary'''

import json
import sys
from collections.abc import Iterable

try:
    import regex
except ImportError:
    # The patterns below use only syntax shared with the standard library,
    # so fall back to it when the third-party module is not installed.
    import re as regex

# Keys leading from the top of a record down to the response metadata.
META_PATH = ['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata']

# Output label -> keys leading from the response metadata to the part
# that is searched for schemes.
PATHS = {'hdr': ['Headers'],
         'head': ['HTML-Metadata', 'Head'],
         'body': ['HTML-Metadata', 'Links']}

# A leading scheme name followed by a colon; an optional "<" prefix is
# accepted and kept as part of the captured group.
SCHEME = regex.compile(r'(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
# Same, but for "urn:<namespace>:" so that the namespace is captured too.
URN = regex.compile(r'(<?urn:[a-z][a-z0-9+.-]*):', regex.I)


def walk(o, f, path=""):
    '''Apply f to every key+leaf of a json object.

    f is called as f(key, value, dotted_path) for each dict entry whose
    value is neither a dict nor a list containing dicts.  Dicts, and dicts
    inside lists, are walked recursively.  A list that mixes dicts with
    other items is reported on stderr ("oops" / "oops2") and its non-dict
    items are not passed to f.  Items with no key of their own (e.g. a
    bare string or number inside a top-level list) are ignored.
    '''
    if isinstance(o, dict):
        for k, v in o.items():
            sub = "%s.%s" % (path, k)
            if isinstance(v, dict):
                walk(v, f, sub)
            elif isinstance(v, (str, bytes)) or not isinstance(v, Iterable):
                # Scalars, including strings, are leaves.
                f(k, v, sub)
            else:
                walked = False
                for idx, i in enumerate(v):
                    if isinstance(i, dict):
                        if not walked and idx != 0:
                            # Non-dict items came before the first dict.
                            print('oops', path, k, i, file=sys.stderr)
                        walked = True
                        walk(i, f, sub)
                    elif walked:
                        # Non-dict item after a dict in the same list.
                        print('oops2', path, k, i, file=sys.stderr)
                if not walked:
                    # No dicts inside: the whole collection is one leaf.
                    f(k, v, sub)
    elif isinstance(o, Iterable) and not isinstance(o, (str, bytes)):
        # Strings must not be iterated here: a one-character string
        # yields itself, which would recurse without end.
        for i in o:
            walk(i, f, path)


def scheme_of(v):
    '''Return the scheme that string v starts with, or None.

    The result excludes the trailing colon and includes a leading "<" if
    v has one.  For URNs the namespace is included (e.g. "urn:isbn").
    Non-string values yield None.
    '''
    if not isinstance(v, str):
        return None
    # Both patterns are anchored at the start of v and cannot match across
    # a newline, so multi-line values need no special handling.
    m = SCHEME.match(v)
    if m is None:
        return None
    urn = URN.match(v)
    return (m if urn is None else urn).group(1)


def pp(k, v, p):
    '''walk() callback: print "p<TAB>scheme" if v starts with a scheme.'''
    scheme = scheme_of(v)
    if scheme is not None:
        print(p, scheme, sep='\t')


def _descend(o, keys):
    '''Follow keys down through nested dicts; None if a step is missing.'''
    for k in keys:
        try:
            o = o[k]
        except (KeyError, TypeError):
            return None
    return o


def main(lines=None):
    '''Process JSON records, one per line, from lines (default: stdin).'''
    if lines is None:
        lines = sys.stdin
    for n, l in enumerate(lines, 1):
        if n % 1000 == 0:
            # Progress indicator, in thousands of lines read.
            print(n // 1000, file=sys.stderr)
        if not (l.startswith('{') and '"WARC-Type":"response"' in l):
            continue
        meta = _descend(json.loads(l), META_PATH)
        if meta is None:
            # Skip the record rather than abort the whole run.
            print('no response metadata, line', n, file=sys.stderr)
            continue
        for k, v in PATHS.items():
            p = _descend(meta, v)
            if p is not None:
                walk(p, pp, k)


if __name__ == '__main__':
    main()