Mercurial > hg > cc > azure
diff master/src/wecu/sac_schemes.py @ 61:cfaf5223b071
trying to get my own mapper working
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Sun, 31 May 2020 12:06:44 +0000 |
parents | |
children | 892e1c0240e1 |
line wrap: on
line diff
#!/usr/bin/python3
'''Report the scheme-like prefixes of string values in JSON metadata records.

Reads one record per line from stdin.  For each line that starts with '{'
and contains "WARC-Type":"response", the record is parsed as JSON, the
sub-objects named by META_PATH and PATHS are located, and every string leaf
that starts with a scheme-like prefix is reported on stdout as
<dotted path> TAB <scheme>.  A progress count (in thousands of input lines)
is written to stderr.

Assumes export PYTHONIOENCODING=utf-8 has been done if necessary'''

import json
import sys
from collections.abc import Iterable

import regex

# Keys followed, in order, from the top of each parsed record to reach the
# object that holds the sections named in PATHS.
META_PATH = ['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata']

# Output label -> key path (relative to the META_PATH object) of the section
# to walk.  The label becomes the first component of every reported path.
PATHS = {'hdr': ['Headers'],
         'head': ['HTML-Metadata', 'Head'],
         'body': ['HTML-Metadata', 'Links']}

# An optional '<', then a letter followed by letters/digits/'+'/'.'/'-',
# terminated by ':'.  Group 1 is everything before the colon.
SCHEME = regex.compile(r'(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
# Same shape, but anchored on a leading 'urn:' (case-insensitive) so that
# group 1 is 'urn:' plus the following name rather than just 'urn'.
URN = regex.compile(r'(<?urn:[a-z][a-z0-9+.-]*):', regex.I)


def walk(o, f, path=""):
    '''Apply f to every key+leaf of a json object.

    o    -- a parsed JSON value
    f    -- callback, invoked as f(key, value, dotted_path) for each leaf
    path -- dotted path accumulated so far; each key is appended as ".key"

    Within a dict, a nested dict is walked recursively.  A list-like value
    is treated as a single leaf unless it contains dicts, in which case each
    dict is walked under the list's key and f is not called for the list
    itself.  Lists that mix dicts with non-dicts are reported on stderr
    ('oops' / 'oops2').  Any other value is a leaf.

    A non-dict container passed as o has each of its members walked with
    the same path.  Strings and bytes are never treated as containers here:
    iterating a one-character string yields that same string, so recursing
    into it would never terminate.
    '''
    if isinstance(o, dict):
        for k, v in o.items():
            sub_path = "%s.%s" % (path, k)
            if isinstance(v, dict):
                walk(v, f, sub_path)
            elif isinstance(v, (str, bytes)) or not isinstance(v, Iterable):
                # Scalars and strings are leaves; no need to scan a string
                # character by character looking for dicts.
                f(k, v, sub_path)
            else:
                walked = False
                for i in v:
                    if isinstance(i, dict):
                        if (not walked) and (i is not v[0]):
                            # First dict found, but it is not the first
                            # member: the list started with non-dicts.
                            print('oops', path, k, i, file=sys.stderr)
                        walked = True
                        walk(i, f, sub_path)
                    elif walked:
                        # A non-dict member following at least one dict.
                        print('oops2', path, k, i, file=sys.stderr)
                if not walked:
                    # No dicts inside: the whole list is one leaf.
                    f(k, v, sub_path)
    elif isinstance(o, Iterable) and not isinstance(o, (str, bytes)):
        for i in o:
            walk(i, f, path)


def pp(k, v, p):
    '''Print p and the scheme prefix of v, tab-separated, if v has one.

    k -- the key under which v was found (unused; part of walk's callback
         signature)
    v -- the leaf value; anything other than a str is ignored
    p -- dotted path of the leaf

    Nothing is printed unless SCHEME matches at the start of v.  When URN
    also matches (tested against the first line of v only), its group 1 is
    reported instead of SCHEME's.
    '''
    if not isinstance(v, str):
        return
    found = SCHEME.match(v)
    if found is None:
        return
    first_line = v.partition('\n')[0]
    urn = URN.match(first_line)
    if urn is not None:
        found = urn
    print(p, found.group(1), sep='\t')


def main():
    '''Filter stdin for response records and report the schemes they hold.

    Raises whatever json.loads raises on a malformed record, and KeyError
    if a selected record lacks one of the META_PATH keys; a record that
    merely lacks one of the PATHS sections has that section skipped.
    '''
    for count, line in enumerate(sys.stdin, 1):
        if count % 1000 == 0:
            # Progress indicator: number of thousands of lines read so far.
            print(count // 1000, file=sys.stderr)
        if line.startswith('{') and '"WARC-Type":"response"' in line:
            meta = json.loads(line)
            for key in META_PATH:
                meta = meta[key]
            for label, keys in PATHS.items():
                section = meta
                try:
                    for key in keys:
                        section = section[key]
                except KeyError:
                    # This record has no such section.
                    continue
                walk(section, pp, label)


if __name__ == "__main__":
    main()