Mercurial > hg > cc > azure
comparison master/src/wecu/sac_schemes.py @ 61:cfaf5223b071
trying to get my own mapper working
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Sun, 31 May 2020 12:06:44 +0000 |
parents | |
children | 892e1c0240e1 |
comparison
equal
deleted
inserted
replaced
60:5fdca5baa4e9 | 61:cfaf5223b071 |
---|---|
1 #!/usr/bin/python3 | |
2 '''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary''' | |
3 | |
4 import sys, json, regex | |
5 from collections.abc import Iterable | |
6 | |
# Keys followed, in order, from the top of each parsed record down to the
# HTTP response metadata that the main loop inspects.
META_PATH = [
    'Envelope',
    'Payload-Metadata',
    'HTTP-Response-Metadata',
]

# Output label -> key path (relative to the META_PATH node) of the
# sub-tree that is walked and reported under that label.
PATHS = {
    'hdr': ['Headers'],
    'head': ['HTML-Metadata', 'Head'],
    'body': ['HTML-Metadata', 'Links'],
}
12 | |
# Both patterns are applied with .match(), i.e. anchored at the start of the
# value.  Group 1 is everything before the terminating colon, including an
# optional leading '<'.

# A scheme-like prefix: a letter followed by letters, digits, '+', '.' or '-'.
SCHEME = regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')

# 'urn:' plus the following identifier, matched case-insensitively, so that
# group 1 reads 'urn:<identifier>' rather than just 'urn'.
URN = regex.compile('(<?urn:[a-z][a-z0-9+.-]*):', regex.IGNORECASE)
15 | |
def walk(o,f,path=""):
    '''Apply f to every key+leaf of a json object.

    o    -- a decoded JSON value: a dict, or an iterable (e.g. list) of them.
    f    -- callback invoked as f(key, value, dotted_path) for every dict
            entry whose value is neither a dict nor an iterable containing
            dicts.
    path -- prefix for the dotted path; each dict key is appended as ".key".
            List positions are not part of the path, so all dicts inside one
            list share the same path.

    Lists are expected to hold either only dicts or no dicts at all.  Mixed
    lists are reported on stderr ('oops' when the first dict is not the first
    element, 'oops2' for a non-dict element following a dict); their dict
    members are still walked, their non-dict members are never passed to f.

    Strings (and bytes) are always leaves.  They are Iterable, and a
    one-character string iterates to itself, so descending into them would
    recurse until RecursionError; a bare string outside a dict has no key to
    report and is ignored.
    '''
    if isinstance(o, (str, bytes)):
        return
    if isinstance(o, dict):
        for k, v in o.items():
            sub_path = "%s.%s" % (path, k)
            if isinstance(v, dict):
                walk(v, f, sub_path)
            elif isinstance(v, Iterable) and not isinstance(v, (str, bytes)):
                walked = False
                # Track position by index rather than comparing with v[0], so
                # that any iterable works, not only subscriptable ones.
                for pos, i in enumerate(v):
                    if isinstance(i, dict):
                        if (not walked) and pos > 0:
                            print('oops', path, k, i, file=sys.stderr)
                        walked = True
                        walk(i, f, sub_path)
                    elif walked:
                        print('oops2', path, k, i, file=sys.stderr)
                if not walked:
                    # No dicts inside: the whole value is a single leaf.
                    f(k, v, sub_path)
            else:
                f(k, v, sub_path)
    elif isinstance(o, Iterable):
        for i in o:
            walk(i, f, path)
39 | |
def pp(k,v,p):
    '''Print "path<TAB>scheme" for a string leaf that starts with a scheme.

    k -- key of the leaf (unused; present to fit the walk() callback shape)
    v -- value of the leaf; anything other than a str is ignored
    p -- dotted path of the leaf, printed as the first column

    Nothing is printed unless SCHEME matches at the start of v.  When the
    first line of v also matches URN, the longer URN prefix is reported
    instead of the bare scheme.
    '''
    if not isinstance(v, str):
        return
    scheme_match = SCHEME.match(v)
    if scheme_match is None:
        return
    # Only the first line of a multi-line value is examined for a URN.
    first_line = v.partition('\n')[0]
    urn_match = URN.match(first_line)
    reported = scheme_match if urn_match is None else urn_match
    print(p, reported.group(1), sep='\t')
53 | |
# Main loop: read one record per line from stdin and report, for every
# selected section of each response record, the schemes found by pp().
for line_no, line in enumerate(sys.stdin, 1):
    thousands, leftover = divmod(line_no, 1000)
    if leftover == 0:
        # Progress indicator on stderr: one tick per thousand input lines.
        print(thousands, file=sys.stderr)
    # Cheap textual filter before paying for a JSON parse.
    if not (line[0] == '{' and '"WARC-Type":"response"' in line):
        continue
    metadata = json.loads(line)
    for key in META_PATH:
        metadata = metadata[key]
    for label, steps in PATHS.items():
        section = metadata
        try:
            for step in steps:
                section = section[step]
        except KeyError:
            # This record has no such section; try the next one.
            continue
        # The label becomes the root of every path printed for the section.
        walk(section, pp, label)