comparison master/src/wecu/sac_schemes.py @ 61:cfaf5223b071

trying to get my own mapper working
author Henry S. Thompson <ht@markup.co.uk>
date Sun, 31 May 2020 12:06:44 +0000
parents
children 892e1c0240e1
comparison
equal deleted inserted replaced
60:5fdca5baa4e9 61:cfaf5223b071
1 #!/usr/bin/python3
2 '''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary'''
3
4 import sys, json, regex
5 from collections.abc import Iterable
6
# Keys followed, in order, from the top of each decoded input record to reach
# the object that the PATHS entries below are looked up in.
META_PATH = [
    'Envelope',
    'Payload-Metadata',
    'HTTP-Response-Metadata',
]

# Output label -> list of keys to follow beneath the META_PATH object.
# The label becomes the first component of every path that gets printed.
PATHS = {
    'hdr': ['Headers'],
    'head': ['HTML-Metadata', 'Head'],
    'body': ['HTML-Metadata', 'Links'],
}

# A leading scheme-like token (with an optional '<' in front) up to, but not
# including, the first ':'.
SCHEME = regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
# Same shape, but for 'urn:' followed by one more colon-terminated token;
# matched case-insensitively.
URN = regex.compile('(<?urn:[a-z][a-z0-9+.-]*):', regex.IGNORECASE)
15
def walk(o, f, path=""):
    '''Apply f to every key+leaf of a json object.

    o    -- a decoded JSON value: dict, list or scalar
    f    -- callback, invoked as f(key, value, dotted_path) for each leaf
    path -- dotted-path prefix built up so far; each dict key is appended
            to it as ".key"

    A dict value is recursed into.  A list value is recursed into element
    by element if it contains dicts, otherwise the whole list is handed to
    f as a single leaf.  A list mixing dicts and non-dicts is reported on
    stderr ('oops' / 'oops2') and only its dict members are walked.

    Strings (and bytes) are always leaves.  They are technically Iterable,
    and iterating a one-character string yields that same string again, so
    treating them as containers would recurse without end.  A scalar that
    is not the value of some dict key has no key to report and is skipped.
    '''
    if isinstance(o, dict):
        for k, v in o.items():
            sub = "%s.%s" % (path, k)
            if isinstance(v, dict):
                walk(v, f, sub)
            elif isinstance(v, Iterable) and not isinstance(v, (str, bytes)):
                walked = False
                for idx, i in enumerate(v):
                    if isinstance(i, dict):
                        # First dict seen, but not at the front: the list
                        # started with non-dict members that get dropped.
                        if not walked and idx != 0:
                            print('oops', path, k, i, file=sys.stderr)
                        walked = True
                        walk(i, f, sub)
                    elif walked:
                        # Non-dict member following a dict: also dropped.
                        print('oops2', path, k, i, file=sys.stderr)
                if not walked:
                    # No dicts inside: the list itself is the leaf.
                    f(k, v, sub)
            else:
                f(k, v, sub)
    elif isinstance(o, Iterable) and not isinstance(o, (str, bytes)):
        for i in o:
            walk(i, f, path)
39
def pp(k, v, p):
    '''Print p and the scheme prefix of v, tab-separated, when v has one.

    k -- the key the value was found under (not used here)
    v -- the leaf value; anything that is not a str is ignored
    p -- the dotted path to report alongside the scheme

    Nothing is printed unless SCHEME matches at the start of v.  When URN
    also matches the first line of v, its longer capture is printed in
    place of the SCHEME capture.
    '''
    if not isinstance(v, str):
        return
    scheme = SCHEME.match(v)
    if scheme is None:
        return
    # Only the text before the first newline is offered to URN.
    first_line = v.split('\n', 1)[0]
    urn = URN.match(first_line)
    chosen = scheme if urn is None else urn
    print(p, chosen.group(1), sep='\t')
53
# Mapper main loop: read records from stdin one line at a time and print
# a (path, scheme) pair for every qualifying leaf found in them.
lines_seen = 0
for line in sys.stdin:
    lines_seen += 1
    thousands, remainder = divmod(lines_seen, 1000)
    if remainder == 0:
        # Progress indicator on stderr, once per thousand input lines.
        print(thousands, file=sys.stderr)
    # Only lines that open a JSON object and carry the response marker
    # are decoded; everything else is skipped unparsed.
    if not (line[0] == '{' and '"WARC-Type":"response"' in line):
        continue
    meta = json.loads(line)
    for step in META_PATH:
        meta = meta[step]
    for label, steps in PATHS.items():
        node = meta
        try:
            for step in steps:
                node = node[step]
        except KeyError:
            # This record has nothing at this path; try the next one.
            continue
        walk(node, pp, label)