Mercurial > hg > cc > azure
comparison master/src/wecu/sac_schemes.py @ 62:892e1c0240e1
Added more robust (I hope) error handling;
got the reducer working, with support for choosing dict or TSV output.
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 02 Jun 2020 17:35:07 +0000 |
parents | cfaf5223b071 |
children | d46c8b12fc04 |
comparison
equal
deleted
inserted
replaced
61:cfaf5223b071 | 62:892e1c0240e1 |
---|---|
11 'body':['HTML-Metadata','Links']} | 11 'body':['HTML-Metadata','Links']} |
12 | 12 |
13 SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):') | 13 SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):') |
14 URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I) | 14 URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I) |
15 | 15 |
16 def walk(o,f,path=""): | 16 def walk(o,f,r,path=None): |
17 '''Apply f to every key+leaf of a json object''' | 17 '''Apply f to every key+leaf of a json object in region r''' |
18 if isinstance(o,dict): | 18 if isinstance(o,dict): |
19 for k,v in o.items(): | 19 for k,v in o.items(): |
20 if isinstance(v,dict): | 20 if isinstance(v,dict): |
21 walk(v,f,"%s.%s"%(path,k)) | 21 walk(v,f,r,(path,k)) |
22 elif isinstance(v,Iterable): | 22 elif isinstance(v,Iterable): |
23 walked=False | 23 walked=False |
24 for i in v: | 24 for i in v: |
25 if isinstance(i,dict): | 25 if isinstance(i,dict): |
26 if (not walked) and (i is not v[0]): | 26 if (not walked) and (i is not v[0]): |
27 print('oops',path,k,i,file=sys.stderr) | 27 print('oops',key,path,k,i,file=sys.stderr) |
28 walked=True | 28 walked=True |
29 walk(i,f,"%s.%s"%(path,k)) | 29 walk(i,f,r,(path,k)) |
30 elif walked: | 30 elif walked: |
31 print('oops2',path,k,i,file=sys.stderr) | 31 print('oops2',key,path,k,i,file=sys.stderr) |
32 if not walked: | 32 if not walked: |
33 f(k,v,"%s.%s"%(path,k)) | 33 f(v,k,path,r) |
34 else: | 34 else: |
35 f(k,v,"%s.%s"%(path,k)) | 35 f(v,k,path,r) |
36 elif isinstance(o,Iterable): | 36 elif isinstance(o,Iterable): |
37 for i in o: | 37 for i in o: |
38 walk(i,f,path) | 38 walk(i,f,r,path) |
39 | 39 |
40 def pp(k,v,p): | 40 def pp(v,k,p,r): |
41 if isinstance(v,str): | 41 if isinstance(v,str): |
42 m=SCHEME.match(v) | 42 m=SCHEME.match(v) |
43 if m is not None: | 43 if m is not None: |
44 try: | |
45 n=v.index('\n') | |
46 v=v[:n] | |
47 except ValueError: | |
48 pass | |
49 n=URN.match(v) | 44 n=URN.match(v) |
50 if n is not None: | 45 if n is not None: |
51 m=n | 46 m=n |
52 print(p,m.group(1),sep='\t') | 47 s=m.group(1) |
48 d=res[r].setdefault(p,dict()) | |
49 d=d.setdefault(k,dict()) | |
50 d[s]=d.get(s,0)+1 | |
53 | 51 |
54 n=0 | 52 def main(): |
55 for l in sys.stdin: | 53 global n,res # for debugging |
56 n+=1 | 54 n=0 |
57 if n%1000==0: | 55 res=dict((r,dict()) for r in PATHS.keys()) |
58 print(int(n/1000),file=sys.stderr) | 56 for l in sys.stdin: |
59 if l[0]=='{' and '"WARC-Type":"response"' in l: | 57 if l[0]=='{' and '"WARC-Type":"response"' in l: |
60 j=json.loads(l) | 58 j=json.loads(l) |
61 for s in META_PATH: | 59 n+=1 |
62 j=j[s] | 60 for s in META_PATH: |
63 for k,v in PATHS.items(): | 61 j=j[s] |
64 p=j | 62 for k,v in PATHS.items(): |
65 try: | 63 p=j |
66 for s in v: | 64 try: |
67 p=p[s] | 65 for s in v: |
68 except KeyError: | 66 p=p[s] |
69 continue | 67 except KeyError as e: |
70 walk(p,pp,k) | 68 continue |
69 walk(p,pp,k) | |
70 | |
71 print(n,file=sys.stderr) | |
72 | |
73 for r in res.keys(): | |
74 rv=res[r] | |
75 for p in rv.keys(): | |
76 pv=rv[p] | |
77 for k,v in pv.items(): | |
78 for s,c in v.items(): | |
79 print(r,end='') | |
80 # The following assumes paths are always either length 1 or length 2!!! | |
81 # by open-coding rather than using qq(p) | |
82 if p is None: | |
83 print('',end='\t') | |
84 else: | |
85 assert p[0] is None | |
86 print('.',p[1],sep='',end='\t') | |
87 print(k,end='\t') | |
88 print(s,c,sep='\t') | |
89 | |
90 def qq(p): | |
91 if p is None: | |
92 sys.stdout.write('\t') | |
93 else: | |
94 qq1(p[0]) | |
95 print(p[1],end='\t') | |
96 | |
97 def qq1(p): | |
98 if p is None: | |
99 return | |
100 else: | |
101 qq1(p[0]) | |
102 print(p[1],end='.') | |
103 | |
104 if __name__=="__main__": | |
105 main() |