comparison master/src/wecu/sac_schemes.py @ 62:892e1c0240e1

added more robust (I hope) error handling, got reducer working with support for choosing dict or tsv output
author Henry S. Thompson <ht@markup.co.uk>
date Tue, 02 Jun 2020 17:35:07 +0000
parents cfaf5223b071
children d46c8b12fc04
comparison
equal deleted inserted replaced
61:cfaf5223b071 62:892e1c0240e1
11 'body':['HTML-Metadata','Links']} 11 'body':['HTML-Metadata','Links']}
12 12
13 SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):') 13 SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
14 URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I) 14 URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I)
15 15
16 def walk(o,f,path=""): 16 def walk(o,f,r,path=None):
17 '''Apply f to every key+leaf of a json object''' 17 '''Apply f to every key+leaf of a json object in region r'''
18 if isinstance(o,dict): 18 if isinstance(o,dict):
19 for k,v in o.items(): 19 for k,v in o.items():
20 if isinstance(v,dict): 20 if isinstance(v,dict):
21 walk(v,f,"%s.%s"%(path,k)) 21 walk(v,f,r,(path,k))
22 elif isinstance(v,Iterable): 22 elif isinstance(v,Iterable):
23 walked=False 23 walked=False
24 for i in v: 24 for i in v:
25 if isinstance(i,dict): 25 if isinstance(i,dict):
26 if (not walked) and (i is not v[0]): 26 if (not walked) and (i is not v[0]):
27 print('oops',path,k,i,file=sys.stderr) 27 print('oops',key,path,k,i,file=sys.stderr)
28 walked=True 28 walked=True
29 walk(i,f,"%s.%s"%(path,k)) 29 walk(i,f,r,(path,k))
30 elif walked: 30 elif walked:
31 print('oops2',path,k,i,file=sys.stderr) 31 print('oops2',key,path,k,i,file=sys.stderr)
32 if not walked: 32 if not walked:
33 f(k,v,"%s.%s"%(path,k)) 33 f(v,k,path,r)
34 else: 34 else:
35 f(k,v,"%s.%s"%(path,k)) 35 f(v,k,path,r)
36 elif isinstance(o,Iterable): 36 elif isinstance(o,Iterable):
37 for i in o: 37 for i in o:
38 walk(i,f,path) 38 walk(i,f,r,path)
39 39
40 def pp(k,v,p): 40 def pp(v,k,p,r):
41 if isinstance(v,str): 41 if isinstance(v,str):
42 m=SCHEME.match(v) 42 m=SCHEME.match(v)
43 if m is not None: 43 if m is not None:
44 try:
45 n=v.index('\n')
46 v=v[:n]
47 except ValueError:
48 pass
49 n=URN.match(v) 44 n=URN.match(v)
50 if n is not None: 45 if n is not None:
51 m=n 46 m=n
52 print(p,m.group(1),sep='\t') 47 s=m.group(1)
48 d=res[r].setdefault(p,dict())
49 d=d.setdefault(k,dict())
50 d[s]=d.get(s,0)+1
53 51
54 n=0 52 def main():
55 for l in sys.stdin: 53 global n,res # for debugging
56 n+=1 54 n=0
57 if n%1000==0: 55 res=dict((r,dict()) for r in PATHS.keys())
58 print(int(n/1000),file=sys.stderr) 56 for l in sys.stdin:
59 if l[0]=='{' and '"WARC-Type":"response"' in l: 57 if l[0]=='{' and '"WARC-Type":"response"' in l:
60 j=json.loads(l) 58 j=json.loads(l)
61 for s in META_PATH: 59 n+=1
62 j=j[s] 60 for s in META_PATH:
63 for k,v in PATHS.items(): 61 j=j[s]
64 p=j 62 for k,v in PATHS.items():
65 try: 63 p=j
66 for s in v: 64 try:
67 p=p[s] 65 for s in v:
68 except KeyError: 66 p=p[s]
69 continue 67 except KeyError as e:
70 walk(p,pp,k) 68 continue
69 walk(p,pp,k)
70
71 print(n,file=sys.stderr)
72
73 for r in res.keys():
74 rv=res[r]
75 for p in rv.keys():
76 pv=rv[p]
77 for k,v in pv.items():
78 for s,c in v.items():
79 print(r,end='')
80 # The following assumes paths are always either length 1 or length 2!!!
81 # by open-coding rather than using qq(p)
82 if p is None:
83 print('',end='\t')
84 else:
85 assert p[0] is None
86 print('.',p[1],sep='',end='\t')
87 print(k,end='\t')
88 print(s,c,sep='\t')
89
90 def qq(p):
91 if p is None:
92 sys.stdout.write('\t')
93 else:
94 qq1(p[0])
95 print(p[1],end='\t')
96
97 def qq1(p):
98 if p is None:
99 return
100 else:
101 qq1(p[0])
102 print(p[1],end='.')
103
104 if __name__=="__main__":
105 main()