comparison master/src/wecu/sac_schemes.py @ 63:d46c8b12fc04

support multiple approaches to key combination, use local files to collect results
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 03 Jun 2020 16:40:34 +0000
parents 892e1c0240e1
children b04870ab3035
comparison
equal deleted inserted replaced
62:892e1c0240e1 63:d46c8b12fc04
1 #!/usr/bin/python3 1 #!/usr/bin/python3
2 '''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary''' 2 '''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary
3
4 Usage: uz ...wat.gz | sac_schemes.py [-d] [altStorageScheme]
5
6 where altStorageScheme if present selects an alternative approach to storing triple counts:
7 [absent]: three nested dictionaries
8 1: one dictionary indexed by 4-tuple
9 2: one dictionary indexed by SEP.join(keys), where SEP is a NUL character'''
3 10
4 import sys, json, regex 11 import sys, json, regex
5 from collections.abc import Iterable 12 from collections.abc import Iterable
13
14 if len(sys.argv)>1 and sys.argv[1]=='-d':
15 sys.argv.pop(1)
16 dictRes=True
17 else:
18 dictRes=False
6 19
7 META_PATH=['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata'] 20 META_PATH=['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata']
8 21
9 PATHS={'hdr':['Headers'], 22 PATHS={'hdr':['Headers'],
10 'head':['HTML-Metadata','Head'], 23 'head':['HTML-Metadata','Head'],
11 'body':['HTML-Metadata','Links']} 24 'body':['HTML-Metadata','Links']}
12 25
13 SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):') 26 SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
14 URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I) 27 URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I)
28
29 EMPTY=''
15 30
16 def walk(o,f,r,path=None): 31 def walk(o,f,r,path=None):
17 '''Apply f to every key+leaf of a json object in region r''' 32 '''Apply f to every key+leaf of a json object in region r'''
18 if isinstance(o,dict): 33 if isinstance(o,dict):
19 for k,v in o.items(): 34 for k,v in o.items():
36 elif isinstance(o,Iterable): 51 elif isinstance(o,Iterable):
37 for i in o: 52 for i in o:
38 walk(i,f,r,path) 53 walk(i,f,r,path)
39 54
40 def pp(v,k,p,r): 55 def pp(v,k,p,r):
56 '''Uses nested dictionaries'''
41 if isinstance(v,str): 57 if isinstance(v,str):
42 m=SCHEME.match(v) 58 m=SCHEME.match(v)
43 if m is not None: 59 if m is not None:
44 n=URN.match(v) 60 n=URN.match(v)
45 if n is not None: 61 if n is not None:
46 m=n 62 m=n
47 s=m.group(1) 63 s=m.group(1)
64 # The following assumes paths are always either length 1 or length 2!!!
65 # by open-coding rather than using qq(p)
66 if p is not None:
67 assert p[0] is None
68 p=p[1]
48 d=res[r].setdefault(p,dict()) 69 d=res[r].setdefault(p,dict())
49 d=d.setdefault(k,dict()) 70 d=d.setdefault(k,dict())
50 d[s]=d.get(s,0)+1 71 d[s]=d.get(s,0)+1
51 72
73 def pp_tuple(v,k,p,r):
74 '''Uses one dict and 4-tuple'''
75 if isinstance(v,str):
76 m=SCHEME.match(v)
77 if m is not None:
78 n=URN.match(v)
79 if n is not None:
80 m=n
81 s=m.group(1)
82 # The following assumes paths are always either length 1 or length 2!!!
83 # by open-coding rather than using qq(p)
84 if p is not None:
85 assert p[0] is None
86 p=p[1]
87 k=(r,p,k,s)
88 res[k]=res.get(k,0)+1
89
90 SEP='\x00'
91 DOT='.'
92
93 def pp_concat(v,k,p,r):
94 '''Uses one dict and one string'''
95 if isinstance(v,str):
96 m=SCHEME.match(v)
97 if m is not None:
98 n=URN.match(v)
99 if n is not None:
100 m=n
101 s=m.group(1)
102 # The following assumes paths are always either length 1 or length 2!!!
103 # by open-coding rather than using qq(p)
104 if p is None:
105 p=EMPTY
106 else:
107 assert p[0] is None
108 p=p[1]
109 k=SEP.join((r,p,k,s))
110 res[k]=res.get(k,0)+1
111
112 def dump(res):
113 for r in res.keys():
114 rv=res[r]
115 for p in rv.keys():
116 pv=rv[p]
117 for k,v in pv.items():
118 for s,c in v.items():
119 print(r,end=EMPTY)
120 if p is None:
121 print(EMPTY,end='\t')
122 else:
123 print('.',p,sep=EMPTY,end='\t')
124 print(k,end='\t')
125 print(s,c,sep='\t')
126
127 def dump_tuple(res):
128 for (r,p,k,s),c in res.items():
129 print(r,end=EMPTY)
130 # The following assumes paths are always either length 1 or length 2!!!
131 # by open-coding rather than using qq(p)
132 if p is None:
133 print(EMPTY,end='\t')
134 else:
135 print(DOT,p,sep=EMPTY,end='\t')
136 print(k,end='\t')
137 print(s,c,sep='\t')
138
139 def dump_concat(res):
140 for ks,c in res.items():
141 (r,p,k,s)=ks.split(SEP)
142 print(r,end=EMPTY)
143 # The following assumes paths are always either length 1 or length 2!!!
144 # by open-coding rather than using qq(p)
145 if p==EMPTY:
146 print(EMPTY,end='\t')
147 else:
148 print('.',p,sep=EMPTY,end='\t')
149 print(k,end='\t')
150 print(s,c,sep='\t')
151
152 if len(sys.argv)==2:
153 res=dict()
154 if sys.argv[1]=='1':
155 pp=pp_tuple
156 dump=dump_tuple
157 else:
158 pp=pp_concat
159 dump=dump_concat
160 else:
161 res=dict((r,dict()) for r in PATHS.keys())
162
52 def main(): 163 def main():
53 global n,res # for debugging 164 global n # for debugging
54 n=0 165 n=0
55 res=dict((r,dict()) for r in PATHS.keys())
56 for l in sys.stdin: 166 for l in sys.stdin:
57 if l[0]=='{' and '"WARC-Type":"response"' in l: 167 if l[0]=='{' and '"WARC-Type":"response"' in l:
58 j=json.loads(l) 168 j=json.loads(l)
59 n+=1 169 n+=1
60 for s in META_PATH: 170 for s in META_PATH:
68 continue 178 continue
69 walk(p,pp,k) 179 walk(p,pp,k)
70 180
71 print(n,file=sys.stderr) 181 print(n,file=sys.stderr)
72 182
73 for r in res.keys(): 183 if dictRes:
74 rv=res[r] 184 print('res=',end=EMPTY)
75 for p in rv.keys(): 185 from pprint import pprint
76 pv=rv[p] 186 pprint(res)
77 for k,v in pv.items(): 187 else:
78 for s,c in v.items(): 188 dump(res)
79 print(r,end='')
80 # The following assumes paths are always either length 1 or length 2!!!
81 # by open-coding rather than using qq(p)
82 if p is None:
83 print('',end='\t')
84 else:
85 assert p[0] is None
86 print('.',p[1],sep='',end='\t')
87 print(k,end='\t')
88 print(s,c,sep='\t')
89 189
90 def qq(p): 190 def qq(p):
91 if p is None: 191 if p is None:
92 sys.stdout.write('\t') 192 sys.stdout.write('\t')
93 else: 193 else: