Mercurial > hg > cc > azure
comparison master/src/wecu/sac_schemes.py @ 63:d46c8b12fc04
support multiple approaches to key combination, use local files to collect results
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Wed, 03 Jun 2020 16:40:34 +0000 |
parents | 892e1c0240e1 |
children | b04870ab3035 |
comparison
equal
deleted
inserted
replaced
62:892e1c0240e1 | 63:d46c8b12fc04 |
---|---|
1 #!/usr/bin/python3 | 1 #!/usr/bin/python3 |
2 '''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary''' | 2 '''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary |
3 | |
4 Usage: uz ...wat.gz | sac_schemes.py [-d] [altStorageScheme] | |
5 | |
6 where altStorageScheme if present selects an alternative approach to storing triple counts: | |
7 [absent]: three nested dictionaries | |
8 1: one dictionary indexed by 4-tuple | |
9 2: one dictionary indexed by SEP.join(keys), i.e. the four keys joined with a NUL separator (not a dot)''' | |
3 | 10 |
4 import sys, json, regex | 11 import sys, json, regex |
5 from collections.abc import Iterable | 12 from collections.abc import Iterable |
13 | |
14 if len(sys.argv)>1 and sys.argv[1]=='-d': | |
15 sys.argv.pop(1) | |
16 dictRes=True | |
17 else: | |
18 dictRes=False | |
6 | 19 |
7 META_PATH=['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata'] | 20 META_PATH=['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata'] |
8 | 21 |
9 PATHS={'hdr':['Headers'], | 22 PATHS={'hdr':['Headers'], |
10 'head':['HTML-Metadata','Head'], | 23 'head':['HTML-Metadata','Head'], |
11 'body':['HTML-Metadata','Links']} | 24 'body':['HTML-Metadata','Links']} |
12 | 25 |
13 SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):') | 26 SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):') |
14 URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I) | 27 URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I) |
28 | |
29 EMPTY='' | |
15 | 30 |
16 def walk(o,f,r,path=None): | 31 def walk(o,f,r,path=None): |
17 '''Apply f to every key+leaf of a json object in region r''' | 32 '''Apply f to every key+leaf of a json object in region r''' |
18 if isinstance(o,dict): | 33 if isinstance(o,dict): |
19 for k,v in o.items(): | 34 for k,v in o.items(): |
36 elif isinstance(o,Iterable): | 51 elif isinstance(o,Iterable): |
37 for i in o: | 52 for i in o: |
38 walk(i,f,r,path) | 53 walk(i,f,r,path) |
39 | 54 |
40 def pp(v,k,p,r): | 55 def pp(v,k,p,r): |
56 '''Uses nested dictionaries''' | |
41 if isinstance(v,str): | 57 if isinstance(v,str): |
42 m=SCHEME.match(v) | 58 m=SCHEME.match(v) |
43 if m is not None: | 59 if m is not None: |
44 n=URN.match(v) | 60 n=URN.match(v) |
45 if n is not None: | 61 if n is not None: |
46 m=n | 62 m=n |
47 s=m.group(1) | 63 s=m.group(1) |
64 # The following assumes paths are always either length 1 or length 2!!! | |
65 # by open-coding rather than using qq(p) | |
66 if p is not None: | |
67 assert p[0] is None | |
68 p=p[1] | |
48 d=res[r].setdefault(p,dict()) | 69 d=res[r].setdefault(p,dict()) |
49 d=d.setdefault(k,dict()) | 70 d=d.setdefault(k,dict()) |
50 d[s]=d.get(s,0)+1 | 71 d[s]=d.get(s,0)+1 |
51 | 72 |
73 def pp_tuple(v,k,p,r): | |
74 '''Uses one dict and 4-tuple''' | |
75 if isinstance(v,str): | |
76 m=SCHEME.match(v) | |
77 if m is not None: | |
78 n=URN.match(v) | |
79 if n is not None: | |
80 m=n | |
81 s=m.group(1) | |
82 # The following assumes paths are always either length 1 or length 2!!! | |
83 # by open-coding rather than using qq(p) | |
84 if p is not None: | |
85 assert p[0] is None | |
86 p=p[1] | |
87 k=(r,p,k,s) | |
88 res[k]=res.get(k,0)+1 | |
89 | |
90 SEP='\x00' | |
91 DOT='.' | |
92 | |
93 def pp_concat(v,k,p,r): | |
94 '''Uses one dict and one string''' | |
95 if isinstance(v,str): | |
96 m=SCHEME.match(v) | |
97 if m is not None: | |
98 n=URN.match(v) | |
99 if n is not None: | |
100 m=n | |
101 s=m.group(1) | |
102 # The following assumes paths are always either length 1 or length 2!!! | |
103 # by open-coding rather than using qq(p) | |
104 if p is None: | |
105 p=EMPTY | |
106 else: | |
107 assert p[0] is None | |
108 p=p[1] | |
109 k=SEP.join((r,p,k,s)) | |
110 res[k]=res.get(k,0)+1 | |
111 | |
112 def dump(res): | |
113 for r in res.keys(): | |
114 rv=res[r] | |
115 for p in rv.keys(): | |
116 pv=rv[p] | |
117 for k,v in pv.items(): | |
118 for s,c in v.items(): | |
119 print(r,end=EMPTY) | |
120 if p is None: | |
121 print(EMPTY,end='\t') | |
122 else: | |
123 print('.',p,sep=EMPTY,end='\t') | |
124 print(k,end='\t') | |
125 print(s,c,sep='\t') | |
126 | |
127 def dump_tuple(res): | |
128 for (r,p,k,s),c in res.items(): | |
129 print(r,end=EMPTY) | |
130 # The following assumes paths are always either length 1 or length 2!!! | |
131 # by open-coding rather than using qq(p) | |
132 if p is None: | |
133 print(EMPTY,end='\t') | |
134 else: | |
135 print(DOT,p,sep=EMPTY,end='\t') | |
136 print(k,end='\t') | |
137 print(s,c,sep='\t') | |
138 | |
139 def dump_concat(res): | |
140 for ks,c in res.items(): | |
141 (r,p,k,s)=ks.split(SEP) | |
142 print(r,end=EMPTY) | |
143 # The following assumes paths are always either length 1 or length 2!!! | |
144 # by open-coding rather than using qq(p) | |
145 if p==EMPTY: | |
146 print(EMPTY,end='\t') | |
147 else: | |
148 print('.',p,sep=EMPTY,end='\t') | |
149 print(k,end='\t') | |
150 print(s,c,sep='\t') | |
151 | |
152 if len(sys.argv)==2: | |
153 res=dict() | |
154 if sys.argv[1]=='1': | |
155 pp=pp_tuple | |
156 dump=dump_tuple | |
157 else: | |
158 pp=pp_concat | |
159 dump=dump_concat | |
160 else: | |
161 res=dict((r,dict()) for r in PATHS.keys()) | |
162 | |
52 def main(): | 163 def main(): |
53 global n,res # for debugging | 164 global n # for debugging |
54 n=0 | 165 n=0 |
55 res=dict((r,dict()) for r in PATHS.keys()) | |
56 for l in sys.stdin: | 166 for l in sys.stdin: |
57 if l[0]=='{' and '"WARC-Type":"response"' in l: | 167 if l[0]=='{' and '"WARC-Type":"response"' in l: |
58 j=json.loads(l) | 168 j=json.loads(l) |
59 n+=1 | 169 n+=1 |
60 for s in META_PATH: | 170 for s in META_PATH: |
68 continue | 178 continue |
69 walk(p,pp,k) | 179 walk(p,pp,k) |
70 | 180 |
71 print(n,file=sys.stderr) | 181 print(n,file=sys.stderr) |
72 | 182 |
73 for r in res.keys(): | 183 if dictRes: |
74 rv=res[r] | 184 print('res=',end=EMPTY) |
75 for p in rv.keys(): | 185 from pprint import pprint |
76 pv=rv[p] | 186 pprint(res) |
77 for k,v in pv.items(): | 187 else: |
78 for s,c in v.items(): | 188 dump(res) |
79 print(r,end='') | |
80 # The following assumes paths are always either length 1 or length 2!!! | |
81 # by open-coding rather than using qq(p) | |
82 if p is None: | |
83 print('',end='\t') | |
84 else: | |
85 assert p[0] is None | |
86 print('.',p[1],sep='',end='\t') | |
87 print(k,end='\t') | |
88 print(s,c,sep='\t') | |
89 | 189 |
90 def qq(p): | 190 def qq(p): |
91 if p is None: | 191 if p is None: |
92 sys.stdout.write('\t') | 192 sys.stdout.write('\t') |
93 else: | 193 else: |