Mercurial > hg > cc > azure
annotate master/src/wecu/sac_schemes.py @ 62:892e1c0240e1
added more robust (I hope) error handling,
got reducer working with support for choosing dict or tsv output
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 02 Jun 2020 17:35:07 +0000 |
parents | cfaf5223b071 |
children | d46c8b12fc04 |
rev | line source |
---|---|
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
1 #!/usr/bin/python3 |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
2 '''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary''' |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
3 |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
4 import sys, json, regex |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
5 from collections.abc import Iterable |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
6 |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
# Key path followed by main() from each parsed input record down to the
# node that holds the per-response metadata.
# NOTE(review): key names look like Common Crawl WAT records -- confirm.
META_PATH=['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata']

# Region name -> key path (relative to the META_PATH node) of the
# sub-object that main() hands to walk() for that region.  The region
# names are also the top-level keys of the result table built in main().
PATHS={'hdr':['Headers'],
       'head':['HTML-Metadata','Head'],
       'body':['HTML-Metadata','Links']}

# A scheme-like token at the start of a string, followed by ':'.
# Group 1 is the token, including an optional leading '<'.
SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
# Case-insensitive 'urn:' plus the following token, again followed by ':'.
# Group 1 is e.g. 'urn:isbn'; pp() prefers this over a plain SCHEME match.
URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I)
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
15 |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
def walk(o,f,r,path=None):
    '''Apply f to every key+leaf of a json object in region r.

    o    -- a parsed JSON value (dict, list or scalar)
    f    -- callback, invoked as f(value, key, path, r) for each leaf
    r    -- region label, passed through to f unchanged
    path -- nested tuple (parent_path, key) naming the dict that holds the
            leaf, or None at the top level

    Within a dict, a nested dict is walked under (path, key).  A list
    value containing dicts has each dict walked under (path, key); a list
    (or string) containing no dicts is handed to f whole.  Lists mixing
    dicts and non-dicts are reported on stderr and the non-dict members
    are skipped.  At the top level, lists are walked item by item and
    strings/scalars are ignored.
    '''
    if isinstance(o,dict):
        for k,v in o.items():
            if isinstance(v,dict):
                walk(v,f,r,(path,k))
            elif isinstance(v,Iterable):
                walked=False
                for i in v:
                    if isinstance(i,dict):
                        if (not walked) and (i is not v[0]):
                            # A dict turned up after leading non-dict items
                            print('oops',path,k,i,file=sys.stderr)
                        walked=True
                        walk(i,f,r,(path,k))
                    elif walked:
                        # A non-dict item follows a dict in the same list
                        print('oops2',path,k,i,file=sys.stderr)
                if not walked:
                    f(v,k,path,r)
            else:
                f(v,k,path,r)
    elif isinstance(o,Iterable) and not isinstance(o,str):
        # Strings are Iterable too, and a one-character string iterates to
        # itself, so descending into them would never terminate.
        for i in o:
            walk(i,f,r,path)
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
39 |
62
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
def pp(v,k,p,r):
    '''Tally the scheme prefix of string leaf v under res[r][p][k].

    Callback for walk(): v is the leaf value, k its key, p the path of
    the enclosing dict and r the region.  Non-string values and strings
    with no SCHEME match are ignored.  When URN also matches, its longer
    'urn:xxx' prefix is counted instead of the bare scheme.
    '''
    if not isinstance(v,str):
        return
    scheme_match=SCHEME.match(v)
    if scheme_match is None:
        return
    urn_match=URN.match(v)
    chosen=scheme_match if urn_match is None else urn_match
    prefix=chosen.group(1)
    # res is the module-level table created by main()
    counts=res[r].setdefault(p,dict()).setdefault(k,dict())
    counts[prefix]=counts.get(prefix,0)+1
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
51 |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
def main():
    '''Read JSON records from stdin and print a table of scheme counts.

    Only lines that start with '{' and contain '"WARC-Type":"response"'
    are parsed.  For each one, the node at META_PATH is located and every
    region in PATHS that is present is walked with pp as the callback.
    The number of records parsed is written to stderr, then one
    tab-separated row per (region, path, key, scheme) is written to
    stdout: region[.parent-key], key, scheme, count.
    '''
    global n,res # for debugging
    n=0
    res={region:dict() for region in PATHS}
    for line in sys.stdin:
        if line[0]!='{' or '"WARC-Type":"response"' not in line:
            continue
        meta=json.loads(line)
        n+=1
        for step in META_PATH:
            meta=meta[step]
        for region,steps in PATHS.items():
            target=meta
            try:
                for step in steps:
                    target=target[step]
            except KeyError:
                # This record has nothing for this region
                continue
            else:
                walk(target,pp,region)

    print(n,file=sys.stderr)

    for region,by_path in res.items():
        for path,by_key in by_path.items():
            for key,counts in by_key.items():
                for scheme,count in counts.items():
                    print(region,end='')
                    # The following assumes paths are always either length 1 or length 2!!!
                    #  by open-coding rather than using qq(p)
                    if path is None:
                        print('',end='\t')
                    else:
                        assert path[0] is None
                        print('.',path[1],sep='',end='\t')
                    print(key,scheme,count,sep='\t')
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
89 |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
def qq(p):
    '''Write path p to stdout as a tab-terminated field.

    A None path produces just the tab.  Otherwise the parent part p[0]
    is written by qq1 (each component followed by '.') and then the
    final component p[1] followed by a tab.
    '''
    if p is not None:
        qq1(p[0])
        print(p[1],end='\t')
        return
    sys.stdout.write('\t')
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
96 |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
def qq1(p):
    '''Print every component of nested path p, outermost first, each followed by '.'.

    p is either None (prints nothing) or a pair (parent_path, key).
    '''
    # Collect the chain from innermost to outermost, then emit it reversed
    chain=[]
    node=p
    while node is not None:
        chain.append(node)
        node=node[0]
    for node in reversed(chain):
        print(node[1],end='.')
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
103 |
892e1c0240e1
added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents:
61
diff
changeset
|
# Run the mapper only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__=="__main__":
    main()