annotate master/src/wecu/sac_schemes.py @ 62:892e1c0240e1

added more robust (I hope) error handling, got reducer working with support for choosing dict or tsv output
author Henry S. Thompson <ht@markup.co.uk>
date Tue, 02 Jun 2020 17:35:07 +0000
parents cfaf5223b071
children d46c8b12fc04
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
61
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
1 #!/usr/bin/python3
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
2 '''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary'''
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
3
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
4 import sys, json, regex
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
5 from collections.abc import Iterable
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
6
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
# Keys followed, in order, from the top of each parsed JSON record to reach
# the metadata object that main() then examines.
META_PATH=['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata']

# Region name -> keys followed, in order, below the META_PATH object to reach
# the sub-tree that is walked for that region.  The region names are also the
# top-level keys of the result table built in main().
PATHS={'hdr':['Headers'],
       'head':['HTML-Metadata','Head'],
       'body':['HTML-Metadata','Links']}

# Matches a scheme-like token (optionally preceded by '<') followed by ':' at
# the start of a string; group 1 is everything before the colon.
SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
# Case-insensitive variant for 'urn:<name>:' prefixes; group 1 keeps both the
# 'urn' part and the name that follows it.
URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I)
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
15
62
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
def walk(o,f,r,path=None):
    '''Apply f to every key+leaf of a json object in region r.

    o    -- a parsed JSON value (dict, list or scalar)
    f    -- callback invoked as f(value, key, path, r) for each leaf
    r    -- region label, passed through to f unchanged
    path -- nested tuple (parent_path, key) naming the dict that is being
            walked, or None at the top level

    A list value whose members are dicts is descended into, each member
    being walked under the path (path, key).  Any other list, and every
    scalar, is handed to f as a leaf.  A list mixing dicts with non-dicts
    is reported on stderr and only its dict members are walked.  Values
    that have no key of their own (e.g. bare strings inside a top-level
    list) are skipped.
    '''
    if isinstance(o, dict):
        for k, v in o.items():
            if isinstance(v, dict):
                walk(v, f, r, (path, k))
            elif isinstance(v, Iterable) and not isinstance(v, (str, bytes)):
                walked = False
                for idx, i in enumerate(v):
                    if isinstance(i, dict):
                        if not walked and idx != 0:
                            # First dict member is preceded by non-dicts
                            print('oops', path, k, i, file=sys.stderr)
                        walked = True
                        walk(i, f, r, (path, k))
                    elif walked:
                        # Non-dict member following a dict member
                        print('oops2', path, k, i, file=sys.stderr)
                if not walked:
                    # No dict members at all: the whole list is the leaf
                    f(v, k, path, r)
            else:
                f(v, k, path, r)
    elif isinstance(o, Iterable) and not isinstance(o, (str, bytes)):
        # Strings are iterable too, and so is each of their characters:
        # descending into one would never terminate, so they are excluded.
        for i in o:
            walk(i, f, r, path)
61
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
39
62
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
def pp(v,k,p,r):
    '''Leaf callback for walk: count the scheme at the start of v.

    Only string values that begin with something matching SCHEME are
    counted.  When URN also matches, its longer prefix is used as the
    scheme.  The count is kept in the global table as res[r][p][k][scheme].
    '''
    if not isinstance(v, str):
        return
    found = SCHEME.match(v)
    if found is None:
        return
    urn_found = URN.match(v)
    if urn_found is not None:
        found = urn_found
    scheme = found.group(1)
    tally = res[r].setdefault(p, {}).setdefault(k, {})
    tally[scheme] = tally.get(scheme, 0) + 1
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
51
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
def main():
    '''Tally schemes found in response records read from stdin.

    Each input line that starts with '{' and contains the response marker
    is parsed as JSON, narrowed via META_PATH, and each region listed in
    PATHS that is present is walked with pp as the leaf callback.  The
    number of records processed goes to stderr; the tallies go to stdout
    as tab-separated lines: region plus path, key, scheme, count.
    '''
    global n,res # for debugging
    n = 0
    res = {region: {} for region in PATHS}
    for line in sys.stdin:
        if line[0] != '{' or '"WARC-Type":"response"' not in line:
            continue
        record = json.loads(line)
        n += 1
        for step in META_PATH:
            record = record[step]
        for region, steps in PATHS.items():
            node = record
            try:
                for step in steps:
                    node = node[step]
            except KeyError:
                # This record has no such region
                continue
            walk(node, pp, region)

    print(n, file=sys.stderr)

    for region, by_path in res.items():
        for path, by_key in by_path.items():
            for key, counts in by_key.items():
                for scheme, count in counts.items():
                    print(region, end='')
                    # The following assumes paths are always either length 1 or length 2!!!
                    #  by open-coding rather than using qq(p)
                    if path is not None:
                        assert path[0] is None
                        print('.', path[1], sep='', end='\t')
                    else:
                        print('', end='\t')
                    print(key, end='\t')
                    print(scheme, count, sep='\t')
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
89
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
def qq(p):
    '''Write the nested-tuple path p to stdout, followed by a tab.

    p is either None, for which only the tab is written, or a pair
    (parent, name): the parent chain is written by qq1 and then name
    is printed with a tab after it.
    '''
    if p is not None:
        parent, name = p[0], p[1]
        qq1(parent)
        print(name, end='\t')
        return
    sys.stdout.write('\t')
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
96
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
def qq1(p):
    '''Print each name in the nested-tuple path p, outermost first.

    p is None (nothing is printed) or a pair (parent, name) whose
    parent has the same shape.  Every name is followed by a '.'.
    '''
    # Collect the chain from innermost to outermost, then emit it reversed
    chain = []
    node = p
    while node is not None:
        chain.append(node)
        node = node[0]
    for node in reversed(chain):
        print(node[1], end='.')
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
103
892e1c0240e1 added more robust (I hope) error handling,
Henry S. Thompson <ht@markup.co.uk>
parents: 61
diff changeset
# Run the tally over stdin only when executed as a script, not on import.
if __name__=="__main__":
    main()