annotate master/src/wecu/sac_schemes.py @ 61:cfaf5223b071

trying to get my own mapper working
author Henry S. Thompson <ht@markup.co.uk>
date Sun, 31 May 2020 12:06:44 +0000
parents
children 892e1c0240e1
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
61
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
1 #!/usr/bin/python3
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
2 '''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary'''
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
3
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
4 import sys, json, regex
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
5 from collections.abc import Iterable
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
6
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
# Keys followed, in order, from each parsed input record down to the
# metadata object that the sections in PATHS are looked up in.
META_PATH = [
    'Envelope',
    'Payload-Metadata',
    'HTTP-Response-Metadata',
]

# Section label -> key path below the META_PATH object.  The label is
# also used as the initial path prefix handed to walk().
PATHS = {
    'hdr': ['Headers'],
    'head': ['HTML-Metadata', 'Head'],
    'body': ['HTML-Metadata', 'Links'],
}

# An optional '<', then a letter followed by letters, digits, '+', '.'
# or '-', then ':'.  Group 1 is everything before the colon.
SCHEME = regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')

# Same shape but for 'urn:<name>:' prefixes, matched case-insensitively;
# group 1 is 'urn:<name>' (with any leading '<').
URN = regex.compile('(<?urn:[a-z][a-z0-9+.-]*):', regex.I)
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
15
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
def walk(o,f,path=""):
    '''Apply f to every key+leaf of a json object

    o is a decoded JSON value.  For every key of every dict reached, f is
    called as f(key, value, dotted_path) where dotted_path is path with
    ".key" appended for each dict level descended.

    Values are handled as follows:
      - a dict value is descended into;
      - a list whose members are dicts has each member descended into,
        all under the list's own key;
      - a list with no dict members is passed to f whole;
      - anything else (strings, numbers, None, ...) is passed to f.

    Lists that mix dicts and non-dicts are reported on stderr ('oops' for
    a first dict that is not the first member, 'oops2' for a non-dict
    following a dict); their non-dict members are not passed to f.

    A top-level list is walked member by member.  Strings and bytes are
    never iterated: iterating a str yields str again, which previously
    made walk() recurse without end on a top-level string or on a string
    inside a top-level list.
    '''
    if isinstance(o, dict):
        for k, v in o.items():
            here = "%s.%s" % (path, k)
            if isinstance(v, dict):
                walk(v, f, here)
            elif isinstance(v, Iterable) and not isinstance(v, (str, bytes)):
                walked = False
                for idx, i in enumerate(v):
                    if isinstance(i, dict):
                        # First dict seen but not the first member: the
                        # list started with non-dict members.
                        if (not walked) and idx != 0:
                            print('oops', path, k, i, file=sys.stderr)
                        walked = True
                        walk(i, f, here)
                    elif walked:
                        # Non-dict member after a dict member.
                        print('oops2', path, k, i, file=sys.stderr)
                if not walked:
                    # No dict members at all: treat the list as a leaf.
                    f(k, v, here)
            else:
                f(k, v, here)
    elif isinstance(o, Iterable) and not isinstance(o, (str, bytes)):
        for i in o:
            walk(i, f, path)
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
39
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
def pp(k,v,p):
    '''Print p and the scheme prefix of v, tab-separated, if v has one.

    Does nothing unless v is a string that SCHEME matches at its start.
    Only the first line of v is then tested against URN; when URN
    matches, its group 1 is printed in place of SCHEME's.  k is unused;
    it is accepted so pp can be passed to walk() as the callback.
    '''
    if not isinstance(v, str):
        return
    scheme_hit = SCHEME.match(v)
    if scheme_hit is None:
        return
    # Keep only the text before the first newline, if any.
    first_line = v.partition('\n')[0]
    urn_hit = URN.match(first_line)
    chosen = scheme_hit if urn_hit is None else urn_hit
    print(p, chosen.group(1), sep='\t')
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
53
cfaf5223b071 trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff changeset
# Main loop: read one record per line from stdin, and for every JSON line
# marked as a WARC response, walk each section named in PATHS, printing
# "path<TAB>scheme" for every scheme-prefixed string found (see pp).
line_no = 0
for line_no, line in enumerate(sys.stdin, 1):
    # Progress indicator on stderr: one number per thousand input lines.
    if line_no % 1000 == 0:
        print(line_no // 1000, file=sys.stderr)
    # Cheap textual filter so that only response records get parsed.
    if line[0] != '{' or '"WARC-Type":"response"' not in line:
        continue
    meta = json.loads(line)
    for step in META_PATH:
        meta = meta[step]
    for label, steps in PATHS.items():
        node = meta
        try:
            for step in steps:
                node = node[step]
        except KeyError:
            # This record has no such section; move on to the next one.
            pass
        else:
            walk(node, pp, label)