Mercurial > hg > cc > azure
annotate master/src/wecu/sac_schemes.py @ 61:cfaf5223b071
trying to get my own mapper working
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Sun, 31 May 2020 12:06:44 +0000 |
parents | |
children | 892e1c0240e1 |
rev | line source |
---|---|
61
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
1 #!/usr/bin/python3 |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
2 '''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary''' |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
3 |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
4 import sys, json, regex |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
5 from collections.abc import Iterable |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
6 |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
# Keys followed, in order, from the top of each JSON record down to the
# object that holds the per-response metadata.
META_PATH = ['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata']

# Short label -> key path (relative to the META_PATH object) of each
# sub-tree that gets walked.  The label becomes the first component of
# every path that is printed for that sub-tree.
PATHS = {
    'hdr': ['Headers'],
    'head': ['HTML-Metadata', 'Head'],
    'body': ['HTML-Metadata', 'Links'],
}

# A value "has a scheme" if it starts with an optional '<', a letter, any
# run of letters/digits/'+'/'.'/'-', and then a colon.  Group 1 is
# everything before that colon (including the '<' if present).
SCHEME = regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')

# Case-insensitive refinement for urn: values: group 1 is 'urn:' plus the
# following name, up to (not including) the second colon.
URN = regex.compile('(<?urn:[a-z][a-z0-9+.-]*):', regex.I)
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
15 |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
def walk(o,f,path=""):
    '''Apply f to every key+leaf of a json object.

    o is a decoded JSON value.  Dicts are descended into recursively; for
    every key whose value is a leaf, f(key, value, dotted_path) is called,
    where dotted_path is path with ".key" appended for each dict level
    passed through.  A value counts as a leaf if it is a scalar, a string,
    or an iterable that contains no dicts (such an iterable is passed to f
    whole).  An iterable that does contain dicts has each of its dicts
    walked under the key's path; non-dict items mixed in with the dicts
    are reported on stderr ('oops' / 'oops2') and otherwise ignored.

    If o itself is a non-string iterable, each of its items is walked with
    path unchanged.  Anything else at the top level (scalar, string) has
    no key to report and is ignored.  Returns None.
    '''
    if isinstance(o, dict):
        for k, v in o.items():
            sub = "%s.%s" % (path, k)
            if isinstance(v, dict):
                walk(v, f, sub)
            elif isinstance(v, (str, bytes)) or not isinstance(v, Iterable):
                # Strings are Iterable too, but they are leaves: hand them
                # over directly instead of scanning them char by char.
                f(k, v, sub)
            else:
                walked = False
                for idx, i in enumerate(v):
                    if isinstance(i, dict):
                        if not walked and idx > 0:
                            # First dict found was preceded by non-dict items
                            print('oops', path, k, i, file=sys.stderr)
                        walked = True
                        walk(i, f, sub)
                    elif walked:
                        # Non-dict item following at least one dict
                        print('oops2', path, k, i, file=sys.stderr)
                if not walked:
                    # No dicts inside: the whole iterable is the leaf
                    f(k, v, sub)
    elif isinstance(o, Iterable) and not isinstance(o, (str, bytes)):
        # Strings must be excluded here: a one-character string iterates
        # to itself, so descending into it would never terminate.
        for i in o:
            walk(i, f, path)
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
39 |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
def pp(k,v,p):
    '''Print p and the scheme prefix of v, tab-separated, if v has one.

    Callback with the (key, value, path) signature that walk expects; the
    key k is not used.  Nothing is printed unless v is a string that
    matches SCHEME at its start.  If the first line of v also matches URN,
    the URN match's group 1 is printed in place of the SCHEME one.
    '''
    if not isinstance(v, str):
        return
    scheme = SCHEME.match(v)
    if scheme is None:
        return
    # Only the text before the first newline (if any) is tested against URN
    first_line = v.split('\n', 1)[0]
    urn = URN.match(first_line)
    best = scheme if urn is None else urn
    print(p, best.group(1), sep='\t')
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
53 |
cfaf5223b071
trying to get my own mapper working
Henry S. Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
# Mapper driver: read one record per line from stdin, and for every JSON
# line mentioning a WARC response, walk the sub-trees named in PATHS,
# letting pp print a "path<TAB>scheme" line for each matching leaf.
for count, line in enumerate(sys.stdin, 1):
    thousands, remainder = divmod(count, 1000)
    if remainder == 0:
        # Progress indicator on stderr: one number per 1000 input lines
        print(thousands, file=sys.stderr)
    if not (line[0] == '{' and '"WARC-Type":"response"' in line):
        # Cheap textual filter so that only candidate lines get parsed
        continue
    meta = json.loads(line)
    for step in META_PATH:
        meta = meta[step]
    for label, steps in PATHS.items():
        target = meta
        try:
            for step in steps:
                target = target[step]
        except KeyError:
            # This record has no such sub-tree: nothing to walk for label
            continue
        walk(target, pp, label)