Mercurial > hg > cc > azure
comparison master/src/wecu/sac_schemes.py @ 66:b04870ab3035
Don't over-count duplicate URIs that appear in multiple properties; produce composite keys instead
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Thu, 04 Jun 2020 16:10:55 +0000 |
parents | d46c8b12fc04 |
children | 13182e98a1ab |
comparison
equal
deleted
inserted
replaced
65:e1f61f94b196 | 66:b04870ab3035 |
---|---|
7 [absent]: three nested dictionaries | 7 [absent]: three nested dictionaries |
8 1: one dictionary indexed by 4-tuple | 8 1: one dictionary indexed by 4-tuple |
9 2: one dictionary indexed by ".".join(keys)''' | 9 2: one dictionary indexed by ".".join(keys)''' |
10 | 10 |
11 import sys, json, regex | 11 import sys, json, regex |
12 from collections.abc import Iterable | |
13 | 12 |
14 if len(sys.argv)>1 and sys.argv[1]=='-d': | 13 if len(sys.argv)>1 and sys.argv[1]=='-d': |
15 sys.argv.pop(1) | 14 sys.argv.pop(1) |
16 dictRes=True | 15 dictRes=True |
17 else: | 16 else: |
26 SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):') | 25 SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):') |
27 URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I) | 26 URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I) |
28 | 27 |
29 EMPTY='' | 28 EMPTY='' |
30 | 29 |
30 D={} | |
31 | |
31 def walk(o,f,r,path=None): | 32 def walk(o,f,r,path=None): |
32 '''Apply f to every key+leaf of a json object in region r''' | 33 '''Apply f to every key+leaf of a json object reached via p in region r''' |
33 if isinstance(o,dict): | 34 if isinstance(o,dict): |
34 for k,v in o.items(): | 35 for k,v in o.items(): |
35 if isinstance(v,dict): | 36 if isinstance(v,dict): |
36 walk(v,f,r,(path,k)) | 37 walk(v,f,r,(path,k)) |
37 elif isinstance(v,Iterable): | 38 elif isinstance(v,(list,tuple)): |
38 walked=False | 39 walked=False |
39 for i in v: | 40 for i in v: |
40 if isinstance(i,dict): | 41 if isinstance(i,dict): |
41 if (not walked) and (i is not v[0]): | 42 if (not walked) and (i is not v[0]): |
42 print('oops',key,path,k,i,file=sys.stderr) | 43 print('oops',key,path,k,i,file=sys.stderr) |
45 elif walked: | 46 elif walked: |
46 print('oops2',key,path,k,i,file=sys.stderr) | 47 print('oops2',key,path,k,i,file=sys.stderr) |
47 if not walked: | 48 if not walked: |
48 f(v,k,path,r) | 49 f(v,k,path,r) |
49 else: | 50 else: |
50 f(v,k,path,r) | 51 kk=f(v,k,path,r,o) |
51 elif isinstance(o,Iterable): | 52 if kk is not None: |
53 #print(v,D,kk,file=sys.stderr) | |
54 if v in D: | |
55 (rr,pp,jj,ss)=D[v] | |
56 D[v]=(rr,pp,(jj,k),ss) | |
57 else: | |
58 D[v]=kk | |
59 if D: | |
60 for kk in D.values(): | |
61 res[kk]=res.get(kk,0)+1 | |
62 D.clear() | |
63 elif isinstance(o,(list,tuple)): | |
52 for i in o: | 64 for i in o: |
53 walk(i,f,r,path) | 65 walk(i,f,r,path) |
54 | 66 |
55 def pp(v,k,p,r): | 67 def pp(v,k,p,r,parent=None): |
56 '''Uses nested dictionaries''' | 68 '''Handle a leaf value v, with key k in parent, under path p from r |
69 Uses nested dictionaries''' | |
57 if isinstance(v,str): | 70 if isinstance(v,str): |
58 m=SCHEME.match(v) | 71 m=SCHEME.match(v) |
59 if m is not None: | 72 if m is not None: |
60 n=URN.match(v) | 73 n=URN.match(v) |
61 if n is not None: | 74 if n is not None: |
68 p=p[1] | 81 p=p[1] |
69 d=res[r].setdefault(p,dict()) | 82 d=res[r].setdefault(p,dict()) |
70 d=d.setdefault(k,dict()) | 83 d=d.setdefault(k,dict()) |
71 d[s]=d.get(s,0)+1 | 84 d[s]=d.get(s,0)+1 |
72 | 85 |
73 def pp_tuple(v,k,p,r): | 86 def pp_tuple(v,k,p,r,parent=None): |
74 '''Uses one dict and 4-tuple''' | 87 '''Handle a leaf value v, with key k in parent, under path p from r |
88 Uses one dict and 4-tuple''' | |
75 if isinstance(v,str): | 89 if isinstance(v,str): |
76 m=SCHEME.match(v) | 90 m=SCHEME.match(v) |
77 if m is not None: | 91 if m is not None: |
78 n=URN.match(v) | 92 n=URN.match(v) |
79 if n is not None: | 93 if n is not None: |
82 # The following assumes paths are always either length 1 or length 2!!! | 96 # The following assumes paths are always either length 1 or length 2!!! |
83 # by open-coding rather than using qq(p) | 97 # by open-coding rather than using qq(p) |
84 if p is not None: | 98 if p is not None: |
85 assert p[0] is None | 99 assert p[0] is None |
86 p=p[1] | 100 p=p[1] |
87 k=(r,p,k,s) | 101 if parent is None: |
88 res[k]=res.get(k,0)+1 | 102 res[kk]=res.get(kk,0)+1 |
103 else: | |
104 return (r,p,k,s) | |
105 | |
89 | 106 |
90 SEP='\x00' | 107 SEP='\x00' |
91 DOT='.' | 108 DOT='.' |
92 | 109 |
93 def pp_concat(v,k,p,r): | 110 def pp_concat(v,k,p,r,parent=None): |
94 '''Uses one dict and one string''' | 111 '''Handle a leaf value v, with key k in parent, under path p from r |
112 Uses one dict and one string''' | |
95 if isinstance(v,str): | 113 if isinstance(v,str): |
96 m=SCHEME.match(v) | 114 m=SCHEME.match(v) |
97 if m is not None: | 115 if m is not None: |
98 n=URN.match(v) | 116 n=URN.match(v) |
99 if n is not None: | 117 if n is not None: |
131 # by open-coding rather than using qq(p) | 149 # by open-coding rather than using qq(p) |
132 if p is None: | 150 if p is None: |
133 print(EMPTY,end='\t') | 151 print(EMPTY,end='\t') |
134 else: | 152 else: |
135 print(DOT,p,sep=EMPTY,end='\t') | 153 print(DOT,p,sep=EMPTY,end='\t') |
154 while isinstance(k,tuple): | |
155 print(k[1],end='&') | |
156 k=k[0] | |
136 print(k,end='\t') | 157 print(k,end='\t') |
137 print(s,c,sep='\t') | 158 print(s,c,sep='\t') |
138 | 159 |
139 def dump_concat(res): | 160 def dump_concat(res): |
140 for ks,c in res.items(): | 161 for ks,c in res.items(): |
150 print(s,c,sep='\t') | 171 print(s,c,sep='\t') |
151 | 172 |
152 if len(sys.argv)==2: | 173 if len(sys.argv)==2: |
153 res=dict() | 174 res=dict() |
154 if sys.argv[1]=='1': | 175 if sys.argv[1]=='1': |
176 print('using tuple',file=sys.stderr) | |
155 pp=pp_tuple | 177 pp=pp_tuple |
156 dump=dump_tuple | 178 dump=dump_tuple |
157 else: | 179 else: |
180 print('using concat',file=sys.stderr) | |
158 pp=pp_concat | 181 pp=pp_concat |
159 dump=dump_concat | 182 dump=dump_concat |
160 else: | 183 else: |
184 print('using nested',file=sys.stderr) | |
161 res=dict((r,dict()) for r in PATHS.keys()) | 185 res=dict((r,dict()) for r in PATHS.keys()) |
162 | 186 |
163 def main(): | 187 def main(): |
164 global n # for debugging | 188 global n # for debugging |
165 n=0 | 189 n=0 |