comparison master/src/wecu/sac_schemes.py @ 66:b04870ab3035

don't over-count duplicate URIs in multiple properties, produce composite keys instead
author Henry S. Thompson <ht@markup.co.uk>
date Thu, 04 Jun 2020 16:10:55 +0000
parents d46c8b12fc04
children 13182e98a1ab
comparison
equal deleted inserted replaced
65:e1f61f94b196 66:b04870ab3035
7 [absent]: three nested dictionaries 7 [absent]: three nested dictionaries
8 1: one dictionary indexed by 4-tuple 8 1: one dictionary indexed by 4-tuple
9 2: one dictionary indexed by ".".join(keys)''' 9 2: one dictionary indexed by ".".join(keys)'''
10 10
11 import sys, json, regex 11 import sys, json, regex
12 from collections.abc import Iterable
13 12
14 if len(sys.argv)>1 and sys.argv[1]=='-d': 13 if len(sys.argv)>1 and sys.argv[1]=='-d':
15 sys.argv.pop(1) 14 sys.argv.pop(1)
16 dictRes=True 15 dictRes=True
17 else: 16 else:
26 SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):') 25 SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
27 URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I) 26 URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I)
28 27
29 EMPTY='' 28 EMPTY=''
30 29
30 D={}
31
31 def walk(o,f,r,path=None): 32 def walk(o,f,r,path=None):
32 '''Apply f to every key+leaf of a json object in region r''' 33 '''Apply f to every key+leaf of a json object reached via p in region r'''
33 if isinstance(o,dict): 34 if isinstance(o,dict):
34 for k,v in o.items(): 35 for k,v in o.items():
35 if isinstance(v,dict): 36 if isinstance(v,dict):
36 walk(v,f,r,(path,k)) 37 walk(v,f,r,(path,k))
37 elif isinstance(v,Iterable): 38 elif isinstance(v,(list,tuple)):
38 walked=False 39 walked=False
39 for i in v: 40 for i in v:
40 if isinstance(i,dict): 41 if isinstance(i,dict):
41 if (not walked) and (i is not v[0]): 42 if (not walked) and (i is not v[0]):
42 print('oops',key,path,k,i,file=sys.stderr) 43 print('oops',key,path,k,i,file=sys.stderr)
45 elif walked: 46 elif walked:
46 print('oops2',key,path,k,i,file=sys.stderr) 47 print('oops2',key,path,k,i,file=sys.stderr)
47 if not walked: 48 if not walked:
48 f(v,k,path,r) 49 f(v,k,path,r)
49 else: 50 else:
50 f(v,k,path,r) 51 kk=f(v,k,path,r,o)
51 elif isinstance(o,Iterable): 52 if kk is not None:
53 #print(v,D,kk,file=sys.stderr)
54 if v in D:
55 (rr,pp,jj,ss)=D[v]
56 D[v]=(rr,pp,(jj,k),ss)
57 else:
58 D[v]=kk
59 if D:
60 for kk in D.values():
61 res[kk]=res.get(kk,0)+1
62 D.clear()
63 elif isinstance(o,(list,tuple)):
52 for i in o: 64 for i in o:
53 walk(i,f,r,path) 65 walk(i,f,r,path)
54 66
55 def pp(v,k,p,r): 67 def pp(v,k,p,r,parent=None):
56 '''Uses nested dictionaries''' 68 '''Handle a leaf value v, with key k in parent, under path p from r
69 Uses nested dictionaries'''
57 if isinstance(v,str): 70 if isinstance(v,str):
58 m=SCHEME.match(v) 71 m=SCHEME.match(v)
59 if m is not None: 72 if m is not None:
60 n=URN.match(v) 73 n=URN.match(v)
61 if n is not None: 74 if n is not None:
68 p=p[1] 81 p=p[1]
69 d=res[r].setdefault(p,dict()) 82 d=res[r].setdefault(p,dict())
70 d=d.setdefault(k,dict()) 83 d=d.setdefault(k,dict())
71 d[s]=d.get(s,0)+1 84 d[s]=d.get(s,0)+1
72 85
73 def pp_tuple(v,k,p,r): 86 def pp_tuple(v,k,p,r,parent=None):
74 '''Uses one dict and 4-tuple''' 87 '''Handle a leaf value v, with key k in parent, under path p from r
88 Uses one dict and 4-tuple'''
75 if isinstance(v,str): 89 if isinstance(v,str):
76 m=SCHEME.match(v) 90 m=SCHEME.match(v)
77 if m is not None: 91 if m is not None:
78 n=URN.match(v) 92 n=URN.match(v)
79 if n is not None: 93 if n is not None:
82 # The following assumes paths are always either length 1 or length 2!!! 96 # The following assumes paths are always either length 1 or length 2!!!
83 # by open-coding rather than using qq(p) 97 # by open-coding rather than using qq(p)
84 if p is not None: 98 if p is not None:
85 assert p[0] is None 99 assert p[0] is None
86 p=p[1] 100 p=p[1]
87 k=(r,p,k,s) 101 if parent is None:
88 res[k]=res.get(k,0)+1 102 res[kk]=res.get(kk,0)+1
103 else:
104 return (r,p,k,s)
105
89 106
90 SEP='\x00' 107 SEP='\x00'
91 DOT='.' 108 DOT='.'
92 109
93 def pp_concat(v,k,p,r): 110 def pp_concat(v,k,p,r,parent=None):
94 '''Uses one dict and one string''' 111 '''Handle a leaf value v, with key k in parent, under path p from r
112 Uses one dict and one string'''
95 if isinstance(v,str): 113 if isinstance(v,str):
96 m=SCHEME.match(v) 114 m=SCHEME.match(v)
97 if m is not None: 115 if m is not None:
98 n=URN.match(v) 116 n=URN.match(v)
99 if n is not None: 117 if n is not None:
131 # by open-coding rather than using qq(p) 149 # by open-coding rather than using qq(p)
132 if p is None: 150 if p is None:
133 print(EMPTY,end='\t') 151 print(EMPTY,end='\t')
134 else: 152 else:
135 print(DOT,p,sep=EMPTY,end='\t') 153 print(DOT,p,sep=EMPTY,end='\t')
154 while isinstance(k,tuple):
155 print(k[1],end='&')
156 k=k[0]
136 print(k,end='\t') 157 print(k,end='\t')
137 print(s,c,sep='\t') 158 print(s,c,sep='\t')
138 159
139 def dump_concat(res): 160 def dump_concat(res):
140 for ks,c in res.items(): 161 for ks,c in res.items():
150 print(s,c,sep='\t') 171 print(s,c,sep='\t')
151 172
152 if len(sys.argv)==2: 173 if len(sys.argv)==2:
153 res=dict() 174 res=dict()
154 if sys.argv[1]=='1': 175 if sys.argv[1]=='1':
176 print('using tuple',file=sys.stderr)
155 pp=pp_tuple 177 pp=pp_tuple
156 dump=dump_tuple 178 dump=dump_tuple
157 else: 179 else:
180 print('using concat',file=sys.stderr)
158 pp=pp_concat 181 pp=pp_concat
159 dump=dump_concat 182 dump=dump_concat
160 else: 183 else:
184 print('using nested',file=sys.stderr)
161 res=dict((r,dict()) for r in PATHS.keys()) 185 res=dict((r,dict()) for r in PATHS.keys())
162 186
163 def main(): 187 def main():
164 global n # for debugging 188 global n # for debugging
165 n=0 189 n=0