changeset 66:b04870ab3035

don't over-count duplicate URIs in multiple properties, produce composite keys instead
author Henry S. Thompson <ht@markup.co.uk>
date Thu, 04 Jun 2020 16:10:55 +0000
parents e1f61f94b196
children 13182e98a1ab
files master/src/wecu/sac_schemes.py
diffstat 1 files changed, 37 insertions(+), 13 deletions(-) [+]
line wrap: on
line diff
--- a/master/src/wecu/sac_schemes.py	Thu Jun 04 12:08:29 2020 +0000
+++ b/master/src/wecu/sac_schemes.py	Thu Jun 04 16:10:55 2020 +0000
@@ -9,7 +9,6 @@
          2: one dictionary indexed by ".".join(keys)'''
 
 import sys, json, regex
-from collections.abc import Iterable
 
 if len(sys.argv)>1 and sys.argv[1]=='-d':
   sys.argv.pop(1)
@@ -28,13 +27,15 @@
 
 EMPTY=''
 
+D={}
+
 def walk(o,f,r,path=None):
-  '''Apply f to every key+leaf of a json object in region r'''
+  '''Apply f to every key+leaf of a json object reached via path in region r'''
   if isinstance(o,dict):
     for k,v in o.items():
       if isinstance(v,dict):
         walk(v,f,r,(path,k))
-      elif isinstance(v,Iterable):
+      elif isinstance(v,(list,tuple)):
         walked=False
         for i in v:
           if isinstance(i,dict):
@@ -47,13 +48,25 @@
         if not walked:
           f(v,k,path,r)
       else:
-        f(v,k,path,r)
-  elif isinstance(o,Iterable):
+        kk=f(v,k,path,r,o)
+        if kk is not None:
+          #print(v,D,kk,file=sys.stderr)
+          if v in D:
+            (rr,pp,jj,ss)=D[v]
+            D[v]=(rr,pp,(jj,k),ss)
+          else:
+            D[v]=kk
+    if D:
+      for kk in D.values():
+        res[kk]=res.get(kk,0)+1
+      D.clear()
+  elif isinstance(o,(list,tuple)):
     for i in o:
       walk(i,f,r,path)
 
-def pp(v,k,p,r):
-  '''Uses nested dictionaries'''
+def pp(v,k,p,r,parent=None):
+  '''Handle a leaf value v, with key k in parent, under path p from r
+  Uses nested dictionaries'''
   if isinstance(v,str):
     m=SCHEME.match(v)
     if m is not None:
@@ -70,8 +83,9 @@
       d=d.setdefault(k,dict())
       d[s]=d.get(s,0)+1
 
-def pp_tuple(v,k,p,r):
-  '''Uses one dict and 4-tuple'''
+def pp_tuple(v,k,p,r,parent=None):
+  '''Handle a leaf value v, with key k in parent, under path p from r
+  Uses one dict and 4-tuple'''
   if isinstance(v,str):
     m=SCHEME.match(v)
     if m is not None:
@@ -84,14 +98,18 @@
       if p is not None:
         assert p[0] is None
         p=p[1]
-      k=(r,p,k,s)
-      res[k]=res.get(k,0)+1
+      if parent is None:
+        res[kk]=res.get(kk,0)+1
+      else:
+        return (r,p,k,s)
+
 
 SEP='\x00'
 DOT='.'
 
-def pp_concat(v,k,p,r):
-  '''Uses one dict and one string'''
+def pp_concat(v,k,p,r,parent=None):
+  '''Handle a leaf value v, with key k in parent, under path p from r
+  Uses one dict and one string'''
   if isinstance(v,str):
     m=SCHEME.match(v)
     if m is not None:
@@ -133,6 +151,9 @@
       print(EMPTY,end='\t')
     else:
       print(DOT,p,sep=EMPTY,end='\t')
+    while isinstance(k,tuple):
+      print(k[1],end='&')
+      k=k[0]
     print(k,end='\t')
     print(s,c,sep='\t')
 
@@ -152,12 +173,15 @@
 if len(sys.argv)==2:
   res=dict()
   if sys.argv[1]=='1':
+    print('using tuple',file=sys.stderr)
     pp=pp_tuple 
     dump=dump_tuple
   else:
+    print('using concat',file=sys.stderr)
     pp=pp_concat
     dump=dump_concat
 else:
+  print('using nested',file=sys.stderr)
   res=dict((r,dict()) for r in PATHS.keys())
 
 def main():