diff master/src/wecu/sac_schemes.py @ 62:892e1c0240e1

added more robust (I hope) error handling, got reducer working with support for choosing dict or tsv output
author Henry S. Thompson <ht@markup.co.uk>
date Tue, 02 Jun 2020 17:35:07 +0000
parents cfaf5223b071
children d46c8b12fc04
line wrap: on
line diff
--- a/master/src/wecu/sac_schemes.py	Sun May 31 12:06:44 2020 +0000
+++ b/master/src/wecu/sac_schemes.py	Tue Jun 02 17:35:07 2020 +0000
@@ -13,58 +13,93 @@
 SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
 URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I)
 
-def walk(o,f,path=""):
-  '''Apply f to every key+leaf of a json object'''
+def walk(o,f,r,path=None):
+  '''Apply f(leaf,key,path,r) to every key+leaf of a json object in region r; path is None at top level, else nested (parent_path,key) tuples'''
   if isinstance(o,dict):
     for k,v in o.items():
       if isinstance(v,dict):
-        walk(v,f,"%s.%s"%(path,k))
+        walk(v,f,r,(path,k)) # descend, extending the path with this key
       elif isinstance(v,Iterable):
         walked=False
         for i in v:
           if isinstance(i,dict):
             if (not walked) and (i is not v[0]):
-              print('oops',path,k,i,file=sys.stderr)
+              print('oops',r,path,k,i,file=sys.stderr) # dict after non-dict member(s); report region r (was undefined name 'key')
             walked=True
-            walk(i,f,"%s.%s"%(path,k))
+            walk(i,f,r,(path,k))
           elif walked:
-            print('oops2',path,k,i,file=sys.stderr)
+            print('oops2',r,path,k,i,file=sys.stderr) # non-dict member after a dict one; report region r (was undefined name 'key')
         if not walked:
-          f(k,v,"%s.%s"%(path,k))
+          f(v,k,path,r) # no dict members (this includes str values): the whole iterable is the leaf
       else:
-        f(k,v,"%s.%s"%(path,k))
+        f(v,k,path,r) # non-iterable leaf
   elif isinstance(o,Iterable):
     for i in o:
-      walk(i,f,path)
+      walk(i,f,r,path) # members of a bare iterable keep the current path
 
-def pp(k,v,p):
+def pp(v,k,p,r): # walk() callback: tally the scheme prefix of string leaf v in res[r][p][k]
   if isinstance(v,str):
     m=SCHEME.match(v)
     if m is not None:
-      try:
-        n=v.index('\n')
-        v=v[:n]
-      except ValueError:
-        pass
       n=URN.match(v)
       if n is not None:
         m=n
-      print(p,m.group(1),sep='\t')
+      s=m.group(1) # matched prefix without the trailing ':' (the URN match, when there is one, replaces the SCHEME match)
+      d=res[r].setdefault(p,dict()) # res is the module global created by main(): region -> path -> key -> prefix -> count
+      d=d.setdefault(k,dict())
+      d[s]=d.get(s,0)+1 # bump the count for this prefix
+
+def main(): # read JSON lines from stdin, tally scheme prefixes via walk()/pp() into res, then print the tallies as TSV
+  global n,res # for debugging; pp() also reads res
+  n=0
+  res=dict((r,dict()) for r in PATHS.keys()) # one initially empty table per PATHS key (region)
+  for l in sys.stdin:
+    if l[0]=='{' and '"WARC-Type":"response"' in l: # cheap textual filter before parsing
+      j=json.loads(l)
+      n+=1 # number of lines parsed; reported on stderr below
+      for s in META_PATH:
+        j=j[s] # NOTE(review): a key missing here is not caught, unlike the PATHS descent below
+      for k,v in PATHS.items():
+        p=j
+        try:
+          for s in v:
+            p=p[s]
+        except KeyError as e: # this record lacks region k: skip the region
+          continue
+        walk(p,pp,k)
+
+  print(n,file=sys.stderr)
 
-n=0
-for l in sys.stdin:
-  n+=1
-  if n%1000==0:
-    print(int(n/1000),file=sys.stderr)
-  if l[0]=='{' and '"WARC-Type":"response"' in l:
-    j=json.loads(l)
-    for s in META_PATH:
-      j=j[s]
-    for k,v in PATHS.items():
-      p=j
-      try:
-        for s in v:
-          p=p[s]
-      except KeyError:
-        continue
-      walk(p,pp,k)
+  for r in res.keys(): # one output row per (region, path, key, prefix): region[.pathkey] TAB key TAB prefix TAB count
+    rv=res[r]
+    for p in rv.keys():
+      pv=rv[p]
+      for k,v in pv.items():
+        for s,c in v.items():
+          print(r,end='')
+          # The following assumes paths are always either None or (None,key), i.e. at most one level of nesting!!!
+          #  by open-coding rather than using qq(p)
+          if p is None:
+            print('',end='\t')
+          else:
+            assert p[0] is None # deeper paths are not handled by this open-coded output
+            print('.',p[1],sep='',end='\t')
+          print(k,end='\t')
+          print(s,c,sep='\t')
+
+def qq(p): # write path p (None, or nested (parent_path,key) tuples) to stdout as dot-joined keys followed by a tab
+  if p is None:
+    sys.stdout.write('\t') # empty path: just the tab
+  else:
+    qq1(p[0]) # ancestors first, each followed by '.'
+    print(p[1],end='\t') # last key, followed by the tab
+
+def qq1(p): # helper for qq: print every key of path p, outermost first, each followed by '.'
+  if p is None:
+    return # top of the path: nothing to print
+  else:
+    qq1(p[0])
+    print(p[1],end='.')
+
+if __name__=="__main__": # run main() only when executed as a script, not when imported
+  main()