diff master/src/wecu/sac_schemes.py @ 63:d46c8b12fc04

support multiple approaches to key combination, use local files to collect results
author Henry S. Thompson <ht@markup.co.uk>
date Wed, 03 Jun 2020 16:40:34 +0000
parents 892e1c0240e1
children b04870ab3035
line wrap: on
line diff
--- a/master/src/wecu/sac_schemes.py	Tue Jun 02 17:35:07 2020 +0000
+++ b/master/src/wecu/sac_schemes.py	Wed Jun 03 16:40:34 2020 +0000
@@ -1,9 +1,22 @@
 #!/usr/bin/python3
-'''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary'''
+'''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary
+
+Usage: uz ...wat.gz | sac_schemes.py [-d] [altStorageScheme]
+
+where altStorageScheme if present selects an alternative approach to storing triple counts:
+  [absent]: three nested dictionaries
+         1: one dictionary indexed by 4-tuple
+         2: one dictionary indexed by SEP.join(keys), SEP being the NUL character (any value other than 1 selects this)'''
 
 import sys, json, regex
 from collections.abc import Iterable
 
+if len(sys.argv)>1 and sys.argv[1]=='-d':  # optional leading -d flag
+  sys.argv.pop(1)  # remove it so that any altStorageScheme argument becomes sys.argv[1]
+  dictRes=True  # main() then pprints res after 'res=' instead of calling dump(res)
+else:
+  dictRes=False
+
 META_PATH=['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata']
 
 PATHS={'hdr':['Headers'],
@@ -13,6 +26,8 @@
 SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
 URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I)
 
+EMPTY=''  # used as print() end/sep value, and as the "no path" marker in pp_concat/dump_concat
+
 def walk(o,f,r,path=None):
   '''Apply f to every key+leaf of a json object in region r'''
   if isinstance(o,dict):
@@ -38,6 +53,45 @@
       walk(i,f,r,path)
 
 def pp(v,k,p,r):  # counts the scheme prefix of v in the global res as res[r][p][k][s]
+  '''Uses nested dictionaries'''
+  if isinstance(v,str):  # non-string values are ignored
+    m=SCHEME.match(v)
+    if m is not None:
+      n=URN.match(v)  # when URN also matches, its match is used in place of the SCHEME one
+      if n is not None:
+        m=n
+      s=m.group(1)  # the matched prefix without its trailing ':'
+      # Open-coded rather than calling qq(p): assumes paths are always length 1 or 2,
+      #  so p is either None or has None at p[0] and the one path element at p[1]
+      if p is not None:
+        assert p[0] is None
+        p=p[1]
+      d=res[r].setdefault(p,dict())  # p is still None here when there was no path
+      d=d.setdefault(k,dict())
+      d[s]=d.get(s,0)+1  # one more occurrence of s under (r,p,k)
+
+def pp_tuple(v,k,p,r):  # counts the scheme prefix of v in the flat global res under the key (r,p,k,s)
+  '''Uses one dict and 4-tuple'''
+  if isinstance(v,str):  # non-string values are ignored
+    m=SCHEME.match(v)
+    if m is not None:
+      n=URN.match(v)  # when URN also matches, its match is used in place of the SCHEME one
+      if n is not None:
+        m=n
+      s=m.group(1)  # the matched prefix without its trailing ':'
+      # Open-coded rather than calling qq(p): assumes paths are always length 1 or 2,
+      #  so p is either None or has None at p[0] and the one path element at p[1]
+      if p is not None:
+        assert p[0] is None
+        p=p[1]
+      k=(r,p,k,s)  # p stays None in the key when there was no path; dump_tuple tests for that
+      res[k]=res.get(k,0)+1
+
+SEP='\x00'  # joins (r,p,k,s) into one key in pp_concat; dump_concat splits on it, so no field may contain NUL
+DOT='.'  # printed between region and path element by dump_tuple
+
+def pp_concat(v,k,p,r):
+  '''Uses one dict and one string'''
   if isinstance(v,str):
     m=SCHEME.match(v)
     if m is not None:
@@ -45,14 +99,70 @@
       if n is not None:
         m=n
       s=m.group(1)
-      d=res[r].setdefault(p,dict())
-      d=d.setdefault(k,dict())
-      d[s]=d.get(s,0)+1
+      # The following assumes paths are always either length 1 or length 2!!!
+      #  by open-coding rather than using qq(p)
+      if p is None:
+        p=EMPTY
+      else:
+        assert p[0] is None
+        p=p[1]
+      k=SEP.join((r,p,k,s))
+      res[k]=res.get(k,0)+1
+
+def dump(res):  # prints one line per count in the nested res[r][p][k][s]: r[.p] TAB k TAB s TAB count
+  for r in res.keys():
+    rv=res[r]
+    for p in rv.keys():
+      pv=rv[p]
+      for k,v in pv.items():
+        for s,c in v.items():
+          print(r,end=EMPTY)  # region name, nothing after it yet
+          if p is None:
+            print(EMPTY,end='\t')  # no path element: just the tab
+          else:
+            print('.',p,sep=EMPTY,end='\t')  # '.' + path element, then the tab
+          print(k,end='\t')
+          print(s,c,sep='\t')  # scheme and count end the line
+
+def dump_tuple(res):  # prints one line per (r,p,k,s) key of the flat res: r[.p] TAB k TAB s TAB count
+  for (r,p,k,s),c in res.items():
+    print(r,end=EMPTY)  # region name, nothing after it yet
+    # p was already reduced by pp_tuple (which assumes paths are always length 1 or 2):
+    #  it is either None or the single path element, so qq(p) is not needed here
+    if p is None:
+      print(EMPTY,end='\t')  # no path element: just the tab
+    else:
+      print(DOT,p,sep=EMPTY,end='\t')  # '.' + path element, then the tab
+    print(k,end='\t')
+    print(s,c,sep='\t')  # scheme and count end the line
+
+def dump_concat(res):  # prints one line per SEP-joined key of the flat res: r[.p] TAB k TAB s TAB count
+  for ks,c in res.items():
+    (r,p,k,s)=ks.split(SEP)  # raises ValueError unless the key has exactly four SEP-separated fields
+    print(r,end=EMPTY)  # region name, nothing after it yet
+    # p was already reduced by pp_concat (which assumes paths are always length 1 or 2):
+    #  it is either EMPTY (no path) or the single path element, so qq(p) is not needed here
+    if p==EMPTY:
+      print(EMPTY,end='\t')  # no path element: just the tab
+    else:
+      print('.',p,sep=EMPTY,end='\t')  # '.' + path element, then the tab
+    print(k,end='\t')
+    print(s,c,sep='\t')  # scheme and count end the line
+
+if len(sys.argv)==2:  # exactly one argument remains (any leading -d was popped earlier): alternative storage
+  res=dict()  # both alternatives use a single flat dict
+  if sys.argv[1]=='1':
+    pp=pp_tuple 
+    dump=dump_tuple  # pp and dump are rebound to the 4-tuple variants
+  else:  # any argument other than '1'
+    pp=pp_concat
+    dump=dump_concat
+else:
+  res=dict((r,dict()) for r in PATHS.keys())  # nested scheme: one empty dict per key of PATHS
 
 def main():
-  global n,res # for debugging
+  global n # for debugging
   n=0
-  res=dict((r,dict()) for r in PATHS.keys())
   for l in sys.stdin:
     if l[0]=='{' and '"WARC-Type":"response"' in l:
       j=json.loads(l)
@@ -70,22 +180,12 @@
 
   print(n,file=sys.stderr)
 
-  for r in res.keys():
-    rv=res[r]
-    for p in rv.keys():
-      pv=rv[p]
-      for k,v in pv.items():
-        for s,c in v.items():
-          print(r,end='')
-          # The following assumes paths are always either length 1 or length 2!!!
-          #  by open-coding rather than using qq(p)
-          if p is None:
-            print('',end='\t')
-          else:
-            assert p[0] is None
-            print('.',p[1],sep='',end='\t')
-          print(k,end='\t')
-          print(s,c,sep='\t')
+  if dictRes:
+    print('res=',end=EMPTY)
+    from pprint import pprint
+    pprint(res)
+  else:
+    dump(res)
 
 def qq(p):
   if p is None: