view master/src/wecu/sac_schemes.py @ 66:b04870ab3035

don't over-count duplicate URIs in multiple properties, produce composite keys instead
author Henry S. Thompson <ht@markup.co.uk>
date Thu, 04 Jun 2020 16:10:55 +0000
parents d46c8b12fc04
children 13182e98a1ab
line wrap: on
line source

#!/usr/bin/python3
'''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary

Usage: uz ...wat.gz | sac_schemes.py [-d] [altStorageScheme]

where altStorageScheme if present selects an alternative approach to storing triple counts:
  [absent]: three nested dictionaries
         1: one dictionary indexed by 4-tuple
         2: one dictionary indexed by SEP.join(keys), where SEP is the NUL character'''

import sys, json, regex

# An optional leading -d flag asks for the raw result dictionary to be
#  pretty-printed instead of the tab-separated dump; once noted it is removed
#  so that the remaining argument (if any) is always sys.argv[1]
dictRes=len(sys.argv)>1 and sys.argv[1]=='-d'
if dictRes:
  sys.argv.pop(1)

# Keys followed, in order, from the top of each JSON record down to the
#  HTTP response metadata (see main)
META_PATH=['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata']

# Region label -> keys followed, in order, from the HTTP response metadata
#  down to the part of the record that is walked for that region
PATHS={'hdr':['Headers'],
       'head':['HTML-Metadata','Head'],
       'body':['HTML-Metadata','Links']}

# Matches, at the start of a string, an optional '<', a letter and any run of
#  letters, digits, '+', '.' or '-', followed by ':'.
#  Group 1 is everything before that ':'
SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
# Same shape, case-insensitive, for 'urn:' plus a following name and ':'.
#  Group 1 is then e.g. 'urn:isbn' rather than just 'urn'
URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I)

# The empty string, used as a print separator/terminator and, in the
#  concatenated-key scheme, as the stand-in for an absent path
EMPTY=''

# Retained only so that the module-level name stays defined: walk() keeps its
#  duplicate-tracking table in a local dictionary, one per object walked
D={}

def walk(o,f,r,path=None):
  '''Apply f to every key+leaf of a json object o reached via path in region r

  o    -- a decoded JSON value: dicts and lists/tuples are descended into,
          anything else at the top level is ignored
  f    -- leaf handler, called as f(value,key,path,r,parent) for a leaf that
          is neither dict nor list, and as f(value,key,path,r) for a
          list/tuple that contains no dict
  r    -- region label, handed to f unchanged
  path -- None for the outermost object, otherwise a (parent_path,key) pair

  If f returns a non-None value for a leaf it must be a 4-tuple
  (region,path,key,scheme).  Such tuples are collected per object, keyed by
  the leaf value: when the same value occurs under several keys of one
  object the key slot becomes a nested pair ((first,second),third)... and the
  value is counted only once.  When the object has been fully processed each
  collected tuple is counted in the global dictionary res.

  The table of collected tuples is local to each object, so walking a nested
  object can neither flush nor extend the entries of the object containing
  it.'''
  if isinstance(o,dict):
    pending={}
    for k,v in o.items():
      if isinstance(v,dict):
        walk(v,f,r,(path,k))
      elif isinstance(v,(list,tuple)):
        # A list is expected to hold either only dicts or no dicts at all;
        #  anything else is reported on stderr but still processed
        walked=False
        for i in v:
          if isinstance(i,dict):
            if (not walked) and (i is not v[0]):
              # first dict found, but something else preceded it
              print('oops',r,path,k,i,file=sys.stderr)
            walked=True
            walk(i,f,r,(path,k))
          elif walked:
            # non-dict item following a dict
            print('oops2',r,path,k,i,file=sys.stderr)
        if not walked:
          f(v,k,path,r)
      else:
        kk=f(v,k,path,r,o)
        if kk is not None:
          if v in pending:
            # same value already seen in this object: extend the key slot
            #  rather than counting the value a second time
            (region,where,keys,scheme)=pending[v]
            pending[v]=(region,where,(keys,k),scheme)
          else:
            pending[v]=kk
    for kk in pending.values():
      res[kk]=res.get(kk,0)+1
  elif isinstance(o,(list,tuple)):
    for i in o:
      walk(i,f,r,path)

def pp(v,k,p,r,parent=None):
  '''Count the scheme of leaf value v in the nested-dictionary result table

  v is the leaf value, k its key, p the path to its parent (None or a
  (None,key) pair) and r the region label.  parent is accepted but not used.
  Values that are not strings, or that do not start with something matching
  SCHEME, are ignored.  Otherwise res[r][path][k][scheme] is incremented,
  creating the intermediate dictionaries on demand.  Returns None.'''
  if not isinstance(v,str):
    return
  found=SCHEME.match(v)
  if found is None:
    return
  # Prefer the longer urn:xxx prefix when there is one
  urn=URN.match(v)
  scheme=(found if urn is None else urn).group(1)
  # Only a top-level path (None) or a one-step path ((None,key)) is
  #  supported, the latter being reduced to its key
  if p is not None:
    assert p[0] is None
    p=p[1]
  by_key=res[r].setdefault(p,dict())
  by_scheme=by_key.setdefault(k,dict())
  by_scheme[scheme]=by_scheme.get(scheme,0)+1

def pp_tuple(v,k,p,r,parent=None):
  '''Handle a leaf value v, with key k in parent, under path p from r
  Uses one dict and 4-tuple

  v      -- the leaf value; anything that is not a string starting with a
            match for SCHEME is ignored
  k      -- the key under which v was found
  p      -- None, or a (None,key) pair naming the containing property
  r      -- region label
  parent -- the object holding v, or None

  With a parent, returns the 4-tuple (r,path,k,scheme) and leaves the
  counting to the caller, which can then merge duplicate values within one
  object.  Without a parent, counts the tuple directly in the global
  dictionary res and returns None.  Returns None for ignored values.'''
  if not isinstance(v,str):
    return None
  m=SCHEME.match(v)
  if m is None:
    return None
  # Prefer the longer urn:xxx prefix when there is one
  u=URN.match(v)
  if u is not None:
    m=u
  s=m.group(1)
  # The following assumes paths are always either None or (None,key),
  #  by open-coding rather than using qq(p)
  if p is not None:
    assert p[0] is None
    p=p[1]
  kk=(r,p,k,s)
  if parent is None:
    # nobody to hand the key back to for merging, so count it here
    res[kk]=res.get(kk,0)+1
    return None
  return kk


# NUL character: joins region, path, key and scheme into a single string key
#  in pp_concat, and splits that key apart again in dump_concat
SEP='\x00'
# Printed between region and path by dump_tuple
DOT='.'

def pp_concat(v,k,p,r,parent=None):
  '''Count the scheme of leaf value v under a single concatenated string key

  v is the leaf value, k its key, p the path to its parent (None or a
  (None,key) pair) and r the region label.  parent is accepted but not used.
  Values that are not strings, or that do not start with something matching
  SCHEME, are ignored.  Otherwise region, path, key and scheme are joined
  with SEP and the count stored under that string in the global dictionary
  res is incremented.  Returns None.'''
  if not isinstance(v,str):
    return
  found=SCHEME.match(v)
  if found is None:
    return
  # Prefer the longer urn:xxx prefix when there is one
  urn=URN.match(v)
  scheme=(found if urn is None else urn).group(1)
  # Only a top-level path (None) or a one-step path ((None,key)) is
  #  supported; an absent path is represented by the empty string
  if p is not None:
    assert p[0] is None
    segment=p[1]
  else:
    segment=EMPTY
  flat=SEP.join((r,segment,k,scheme))
  res[flat]=res.get(flat,0)+1

def dump(res):
  '''Print the nested-dictionary result table as tab-separated lines

  res maps region -> path -> key -> scheme -> count.  Each count gives one
  line: the region, immediately followed by '.' and the path unless the path
  is None, then key, scheme and count, separated by tabs.'''
  for region,by_path in res.items():
    for path,by_key in by_path.items():
      if path is None:
        label=str(region)
      else:
        label=str(region)+'.'+str(path)
      for key,by_scheme in by_key.items():
        for scheme,count in by_scheme.items():
          print(label,key,scheme,count,sep='\t')

def dump_tuple(res):
  '''Print the 4-tuple-keyed result table as tab-separated lines

  res maps (region,path,key,scheme) -> count.  Each entry gives one line: the
  region, immediately followed by '.' and the path unless the path is None,
  then key, scheme and count, separated by tabs.  A key that is a nested pair
  ((a,b),c) is printed unwound from the outside in, joined by '&': c&b&a.'''
  for (region,path,key,scheme),count in res.items():
    # Only None or a single path component is catered for here
    if path is None:
      label=str(region)
    else:
      label=str(region)+DOT+str(path)
    names=[]
    while isinstance(key,tuple):
      names.append(str(key[1]))
      key=key[0]
    names.append(str(key))
    print(label,'&'.join(names),scheme,count,sep='\t')

def dump_concat(res):
  '''Print the string-keyed result table as tab-separated lines

  res maps SEP-joined "region,path,key,scheme" strings -> count.  Each entry
  gives one line: the region, immediately followed by '.' and the path unless
  the path is empty, then key, scheme and count, separated by tabs.'''
  for packed,count in res.items():
    region,path,key,scheme=packed.split(SEP)
    # Only an empty path or a single path component is catered for here
    if path==EMPTY:
      label=region
    else:
      label=region+'.'+path
    print(label,key,scheme,count,sep='\t')

# Choose the storage scheme from the one remaining command-line argument, if
#  any: rebinding pp and dump swaps in the matching handler and printer, and
#  res is created in the shape that handler expects
if len(sys.argv)!=2:
  print('using nested',file=sys.stderr)
  res={region:dict() for region in PATHS}
elif sys.argv[1]=='1':
  res=dict()
  print('using tuple',file=sys.stderr)
  pp,dump=pp_tuple,dump_tuple
else:
  # any argument other than '1' selects the concatenated-key scheme
  res=dict()
  print('using concat',file=sys.stderr)
  pp,dump=pp_concat,dump_concat

def main():
  '''Read JSON records from stdin, tally schemes, and print the result

  Only lines that start with '{' and contain "WARC-Type":"response" are
  parsed.  For each of them the HTTP response metadata is located via
  META_PATH and every region of PATHS present in it is walked with the
  currently selected handler pp.  The number of records processed goes to
  stderr; the tallies go to stdout, pretty-printed as a dictionary if -d was
  given and via the currently selected dump otherwise.'''
  global n # for debugging
  n=0
  for line in sys.stdin:
    if line[0]!='{' or '"WARC-Type":"response"' not in line:
      continue
    meta=json.loads(line)
    n+=1
    for step in META_PATH:
      meta=meta[step]
    for region,steps in PATHS.items():
      node=meta
      try:
        for step in steps:
          node=node[step]
      except KeyError:
        # this record has no such region
        continue
      walk(node,pp,region)

  print(n,file=sys.stderr)

  if not dictRes:
    dump(res)
  else:
    print('res=',end=EMPTY)
    from pprint import pprint
    pprint(res)

def qq(p):
  '''Print path p followed by a tab

  p is None, which prints just the tab, or a (parent_path,key) pair, which
  prints the parent path via qq1 and then the key.'''
  if p is not None:
    parent,last=p[0],p[1]
    qq1(parent)
    print(last,end='\t')
    return
  sys.stdout.write('\t')

def qq1(p):
  '''Print every component of path p, outermost first, each followed by '.'

  p is None, which prints nothing, or a (parent_path,key) pair.'''
  components=[]
  while p is not None:
    components.append(p[1])
    p=p[0]
  for component in reversed(components):
    print(component,end='.')

# Process stdin only when run as a script, not when imported
if __name__=="__main__":
  main()