view master/src/wecu/sac_schemes.py @ 61:cfaf5223b071

trying to get my own mapper working
author Henry S. Thompson <ht@markup.co.uk>
date Sun, 31 May 2020 12:06:44 +0000
parents
children 892e1c0240e1
line wrap: on
line source

#!/usr/bin/python3
'''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary'''

import sys, json, regex
from collections.abc import Iterable

# Keys followed, in order, from each parsed JSON record down to the object
# that holds the HTTP response metadata.
META_PATH=['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata']

# Label -> key path (relative to the object reached via META_PATH) of each
# sub-tree that is scanned.  The label is also used as the initial path
# prefix in the output.
PATHS={'hdr':['Headers'],
       'head':['HTML-Metadata','Head'],
       'body':['HTML-Metadata','Links']}

# Matched at the start of a string value: an optional '<', then a
# scheme-like token (letter followed by letters, digits, '+', '.', '-'),
# then ':'.  Group 1 is everything before the colon.
SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
# Case-insensitive variant for values starting with 'urn:': group 1 keeps
# 'urn:' plus the following token (e.g. 'urn:isbn'), without the closing ':'.
URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I)

def walk(o,f,path=""):
  '''Apply f to every key+leaf of a json object.

  o    -- a dict, or an iterable of things to walk (anything else,
          including a bare string, is ignored as there is no key for it)
  f    -- callback invoked as f(key, value, dotted_path) for each leaf
  path -- dotted path prefix accumulated so far

  A value is a leaf unless it is a dict, or an iterable containing at
  least one dict.  An iterable with no dicts in it (including an empty
  one) is passed to f as a single leaf value.  Dicts inside an iterable
  are walked with the same path as their parent key.  Iterables which mix
  dicts and non-dicts are reported on stderr ('oops' if the first dict is
  not the first element, 'oops2' for a non-dict after a dict).
  '''
  if isinstance(o,dict):
    for k,v in o.items():
      sub="%s.%s"%(path,k)
      if isinstance(v,dict):
        walk(v,f,sub)
      # Strings are Iterable too, but they are always leaves: skip the
      # pointless per-character scan and fall through to the leaf case.
      elif isinstance(v,Iterable) and not isinstance(v,(str,bytes)):
        walked=False
        for idx,i in enumerate(v):
          if isinstance(i,dict):
            if (not walked) and idx!=0:
              # First dict found, but something else preceded it
              print('oops',path,k,i,file=sys.stderr)
            walked=True
            walk(i,f,sub)
          elif walked:
            # Non-dict following a dict in the same iterable
            print('oops2',path,k,i,file=sys.stderr)
        if not walked:
          f(k,v,sub)
      else:
        f(k,v,sub)
  # A string must not be treated as a container here: a one-character
  # string iterates to itself, so recursing into it never terminates.
  elif isinstance(o,Iterable) and not isinstance(o,(str,bytes)):
    for i in o:
      walk(i,f,path)

def pp(k,v,p):
  '''Leaf callback: report the scheme a string value starts with.

  k -- the leaf's key (not used here)
  v -- the leaf's value; anything other than a str is ignored
  p -- the dotted path of the leaf, printed as the first column

  If v starts with something SCHEME matches, print p and the matched
  prefix, tab-separated, on stdout.  When the first line of v also
  matches URN, the longer URN prefix is printed instead.
  '''
  if not isinstance(v,str):
    return
  scheme=SCHEME.match(v)
  if scheme is None:
    return
  # Only the first line of the value is considered for the URN check
  first_line=v.partition('\n')[0]
  urn=URN.match(first_line)
  hit=scheme if urn is None else urn
  print(p,hit.group(1),sep='\t')

# Read one JSON record per line from stdin; for every response record, walk
# each sub-tree named in PATHS and let pp print what it finds.
for count,line in enumerate(sys.stdin,1):
  if count%1000==0:
    # Progress indicator on stderr: thousands of lines read so far
    print(count//1000,file=sys.stderr)
  # Cheap textual pre-filter so that only response records get parsed
  if not line.startswith('{') or '"WARC-Type":"response"' not in line:
    continue
  meta=json.loads(line)
  for step in META_PATH:
    meta=meta[step]
  for label,steps in PATHS.items():
    target=meta
    try:
      for step in steps:
        target=target[step]
    except KeyError:
      # This record has no such sub-tree: nothing to report for this label
      continue
    walk(target,pp,label)