diff master/src/wecu/sac_schemes.py @ 61:cfaf5223b071

trying to get my own mapper working
author Henry S. Thompson <ht@markup.co.uk>
date Sun, 31 May 2020 12:06:44 +0000
parents
children 892e1c0240e1
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/master/src/wecu/sac_schemes.py	Sun May 31 12:06:44 2020 +0000
@@ -0,0 +1,70 @@
+#!/usr/bin/python3
+'''Assumes export PYTHONIOENCODING=utf-8 has been done if necessary'''
+
+import sys, json, regex
+from collections.abc import Iterable
+
+META_PATH=['Envelope', 'Payload-Metadata', 'HTTP-Response-Metadata']
+
+PATHS={'hdr':['Headers'],
+       'head':['HTML-Metadata','Head'],
+       'body':['HTML-Metadata','Links']}
+
+SCHEME=regex.compile('(<?[a-zA-Z][a-zA-Z0-9+.-]*):')
+URN=regex.compile('(<?urn:[a-z][a-z0-9+.-]*):',regex.I)
+
+def walk(o,f,path=""):
+  '''Apply f(key,value,dotted path) to every key+leaf of a json object; a str is one leaf and is never iterated'''
+  if isinstance(o,dict):
+    for k,v in o.items():
+      if isinstance(v,dict):
+        walk(v,f,"%s.%s"%(path,k))
+      elif isinstance(v,Iterable) and not isinstance(v,str):
+        walked=False  # True once a dict element of v has been descended into
+        for i in v:
+          if isinstance(i,dict):
+            if (not walked) and (i is not v[0]):
+              print('oops',path,k,i,file=sys.stderr)  # first dict is not the first element
+            walked=True
+            walk(i,f,"%s.%s"%(path,k))  # every element shares the key's path, no index is added
+          elif walked:
+            print('oops2',path,k,i,file=sys.stderr)  # non-dict element after a dict
+        if not walked:
+          f(k,v,"%s.%s"%(path,k))  # no dict inside, so the whole value is one leaf
+      else:
+        f(k,v,"%s.%s"%(path,k))
+  elif isinstance(o,Iterable) and not isinstance(o,str):  # iterating a str recursed without end (its characters are strs too)
+    for i in o:
+      walk(i,f,path)
+
+def pp(k,v,p):
+  '''Print path p and the scheme (SCHEME group 1) that string v starts with, tab-separated; k is unused'''
+  if not isinstance(v,str):
+    return
+  found=SCHEME.match(v)
+  if found is None:
+    return
+  # only the text before the first newline is looked at for the urn check
+  first_line=v.partition('\n')[0]
+  urn=URN.match(first_line)
+  if urn is not None:
+    found=urn  # report the longer urn:<name> prefix instead of plain urn
+  print(p,found.group(1),sep='\t')
+
+# For each json line on stdin marked as a response record, report the schemes found under each of PATHS
+for count,line in enumerate(sys.stdin,1):
+  if count%1000==0:
+    print(count//1000,file=sys.stderr)  # progress: thousands of lines read so far
+  if not (line.startswith('{') and '"WARC-Type":"response"' in line):
+    continue
+  meta=json.loads(line)
+  for step in META_PATH:
+    meta=meta[step]
+  for label,steps in PATHS.items():
+    node=meta
+    try:
+      for step in steps:
+        node=node[step]
+    except KeyError:
+      continue  # this record lacks that section
+    walk(node,pp,label)