changeset 138:9ea12f7b304b

just barely working
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 23 Jul 2021 16:23:46 +0000
parents bb0153be65b5
children e96d444b0f84
files bin/warc.py
diffstat 1 files changed, 117 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/warc.py	Fri Jul 23 16:23:46 2021 +0000
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+'''Stream a warc format file, invoking a callback on each part.
+Callback can be limited by WARC-Type'''
+import sys
+
+def warc(callback,types=('response',),stream=None,bufsize=128*1024*1024):
+  '''Stream WARC records from STREAM, calling CALLBACK on the wanted ones.
+
+  callback -- called as callback(wtype,body) for each record whose
+              WARC-Type is in TYPES; wtype is the type as bytes and body
+              is a memoryview of the record's Content-Length bytes.  The
+              view aliases a buffer that is reused for the next record,
+              so it must be consumed (or copied) before returning.
+  types    -- WARC-Type values to report, given as bytes or str.
+  stream   -- binary file object providing readline() and readinto();
+              defaults to an unbuffered reader on standard input.
+  bufsize  -- initial size in bytes of the record buffer; a larger
+              buffer is allocated if a record does not fit.
+
+  Returns None at a clean end of input (EOF between records).
+  Raises ValueError if a record does not start with WARC/1.0, has no
+  Content-Length header, or the input ends inside a record.'''
+  if stream is None:
+    stream=open(sys.stdin.fileno(),'rb',0)
+  # WARC-Type values are parsed from the input as bytes, so normalise
+  #  the wanted types to bytes as well
+  wanted=frozenset(t.encode('ascii') if isinstance(t,str) else t
+                   for t in types)
+  buf=bytearray(bufsize)
+  nb=0 # bytes consumed so far, only used in error messages
+  l=b'\r\n' # primes the blank-line skipping loop for the first record
+  while True:
+    # Skip the blank lines which separate records
+    while l==b'\r\n':
+      l=stream.readline()
+      nb+=len(l)
+    if l!=b'WARC/1.0\r\n':
+      if not l:
+        # readline() gives b'' only at EOF: clean end of input
+        return
+      raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l),
+                                                         l.decode('latin-1'),len(l)))
+    wtype=None
+    length=None
+    # WARC header: named fields, terminated by a blank line
+    while l!=b'\r\n':
+      l=stream.readline()
+      if not l:
+        raise ValueError("EOF inside WARC header at %s"%nb)
+      nb+=len(l)
+      if l.startswith(b'WARC-Type: '):
+        wtype = l[11:-2]
+      elif l.startswith(b'Content-Length: '):
+        length = int(l[16:])
+    if length is None:
+      raise ValueError("No Content-Length in WARC header ending at %s"%nb)
+    if length>len(buf):
+      # Record won't fit: switch to a big enough buffer
+      buf=bytearray(length)
+    body=memoryview(buf)[:length]
+    got=0
+    while got<length:
+      # readinto() may deliver less than was asked for, so keep
+      #  filling the remaining tail until the record is complete
+      n=stream.readinto(body[got:])
+      if not n:
+        # EOF (or nothing available) before the record was complete
+        raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,got,length))
+      got+=n
+    nb+=length
+    if wtype in wanted:
+      # Always hand over the whole record, however many reads it took
+      callback(wtype,body)
+
+# Binary writer on standard output, used by showme
+OUT = open(sys.stdout.fileno(), 'wb')
+
+def showme(wtype, buf):
+  '''Callback for warc(): write the record body BUF to OUT.
+  WTYPE is accepted to fit the callback signature but is not used.'''
+  OUT.write(buf)
+
+if __name__ == '__main__':
+  # Run as a script: pass the body of every metadata record to showme.
+  # Guarded so that importing this module to use warc() does not
+  #  start reading standard input.
+  warc(showme,[b'metadata'])