changeset 62:11cbaee8bbc8

Test 2 works with parts=1,2,3. Tests 3 and 4 work. Test 1 works with parts=1, and gives correct output for warcinfo and metadata with parts=1,2,3.
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 14 Jul 2023 17:38:54 +0100
parents f182d09ad1cd
children 9837840f3328
files bin/warc.py
diffstat 1 files changed, 52 insertions(+), 54 deletions(-) [+]
line wrap: on
line diff
--- a/bin/warc.py	Fri Jul 14 12:08:09 2023 +0100
+++ b/bin/warc.py	Fri Jul 14 17:38:54 2023 +0100
@@ -6,11 +6,17 @@
 import sys,io
 from isal import igzip
 
+RESP = b'response'
+REQ = b'request'
+META = b'metadata'
+INFO = b'warcinfo'
+
 def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False):
   '''parts is a bit-mask:
      1 for warc header;
      2 for req/resp HTTP header, warcinfo/metadata features;
      4 for req/resp body'''
+  # TODO: sanity-check that the requested parts make sense for the requested types
   types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types]
   nb=0
   if filename.endswith(".gz"):
@@ -52,13 +58,13 @@
           tr=bytes(bufView[bp+16:eol-2])
       elif buf.startswith(b'WARC-Type: ',bp):
         if buf.startswith(b's',bp+13):
-          wtype = b'response'
+          wtype = RESP
         elif buf.startswith(b'q',bp+13):
-          wtype = b'request'
+          wtype = REQ
         elif buf.startswith(b'm',bp+11):
-          wtype = b'metadata'
+          wtype = META
         elif buf.startswith(b'w',bp+11):
-          wtype = b'warcinfo'
+          wtype = INFO
         else:
           raise ValueError("Unknown WARC-Type: %s at %s"%(
                              bytes(bufView[bp+11:eol-2]),
@@ -101,6 +107,7 @@
       if wtype not in types:
         continue
     if (wtype in types):
+      # Output whole or part 1 as required
       if whole:
         bp+=length
         OUT=callback(wtype,bufView[start_1:bp],7)
@@ -108,56 +115,47 @@
       elif (parts & 1):
         OUT=callback(wtype,bufView[start_1:eol],1)
       if parts!=1:
-        bv=bufView[start_2:start_2+length]
-        ii=0
-        while True and not stream.closed:
-          if (i:=stream.readinto(bv))==0:
-            break
-          ii+=i
-          if ii>=length:
-            break
-          bv=memoryview(buf)[start_2+ii:start_2+length]
-        if ii!=length:
-          raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length))
-        nb+=length
-        if wtype in types:
-          if whole:
-            callback(wtype,bufView[0:start_2+length],7)
-            continue
-          # Only output parts (1 = WARC header, 2 = HTTP header, 4 = body) that are wanted
-          bl=None # for HTTP Content-Length for the length of the body?
-          L_start=start_2
-          state=2
-          bv=memoryview(buf)[start_2:start_2+length]
-          with io.BytesIO(bv) as rec_text:
-            for L in rec_text:
-              if state==2:
-                # HTTP header
-                wl -= len(L)
-                if not (L==b"" or L.startswith(b"\r")):
-                  # Non-empty, it's (a continuation of) a header
-                  if bl is None and L.startswith(b"Content-Length: "):
-                    bl=int(L[16:].rstrip())
-                else:
-                  # Blank line, HTTP header is finished
-                  if parts & 2:
-                    callback(wtype,bufView[start_2:start_2+L_start],2)
-                  state=4
-                  # The above is just for sanity, because we do _not_
-                  #  continue with the outer loop,
-                  #  since we can now block-output the entire rest of the
-                  #  input buffer.
-                  if bl is not None:
-                    if bl!=wl:
-                      print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
-                            (length,offset,filename,wl,bl,tr),file=sys.stderr)
-                  # HTTP body
-                  balance=start_2+rec_text.tell()
-                  #print(balance,bl,wl,ll,ll-balance,file=sys.stderr)
-                  # Output whatever is left
-                  if parts & 4:
-                    callback(wtype,bufView[balance:balance+wl],4)
-                  state=1
+        while buf.startswith(b'\r\n',bp):
+          bp+=2
+        start_2=bp
+        eob=bp+length
+        while buf.startswith(b'\r\n',eob-2):
+          eob-=2
+        bv=bufView[start_2:eob]
+        # Only output parts (2 = HTTP header, 4 = body) that are wanted
+        if parts & 2:
+          if wtype is META or wtype is INFO:
+            # metadata/warcinfo: the whole remaining payload (CRLF-trimmed) is emitted as part 2
+            OUT=callback(wtype,bv,2)
+        if parts & 4:
+          for L in rec_text:
+            if state==2:
+              # HTTP header
+              wl -= len(L)
+              if not (L==b"" or L.startswith(b"\r")):
+                # Non-empty, it's (a continuation of) a header
+                if bl is None and L.startswith(b"Content-Length: "):
+                  bl=int(L[16:].rstrip())
+              else:
+                # Blank line, HTTP header is finished
+                if parts & 2:
+                  callback(wtype,bufView[start_2:start_2+L_start],2)
+                state=4
+                # The above is just for sanity, because we do _not_
+                #  continue with the outer loop,
+                #  since we can now block-output the entire rest of the
+                #  input buffer.
+                if bl is not None:
+                  if bl!=wl:
+                    print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
+                          (length,offset,filename,wl,bl,tr),file=sys.stderr)
+                # HTTP body
+                balance=start_2+rec_text.tell()
+                #print(balance,bl,wl,ll,ll-balance,file=sys.stderr)
+                # Output whatever is left
+                if parts & 4:
+                  callback(wtype,bufView[balance:balance+wl],4)
+                state=1
 
               L_start=rec_text.tell()
     bp+=length