changeset 61:f182d09ad1cd

whole working
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 14 Jul 2023 12:08:09 +0100
parents 7b68c3ebc35a
children 11cbaee8bbc8
files bin/warc.py
diffstat 1 files changed, 60 insertions(+), 56 deletions(-) [+]
line wrap: on
line diff
--- a/bin/warc.py	Thu Jul 13 14:02:02 2023 +0100
+++ b/bin/warc.py	Fri Jul 14 12:08:09 2023 +0100
@@ -7,6 +7,10 @@
 from isal import igzip
 
 def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False):
+  '''parts is a bit-mask:
+     1 for warc header;
+     2 for req/resp HTTP header, warcinfo/metadata features;
+     4 for req/resp body'''
   types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types]
   nb=0
   if filename.endswith(".gz"):
@@ -98,65 +102,65 @@
         continue
     if (wtype in types):
       if whole:
-        pass # buf[bp:(bp:=bp+ln)]=l @fixme
+        bp+=length
+        OUT=callback(wtype,bufView[start_1:bp],7)
+        continue
       elif (parts & 1):
         OUT=callback(wtype,bufView[start_1:eol],1)
       if parts!=1:
-        # everything from bv= goes here
-        pass
+        bv=bufView[start_2:start_2+length]
+        ii=0
+        while True and not stream.closed:
+          if (i:=stream.readinto(bv))==0:
+            break
+          ii+=i
+          if ii>=length:
+            break
+          bv=memoryview(buf)[start_2+ii:start_2+length]
+        if ii!=length:
+          raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length))
+        nb+=length
+        if wtype in types:
+          if whole:
+            callback(wtype,bufView[0:start_2+length],7)
+            continue
+          # Only output parts (1 = WARC header, 2 = HTTP header, 4 = body) that are wanted
+          bl=None # declared body length from the HTTP Content-Length header (None until one is seen)
+          L_start=start_2
+          state=2
+          bv=memoryview(buf)[start_2:start_2+length]
+          with io.BytesIO(bv) as rec_text:
+            for L in rec_text:
+              if state==2:
+                # HTTP header
+                wl -= len(L)
+                if not (L==b"" or L.startswith(b"\r")):
+                  # Non-empty, it's (a continuation of) a header
+                  if bl is None and L.startswith(b"Content-Length: "):
+                    bl=int(L[16:].rstrip())
+                else:
+                  # Blank line, HTTP header is finished
+                  if parts & 2:
+                    callback(wtype,bufView[start_2:start_2+L_start],2)
+                  state=4
+                  # The above is just for sanity, because we do _not_
+                  #  continue with the outer loop,
+                  #  since we can now block-output the entire rest of the
+                  #  input buffer.
+                  if bl is not None:
+                    if bl!=wl:
+                      print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
+                            (length,offset,filename,wl,bl,tr),file=sys.stderr)
+                  # HTTP body
+                  balance=start_2+rec_text.tell()
+                  #print(balance,bl,wl,ll,ll-balance,file=sys.stderr)
+                  # Output whatever is left
+                  if parts & 4:
+                    callback(wtype,bufView[balance:balance+wl],4)
+                  state=1
+
+              L_start=rec_text.tell()
     bp+=length
-    print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr)
+    #print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr)
     #while not buf.startswith(b'\r\n',bp):
     continue
-    bv=memoryview(buf)[start_2:start_2+length]
-    ii=0
-    while True and not stream.closed:
-      if (i:=stream.readinto(bv))==0:
-        break
-      ii+=i
-      if ii>=length:
-        break
-      bv=memoryview(buf)[start_2+ii:start_2+length]
-    if ii!=length:
-      raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length))
-    nb+=length
-    if wtype in types:
-      if whole:
-        callback(wtype,bufView[0:start_2+length],7)
-        continue
-      # Only output parts (1 = WARC header, 2 = HTTP header, 4 = body) that are wanted
-      bl=None # for HTTP Content-Length for the length of the body?
-      L_start=start_2
-      state=2
-      bv=memoryview(buf)[start_2:start_2+length]
-      with io.BytesIO(bv) as rec_text:
-        for L in rec_text:
-          if state==2:
-            # HTTP header
-            wl -= len(L)
-            if not (L==b"" or L.startswith(b"\r")):
-              # Non-empty, it's (a continuation of) a header
-              if bl is None and L.startswith(b"Content-Length: "):
-                bl=int(L[16:].rstrip())
-            else:
-              # Blank line, HTTP header is finished
-              if parts & 2:
-                callback(wtype,bufView[start_2:start_2+L_start],2)
-              state=4
-              # The above is just for sanity, because we do _not_
-              #  continue with the outer loop,
-              #  since we can now block-output the entire rest of the
-              #  input buffer.
-              if bl is not None:
-                if bl!=wl:
-                  print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
-                        (length,offset,filename,wl,bl,tr),file=sys.stderr)
-              # HTTP body
-              balance=start_2+rec_text.tell()
-              #print(balance,bl,wl,ll,ll-balance,file=sys.stderr)
-              # Output whatever is left
-              if parts & 4:
-                callback(wtype,bufView[balance:balance+wl],4)
-              state=1
-              
-          L_start=rec_text.tell()