changeset 59:5d40d7511374

avoid slicing buf by using memoryview to save copying
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 13 Jul 2023 11:28:24 +0100
parents 299e3d0f2310
children 7b68c3ebc35a
files bin/warc.py
diffstat 1 files changed, 23 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/bin/warc.py	Wed Jul 12 19:07:56 2023 +0100
+++ b/bin/warc.py	Thu Jul 13 11:28:24 2023 +0100
@@ -16,7 +16,7 @@
   bufSize=2*1024*1024
   hdrMax=16*1024
   buf=bytearray(bufSize)
-  #with memoryview(buf)[:hdrMax] as hdrBuf:
+  bufView=memoryview(buf)
   fpos=bl=stream.readinto(buf)
   bp=0
   done=False
@@ -40,12 +40,25 @@
       #  because of the buffer update logic below
       eol=buf.index(b'\r\n',bp)+2
       if buf.startswith(b"Content-Length: ",bp):
-        length=wl=int(buf[bp+16:eol-2])
+        length=wl=int(bufView[bp+16:eol-2])
       elif buf.startswith(b"WARC-Truncated: ",bp):
-        tr=bytes(buf[bp+16:eol-2])
-        tr=b"EMPTY" if tr==b"" else tr
+        if bp+16==eol-2:
+          tr=b"EMPTY"
+        else:
+          tr=bytes(bufView[bp+16:eol-2])
       elif buf.startswith(b'WARC-Type: ',bp):
-        wtype = bytes(buf[bp+11:eol-2])
+        if buf.startswith(b's',bp+13):
+          wtype = b'response'
+        elif buf.startswith(b'q',bp+13):
+          wtype = b'request'
+        elif buf.startswith(b'm',bp+11):
+          wtype = b'metadata'
+        elif buf.startswith(b'w',bp+11):
+          wtype = b'warcinfo'
+        else:
+          raise ValueError("Unknown WARC-Type: %s at %s"%(
+                             bytes(bufView[bp+11:eol-2]),
+                             fpos-(bl-bp)))
       bp=eol
     bp=eol+2
     if done:
@@ -58,7 +71,7 @@
         # we need to keep from start_1 to bl
         keepFrom=start_1
         keepLen=bl-keepFrom
-        buf[0:keepLen]=buf[keepFrom:bl]
+        buf[0:keepLen]=bufView[keepFrom:bl]
       else:
         # we can skip the rest of this part
         keepLen=0
@@ -79,7 +92,7 @@
       if whole:
         pass # buf[bp:(bp:=bp+ln)]=l @fixme
       elif (parts & 1):
-        OUT=callback(wtype,buf[start_1:eol],1)
+        OUT=callback(wtype,bufView[start_1:eol],1)
       if parts!=1:
         # everything from bv= goes here
         pass
@@ -101,7 +114,7 @@
     nb+=length
     if wtype in types:
       if whole:
-        callback(wtype,buf[0:start_2+length],7)
+        callback(wtype,bufView[0:start_2+length],7)
         continue
       # Only output parts (1 = WARC header, 2 = HTTP header, 4 = body) that are wanted
       bl=None # for HTTP Content-Length for the length of the body?
@@ -120,7 +133,7 @@
             else:
               # Blank line, HTTP header is finished
               if parts & 2:
-                callback(wtype,buf[start_2:start_2+L_start],2)
+                callback(wtype,bufView[start_2:start_2+L_start],2)
               state=4
               # The above is just for sanity, because we do _not_
               #  continue with the outer loop,
@@ -135,7 +148,7 @@
               #print(balance,bl,wl,ll,ll-balance,file=sys.stderr)
               # Output whatever is left
               if parts & 4:
-                callback(wtype,buf[balance:balance+wl],4)
+                callback(wtype,bufView[balance:balance+wl],4)
               state=1
               
           L_start=rec_text.tell()