changeset 164:4315a36b1672

refactor to provide for buffer overflow fix
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 31 Oct 2023 14:03:02 +0000
parents 348f4a31228f
children 26dfef7854f4
files lib/python/cc/warc.py
diffstat 1 files changed, 66 insertions(+), 33 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/warc.py	Tue Oct 31 14:01:50 2023 +0000
+++ b/lib/python/cc/warc.py	Tue Oct 31 14:03:02 2023 +0000
@@ -3,7 +3,7 @@
 callback on each record.  Callback can be limited by WARC-Type, record
 part'''
 
-import sys,io
+import sys, io
 from isal import igzip
 
 RESP = b'response'
@@ -11,11 +11,57 @@
 META = b'metadata'
 INFO = b'warcinfo'
 
+BUFSIZE=2*1024*1024
+HDRMAX=32*1024  # Not really max, there are some enormous ones, see below
+
+def refill(buf, bufView, stream, start_1, bl, bp, eol, length, needed):
+  global BUFSIZE
+  #if (stream.tell() > 2381000000):
+  #  breakpoint()
+  if needed:
+    # we need to keep from start_1 to bl
+    keepFrom=start_1
+    keepLen=bl-keepFrom
+    if (whole:=((bp-start_1)+length)) > BUFSIZE:
+      while whole > BUFSIZE:
+        # Need a bigger buffer
+        print('Growing buffer %s > %s'%(whole,BUFSIZE),file=sys.stderr)
+        BUFSIZE=BUFSIZE+(64 * 1024)
+      newbuf = bytearray(BUFSIZE)
+      newbuf[0:keepLen]=bufView[keepFrom:bl]
+      bl = BUFSIZE
+      buf = newbuf
+      bufView = memoryview(buf)
+    else:
+      buf[0:keepLen]=bufView[keepFrom:bl]
+    eol=eol-start_1
+    start_1=0
+    bp=eol+2
+  else:
+    # we can skip the rest of this part
+    if (bp+length)<=bl:
+      # we have at least some bytes from the next part
+      keepLen=bl-(bp+length)
+      buf[0:keepLen]=bufView[bl-keepLen:bl]
+    else:
+      # we don't have all of the bytes from the current part
+      #  so can skip the rest of it
+      keepLen=0
+      stream.seek(stream.tell() + bp + length - bl)
+    bp=0
+  spaceToFill=BUFSIZE-keepLen
+  with memoryview(buf)[keepLen:BUFSIZE] as xBuf:
+    nb=stream.readinto(xBuf)
+  bl=keepLen+nb
+  return start_1, bp, eol, buf, bl, bufView, nb<spaceToFill
+  
+
 def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False):
   '''parts is a bit-mask:
      1 for warc header;
      2 for req/resp HTTP header, warcinfo/metadata features;
      4 for req/resp body'''
+  global BUFSIZE, HDRMAX
   # should do some sanity checking wrt parts and types
   types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types]
   nb=0
@@ -23,9 +69,7 @@
     stream=igzip.IGzipFile(filename=filename)
   else:
     stream=open(filename,'rb',0)
-  bufSize=2*1024*1024
-  hdrMax=16*1024
-  buf=bytearray(bufSize)
+  buf=bytearray(BUFSIZE)
   bufView=memoryview(buf)
   fpos=bl=stream.readinto(buf)
   bp=0
@@ -50,7 +94,20 @@
     while not buf.startswith(b'\r\n',bp):
       # there should always be enough in the buffer to complete this loop,
       #  because of the buffer update logic below
-      eol=buf.index(b'\r\n',bp)+2
+      try:
+        eol = buf.index(b'\r\n',bp)+2
+      except ValueError:
+        # Some WARC-Target-URI lines are so long that their CRLF lies beyond
+        #  the HDRMAX slack kept in the buffer.  Skipping ahead is not an
+        #  option because we are still inside the WARC header, so do an
+        #  emergency shift-and-refill that keeps everything from start_1 on
+        if not buf.startswith(b'WARC-Target-URI: ',bp):
+          raise
+        start_1, bp, _, buf, bl, bufView, done = refill(buf, bufView, stream,
+                                                        start_1, bl, bp, eol,
+                                                        length, True)
+        bp -= 2 # refill's needed branch returns bp=eol+2 (eol rebased); undo the +2 to resume at the rebased eol
+        eol = buf.index(b'\r\n',bp)+2
       if buf.startswith(b"Content-Length: ",bp):
         length=wl=int(bufView[bp+16:eol-2])
       elif buf.startswith(b"WARC-Truncated: ",bp):
@@ -77,35 +134,11 @@
       if (bp+length)>bl:
         raise ValueError("Done but need more! %s + %s > %s in %s"%(bp,
                          length,bl,filename))
-    elif (bp+(length+hdrMax))>bl:
+    elif (bp+(length+HDRMAX))>bl:
       # Need more data
-      if wtype in types:
-        # we need to keep from start_1 to bl
-        keepFrom=start_1
-        keepLen=bl-keepFrom
-        buf[0:keepLen]=bufView[keepFrom:bl]
-        eol=eol-start_1
-        start_1=0
-        bp=eol+2
-      else:
-        # we can skip the rest of this part
-        if (bp+length)<=bl:
-          # we have at least some bytes from the next part
-          keepLen=bl-(bp+length)
-          buf[0:keepLen]=bufView[bl-keepLen:bl]
-        else:
-          # we don't have all of the bytes from the current part
-          #  so can skip the rest of it
-          keepLen=0
-          fpos=stream.seek(fpos+(bp+length-bl))
-        bp=0
-      spaceToFill=bufSize-keepLen
-      with memoryview(buf)[keepLen:bufSize] as xBuf:
-        nb=stream.readinto(xBuf)
-      fpos+=nb
-      bl=keepLen+nb
-      if nb<spaceToFill:
-        done=True
+      start_1, bp, eol, buf, bl, bufView, done = refill(buf, bufView, stream,
+                                                        start_1, bl, bp, eol,
+                                                        length, wtype in types)
       if wtype not in types:
         continue
     if (wtype in types):