# HG changeset patch # User Henry S. Thompson # Date 1698760982 0 # Node ID 4315a36b1672a5444e1253829b8d5820f4b02b71 # Parent 348f4a31228f2e956e3a037ba8fbbf1838d83931 refactor to provide for buffer overflow fix diff -r 348f4a31228f -r 4315a36b1672 lib/python/cc/warc.py --- a/lib/python/cc/warc.py Tue Oct 31 14:01:50 2023 +0000 +++ b/lib/python/cc/warc.py Tue Oct 31 14:03:02 2023 +0000 @@ -3,7 +3,7 @@ callback on each record. Callback can be limited by WARC-Type, record part''' -import sys,io +import sys, io from isal import igzip RESP = b'response' @@ -11,11 +11,57 @@ META = b'metadata' INFO = b'warcinfo' +BUFSIZE=2*1024*1024 +HDRMAX=32*1024 # Not really max, there are some enormous ones, see below + +def refill(buf, bufView, stream, start_1, bl, bp, eol, length, needed): + global BUFSIZE + #if (stream.tell() > 2381000000): + # breakpoint() + if needed: + # we need to keep from start_1 to bl + keepFrom=start_1 + keepLen=bl-keepFrom + if (whole:=((bp-start_1)+length)) > BUFSIZE: + while whole > BUFSIZE: + # Need a bigger buffer + print('Growing buffer %s > %s'%(whole,BUFSIZE),file=sys.stderr) + BUFSIZE=BUFSIZE+(64 * 1024) + newbuf = bytearray(BUFSIZE) + newbuf[0:keepLen]=bufView[keepFrom:bl] + bl = BUFSIZE + buf = newbuf + bufView = memoryview(buf) + else: + buf[0:keepLen]=bufView[keepFrom:bl] + eol=eol-start_1 + start_1=0 + bp=eol+2 + else: + # we can skip the rest of this part + if (bp+length)<=bl: + # we have at least some bytes from the next part + keepLen=bl-(bp+length) + buf[0:keepLen]=bufView[bl-keepLen:bl] + else: + # we don't have all of the bytes from the current part + # so can skip the rest of it + keepLen=0 + stream.seek(stream.tell() + bp + length - bl) + bp=0 + spaceToFill=BUFSIZE-keepLen + with memoryview(buf)[keepLen:BUFSIZE] as xBuf: + nb=stream.readinto(xBuf) + bl=keepLen+nb + return start_1, bp, eol, buf, bl, bufView, nbbl: raise ValueError("Done but need more! %s + %s > %s in %s"%(bp, length,bl,filename)) - elif (bp+(length+hdrMax))>bl: + elif (bp+(length+HDRMAX))>bl: # Need more data - if wtype in types: - # we need to keep from start_1 to bl - keepFrom=start_1 - keepLen=bl-keepFrom - buf[0:keepLen]=bufView[keepFrom:bl] - eol=eol-start_1 - start_1=0 - bp=eol+2 - else: - # we can skip the rest of this part - if (bp+length)<=bl: - # we have at least some bytes from the next part - keepLen=bl-(bp+length) - buf[0:keepLen]=bufView[bl-keepLen:bl] - else: - # we don't have all of the bytes from the current part - # so can skip the rest of it - keepLen=0 - fpos=stream.seek(fpos+(bp+length-bl)) - bp=0 - spaceToFill=bufSize-keepLen - with memoryview(buf)[keepLen:bufSize] as xBuf: - nb=stream.readinto(xBuf) - fpos+=nb - bl=keepLen+nb - if nb