Mercurial > hg > cc > cirrus_work
changeset 164:4315a36b1672
refactor to provide for buffer overflow fix
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 31 Oct 2023 14:03:02 +0000 |
parents | 348f4a31228f |
children | 26dfef7854f4 |
files | lib/python/cc/warc.py |
diffstat | 1 files changed, 66 insertions(+), 33 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/python/cc/warc.py Tue Oct 31 14:01:50 2023 +0000 +++ b/lib/python/cc/warc.py Tue Oct 31 14:03:02 2023 +0000 @@ -3,7 +3,7 @@ callback on each record. Callback can be limited by WARC-Type, record part''' -import sys,io +import sys, io from isal import igzip RESP = b'response' @@ -11,11 +11,57 @@ META = b'metadata' INFO = b'warcinfo' +BUFSIZE=2*1024*1024 +HDRMAX=32*1024 # Not really max, there are some enormous ones, see below + +def refill(buf, bufView, stream, start_1, bl, bp, eol, length, needed): + global BUFSIZE + #if (stream.tell() > 2381000000): + # breakpoint() + if needed: + # we need to keep from start_1 to bl + keepFrom=start_1 + keepLen=bl-keepFrom + if (whole:=((bp-start_1)+length)) > BUFSIZE: + while whole > BUFSIZE: + # Need a bigger buffer + print('Growing buffer %s > %s'%(whole,BUFSIZE),file=sys.stderr) + BUFSIZE=BUFSIZE+(64 * 1024) + newbuf = bytearray(BUFSIZE) + newbuf[0:keepLen]=bufView[keepFrom:bl] + bl = BUFSIZE + buf = newbuf + bufView = memoryview(buf) + else: + buf[0:keepLen]=bufView[keepFrom:bl] + eol=eol-start_1 + start_1=0 + bp=eol+2 + else: + # we can skip the rest of this part + if (bp+length)<=bl: + # we have at least some bytes from the next part + keepLen=bl-(bp+length) + buf[0:keepLen]=bufView[bl-keepLen:bl] + else: + # we don't have all of the bytes from the current part + # so can skip the rest of it + keepLen=0 + stream.seek(stream.tell() + bp + length - bl) + bp=0 + spaceToFill=BUFSIZE-keepLen + with memoryview(buf)[keepLen:BUFSIZE] as xBuf: + nb=stream.readinto(xBuf) + bl=keepLen+nb + return start_1, bp, eol, buf, bl, bufView, nb<spaceToFill + + def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False): '''parts is a bit-mask: 1 for warc header; 2 for req/resp HTTP header, warcinfo/metadata features; 4 for req/resp body''' + global BUFSIZE, HDRMAX # should do some sanity checking wrt parts and types types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types] nb=0 @@ -23,9 +69,7 @@ stream=igzip.IGzipFile(filename=filename) else: stream=open(filename,'rb',0) - bufSize=2*1024*1024 - hdrMax=16*1024 - buf=bytearray(bufSize) + buf=bytearray(BUFSIZE) bufView=memoryview(buf) fpos=bl=stream.readinto(buf) bp=0 @@ -50,7 +94,20 @@ while not buf.startswith(b'\r\n',bp): # there should always be enough in the buffer to complete this loop, # because of the buffer update logic below - eol=buf.index(b'\r\n',bp)+2 + try: + eol = buf.index(b'\r\n',bp)+2 + except ValueError: + # there are some enormous TargetURIs which overflow HDRMAX + # so we do an emergency buffer shift, forcing the restart + # because skipping won't work as we're not at the end of the WARC + # header yet + if not buf.startswith(b'WARC-Target-URI: ',bp): + raise + start_1, bp, _, buf, bl, bufView, done = refill(buf, bufView, stream, + start_1, bl, bp, eol, + length, True) + bp -= 2 # situation is slightly different from the other call to refill + eol = buf.index(b'\r\n',bp)+2 if buf.startswith(b"Content-Length: ",bp): length=wl=int(bufView[bp+16:eol-2]) elif buf.startswith(b"WARC-Truncated: ",bp): @@ -77,35 +134,11 @@ if (bp+length)>bl: raise ValueError("Done but need more! %s + %s > %s in %s"%(bp, length,bl,filename)) - elif (bp+(length+hdrMax))>bl: + elif (bp+(length+HDRMAX))>bl: # Need more data - if wtype in types: - # we need to keep from start_1 to bl - keepFrom=start_1 - keepLen=bl-keepFrom - buf[0:keepLen]=bufView[keepFrom:bl] - eol=eol-start_1 - start_1=0 - bp=eol+2 - else: - # we can skip the rest of this part - if (bp+length)<=bl: - # we have at least some bytes from the next part - keepLen=bl-(bp+length) - buf[0:keepLen]=bufView[bl-keepLen:bl] - else: - # we don't have all of the bytes from the current part - # so can skip the rest of it - keepLen=0 - fpos=stream.seek(fpos+(bp+length-bl)) - bp=0 - spaceToFill=bufSize-keepLen - with memoryview(buf)[keepLen:bufSize] as xBuf: - nb=stream.readinto(xBuf) - fpos+=nb - bl=keepLen+nb - if nb<spaceToFill: - done=True + start_1, bp, eol, buf, bl, bufView, done = refill(buf, bufView, stream, + start_1, bl, bp, eol, + length, wtype in types) if wtype not in types: continue if (wtype in types):