Mercurial > hg > cc > cirrus_work
changeset 59:5d40d7511374
slice a memoryview of buf rather than buf itself, to avoid copying
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 13 Jul 2023 11:28:24 +0100 |
parents | 299e3d0f2310 |
children | 7b68c3ebc35a |
files | bin/warc.py |
diffstat | 1 files changed, 23 insertions(+), 10 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/warc.py Wed Jul 12 19:07:56 2023 +0100 +++ b/bin/warc.py Thu Jul 13 11:28:24 2023 +0100 @@ -16,7 +16,7 @@ bufSize=2*1024*1024 hdrMax=16*1024 buf=bytearray(bufSize) - #with memoryview(buf)[:hdrMax] as hdrBuf: + bufView=memoryview(buf) fpos=bl=stream.readinto(buf) bp=0 done=False @@ -40,12 +40,25 @@ # because of the buffer update logic below eol=buf.index(b'\r\n',bp)+2 if buf.startswith(b"Content-Length: ",bp): - length=wl=int(buf[bp+16:eol-2]) + length=wl=int(bufView[bp+16:eol-2]) elif buf.startswith(b"WARC-Truncated: ",bp): - tr=bytes(buf[bp+16:eol-2]) - tr=b"EMPTY" if tr==b"" else tr + if bp+16==eol-2: + tr=b"EMPTY" + else: + tr=bytes(bufView[bp+16:eol-2]) elif buf.startswith(b'WARC-Type: ',bp): - wtype = bytes(buf[bp+11:eol-2]) + if buf.startswith(b's',bp+13): + wtype = b'response' + elif buf.startswith(b'q',bp+13): + wtype = b'request' + elif buf.startswith(b'm',bp+11): + wtype = b'metadata' + elif buf.startswith(b'w',bp+11): + wtype = b'warcinfo' + else: + raise ValueError("Unknown WARC-Type: %s at %s"%( + bytes(bufView[bp+11:eol-2]), + fpos-(bl-bp))) bp=eol bp=eol+2 if done: @@ -58,7 +71,7 @@ # we need to keep from start_1 to bl keepFrom=start_1 keepLen=bl-keepFrom - buf[0:keepLen]=buf[keepFrom:bl] + buf[0:keepLen]=bufView[keepFrom:bl] else: # we can skip the rest of this part keepLen=0 @@ -79,7 +92,7 @@ if whole: pass # buf[bp:(bp:=bp+ln)]=l @fixme elif (parts & 1): - OUT=callback(wtype,buf[start_1:eol],1) + OUT=callback(wtype,bufView[start_1:eol],1) if parts!=1: # everything from bv= goes here pass @@ -101,7 +114,7 @@ nb+=length if wtype in types: if whole: - callback(wtype,buf[0:start_2+length],7) + callback(wtype,bufView[0:start_2+length],7) continue # Only output parts (1 = WARC header, 2 = HTTP header, 4 = body) that are wanted bl=None # for HTTP Content-Length for the length of the body? 
@@ -120,7 +133,7 @@ else: # Blank line, HTTP header is finished if parts & 2: - callback(wtype,buf[start_2:start_2+L_start],2) + callback(wtype,bufView[start_2:start_2+L_start],2) state=4 # The above is just for sanity, because we do _not_ # continue with the outer loop, @@ -135,7 +148,7 @@ #print(balance,bl,wl,ll,ll-balance,file=sys.stderr) # Output whatever is left if parts & 4: - callback(wtype,buf[balance:balance+wl],4) + callback(wtype,bufView[balance:balance+wl],4) state=1 L_start=rec_text.tell()