Mercurial > hg > cc > cirrus_work
changeset 56:f8c8f79b2532
Rework completely to refill the buffer as much as possible, but only when necessary.
Basic loop is working again, but refill is not yet working.
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 10 Jul 2023 19:52:18 +0100 |
parents | 11a886a84a49 |
children | 61b0a1582af8 |
files | bin/warc.py |
diffstat | 1 files changed, 41 insertions(+), 24 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/warc.py Mon Jul 10 18:17:35 2023 +0100
+++ b/bin/warc.py Mon Jul 10 19:52:18 2023 +0100
@@ -16,23 +16,28 @@
   bufSize=2*1024*1024
   hdrMax=16*1024
   buf=bytearray(bufSize)
-  with memoryview(buf)[:hdrMax] as hdrBuf:
-    fpos=bl=stream.readinto(hdrBuf)
+  #with memoryview(buf)[:hdrMax] as hdrBuf:
+  fpos=bl=stream.readinto(buf)
+  bp=0
+  done=False
   while True:
-    bp=0
-    while buf.startswith(b'\r\n',bp):
+    while buf.startswith(b'\r\n',bp): # will Fail if buffer (nearly) empty
       bp+=2
-    bob=bp
+    start_1=bp
     if not buf.startswith(b'WARC/1.0\r\n',bp):
+      if done and bl-bp==0:
+        # really done
+        return
       raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp,
                        buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp))
     bp+=10
     wtype=None
     length=None
     state=1
-    done=False
     tr=None # Was this record truncated?
     while not buf.startswith(b'\r\n',bp):
+      # there should always be enough in the buffer to complete this loop,
+      # because of the buffer update logic below
       eol=buf.index(b'\r\n',bp)+2
       if buf.startswith(b"Content-Length: ",bp):
         length=wl=int(buf[bp+16:eol-2])
@@ -43,33 +48,45 @@
         wtype = bytes(buf[bp+11:eol-2])
       bp=eol
     bp=eol+2
+    if (bp+length)>bl:
+      if done:
+        raise ValueError("Done but need more! %s + %s > %s",bp,length,bl)
+      # Need more data
+      if wtype in types:
+        # we need to keep from start_1 to bl
+        keepFrom=start_1
+        keepLen=bl-keepFrom
+        buf[0:keepLen]=buf[keepFrom,bl]
+      else:
+        # we can skip the rest of this part
+        keepLen=0
+        fpos=stream.seek(fpos+(pb+length-bl))
+      spaceToFill=bufMax-keepLen
+      with memoryview(buf)[keepLen:bufMax] as xBuf:
+        nb=stream.readinto(xBuf)
+      fpos+=nb
+      bp=keepLen
+      bl=keepLen+nb
+      if nb<spaceToFill:
+        done=True
+      if wtype not in types:
+        continue
     if (wtype in types):
       if whole:
-        pass # buf[bp:(bp:=bp+ln)]=l
+        pass # buf[bp:(bp:=bp+ln)]=l @fixme
       elif (parts & 1):
-        OUT=callback(wtype,buf[bob:eol],1)
+        OUT=callback(wtype,buf[start_1:eol],1)
       if parts!=1:
         # everything from bv= goes here
-        # need to read more if eol+length>hdrMax
         pass
-    print(wtype,bob,bp,eol,length,file=sys.stderr)
-    jumpTo=bp+length
-    buf[0:jumpTo]=buf[jumpTo:hdrMax]
-    _fpos=stream.seek(fpos:=fpos+jumpTo)
-    print('fp',_fpos,fpos,file=sys.stderr)
-    if done:
-      print('finished',file=sys.stderr)
-      break
-    with memoryview(buf) as mv:
-      n=stream.readinto(mv[hdrMax-jumpTo:hdrMax])
-    print('read',n,file=sys.stderr)
-    if n<jumpTo or n==0:
-      print('done',n,jumpTo,file=sys.stderr)
-      done=True
-    bp=0
+    else:
+      bp+=length
+    print('end of loop',wtype,start_1,bp,eol,length,file=sys.stderr)
     #while not buf.startswith(b'\r\n',bp):
     OUT.write(b"=====\n")
     OUT.write(buf[0:100])
+    if not buf[99]==10:
+      OUT.write(b"\n")
     continue
     return
     bv=memoryview(buf)[start_2:start_2+length]
```