# HG changeset patch # User Henry S. Thompson # Date 1688733563 -3600 # Node ID 55943918794e0b8cd61b8c2dba8af420bfa73261 # Parent 699ef141af10dd25877db14f85451ed56eb39c33 a bit better diff -r 699ef141af10 -r 55943918794e bin/warc.py --- a/bin/warc.py Thu Jul 06 14:53:28 2023 +0100 +++ b/bin/warc.py Fri Jul 07 13:39:23 2023 +0100 @@ -17,12 +17,10 @@ hdrMax=16*1024 buf=bytearray(bufSize) hdrBuf=memoryview(buf)[:hdrMax] - while not stream.closed: + fpos=0 + bl=stream.readinto(hdrBuf) + while True: bp=0 - fpos=stream.tell() - bl=stream.readinto(hdrBuf) - if bl==0: - break while buf.startswith(b'\r\n',bp): bp+=2 if not buf.startswith(b'WARC/1.0\r\n',bp): @@ -33,6 +31,7 @@ wtype=None length=None state=1 + done=False tr=None # Was this record truncated? while not buf.startswith(b'\r\n',bp): eol=buf.index(b'\r\n',bp)+2 @@ -44,22 +43,32 @@ elif buf.startswith(b'WARC-Type: ',bp): wtype = bytes(buf[bp+11:eol-2]) bp=eol - start_2=eol+2 + start_2=bp=eol+2 # need to read more if bp+length>hdrMax if (wtype in types): if whole: pass # buf[bp:(bp:=bp+ln)]=l elif (parts & 1): - callback(wtype,buf[bob:start_2],1) - if parts==1: - stream.seek(fpos+(bp-bob)+length) - continue - else: - start_2=bp - else: - print(fpos,bp,bp-bob,length) - stream.seek(fpos+(bp-bob)+length) - continue + print('cb') + OUT=callback(wtype,buf[bob:eol],1) + sys.stdout.flush() + if parts!=1: + # everything from bv= goes here + pass + print(wtype,fpos,bp,bp-bob,length) + stream.seek(fpos:=fpos+(bp-bob)+length) + print(fpos) + if done: + return + buf[0:hdrMax-fpos]=buf[fpos:hdrMax] + n=stream.readinto(memoryview(buf)[fpos:hdrMax]) + if n