# HG changeset patch # User Henry S. Thompson # Date 1688745832 -3600 # Node ID c0b4359dd26a80ecc3c8ee22103c963371154ad9 # Parent 55943918794e0b8cd61b8c2dba8af420bfa73261 working better, gets confused by 3-part response diff -r 55943918794e -r c0b4359dd26a bin/warc.py --- a/bin/warc.py Fri Jul 07 13:39:23 2023 +0100 +++ b/bin/warc.py Fri Jul 07 17:03:52 2023 +0100 @@ -17,16 +17,15 @@ hdrMax=16*1024 buf=bytearray(bufSize) hdrBuf=memoryview(buf)[:hdrMax] - fpos=0 - bl=stream.readinto(hdrBuf) + fpos=bl=stream.readinto(hdrBuf) + bob=0 while True: bp=0 while buf.startswith(b'\r\n',bp): bp+=2 if not buf.startswith(b'WARC/1.0\r\n',bp): - raise ValueError("Not a WARC file? At %s: %s[%s]"%(fpos, + raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp, buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp)) - bob=bp # in case 1 or whole bp+=10 wtype=None length=None @@ -43,27 +42,29 @@ elif buf.startswith(b'WARC-Type: ',bp): wtype = bytes(buf[bp+11:eol-2]) bp=eol - start_2=bp=eol+2 - # need to read more if bp+length>hdrMax + bp=eol+2 if (wtype in types): if whole: pass # buf[bp:(bp:=bp+ln)]=l elif (parts & 1): print('cb') - OUT=callback(wtype,buf[bob:eol],1) - sys.stdout.flush() + OUT=callback(wtype,buf[bp:eol+length],1) if parts!=1: # everything from bv= goes here + # need to read more if eol+length>hdrMax pass - print(wtype,fpos,bp,bp-bob,length) - stream.seek(fpos:=fpos+(bp-bob)+length) - print(fpos) + print(wtype,bob,eol,length) + jumpTo=bp+length + buf[0:hdrMax-jumpTo]=buf[jumpTo:hdrMax] + stream.seek(fpos:=fpos+jumpTo) if done: - return - buf[0:hdrMax-fpos]=buf[fpos:hdrMax] - n=stream.readinto(memoryview(buf)[fpos:hdrMax]) - if n