# HG changeset patch # User Henry S. Thompson # Date 1689184107 -3600 # Node ID 61b0a1582af83eecf42295accf48c929e8cec4f5 # Parent f8c8f79b253262991c2a1e5b0cb12181fc383328 works with all types, part=1 diff -r f8c8f79b2532 -r 61b0a1582af8 bin/warc.py --- a/bin/warc.py Mon Jul 10 19:52:18 2023 +0100 +++ b/bin/warc.py Wed Jul 12 18:48:27 2023 +0100 @@ -42,30 +42,34 @@ if buf.startswith(b"Content-Length: ",bp): length=wl=int(buf[bp+16:eol-2]) elif buf.startswith(b"WARC-Truncated: ",bp): - tr=l[bp+16:eol-2] - tr="EMPTY" if tr=="" else tr + tr=bytes(buf[bp+16:eol-2]) + tr=b"EMPTY" if tr==b"" else tr elif buf.startswith(b'WARC-Type: ',bp): wtype = bytes(buf[bp+11:eol-2]) bp=eol bp=eol+2 - if (bp+length)>bl: - if done: - raise ValueError("Done but need more! %s + %s > %s",bp,length,bl) + if done: + if (bp+length)>bl: + raise ValueError("Done but need more! %s + %s > %s"%(bp, + length,bl)) + elif (bp+(length+hdrMax))>bl: # Need more data if wtype in types: # we need to keep from start_1 to bl keepFrom=start_1 keepLen=bl-keepFrom - buf[0:keepLen]=buf[keepFrom,bl] + buf[0:keepLen]=buf[keepFrom:bl] else: # we can skip the rest of this part keepLen=0 - fpos=stream.seek(fpos+(pb+length-bl)) - spaceToFill=bufMax-keepLen - with memoryview(buf)[keepLen:bufMax] as xBuf: + fpos=stream.seek(fpos+(bp+length-bl)) + spaceToFill=bufSize-keepLen + with memoryview(buf)[keepLen:bufSize] as xBuf: nb=stream.readinto(xBuf) fpos+=nb - bp=keepLen + eol=eol-start_1 + start_1=0 + bp=eol+2 bl=keepLen+nb if nb