Mercurial > hg > cc > cirrus_work
changeset 57:61b0a1582af8
works with all types, part=1
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 12 Jul 2023 18:48:27 +0100 |
parents | f8c8f79b2532 |
children | 299e3d0f2310 |
files | bin/warc.py |
diffstat | 1 files changed, 15 insertions(+), 16 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/warc.py Mon Jul 10 19:52:18 2023 +0100
+++ b/bin/warc.py Wed Jul 12 18:48:27 2023 +0100
@@ -42,30 +42,34 @@
       if buf.startswith(b"Content-Length: ",bp):
         length=wl=int(buf[bp+16:eol-2])
       elif buf.startswith(b"WARC-Truncated: ",bp):
-        tr=l[bp+16:eol-2]
-        tr="EMPTY" if tr=="" else tr
+        tr=bytes(buf[bp+16:eol-2])
+        tr=b"EMPTY" if tr==b"" else tr
       elif buf.startswith(b'WARC-Type: ',bp):
         wtype = bytes(buf[bp+11:eol-2])
       bp=eol
     bp=eol+2
-    if (bp+length)>bl:
-      if done:
-        raise ValueError("Done but need more! %s + %s > %s",bp,length,bl)
+    if done:
+      if (bp+length)>bl:
+        raise ValueError("Done but need more! %s + %s > %s"%(bp,
+                                                    length,bl))
+    elif (bp+(length+hdrMax))>bl:
       # Need more data
       if wtype in types:
         # we need to keep from start_1 to bl
         keepFrom=start_1
         keepLen=bl-keepFrom
-        buf[0:keepLen]=buf[keepFrom,bl]
+        buf[0:keepLen]=buf[keepFrom:bl]
       else:
         # we can skip the rest of this part
         keepLen=0
-        fpos=stream.seek(fpos+(pb+length-bl))
-      spaceToFill=bufMax-keepLen
-      with memoryview(buf)[keepLen:bufMax] as xBuf:
+        fpos=stream.seek(fpos+(bp+length-bl))
+      spaceToFill=bufSize-keepLen
+      with memoryview(buf)[keepLen:bufSize] as xBuf:
         nb=stream.readinto(xBuf)
       fpos+=nb
-      bp=keepLen
+      eol=eol-start_1
+      start_1=0
+      bp=eol+2
       bl=keepLen+nb
       if nb<spaceToFill:
         done=True
@@ -81,14 +85,9 @@
       pass
     else:
       bp+=length
-      print('end of loop',wtype,start_1,bp,eol,length,file=sys.stderr)
+      print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr)
       #while not buf.startswith(b'\r\n',bp):
-      OUT.write(b"=====\n")
-      OUT.write(buf[0:100])
-      if not buf[99]==10:
-        OUT.write(b"\n")
       continue
-    return
     bv=memoryview(buf)[start_2:start_2+length]
     ii=0
     while True and not stream.closed:
```