Mercurial > hg > cc > cirrus_work
diff bin/warc.py @ 67:b8d4a5ede7a3
fix eof bug, expand error messages
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 20 Jul 2023 10:32:55 +0100 |
parents | 75f1d3bc60d9 |
children |
line wrap: on
line diff
--- a/bin/warc.py Wed Jul 19 13:20:46 2023 +0100
+++ b/bin/warc.py Thu Jul 20 10:32:55 2023 +0100
@@ -31,15 +31,17 @@
   bp=0
   done=False
   while True:
-    while buf.startswith(b'\r\n',bp): # will Fail if buffer (nearly) empty
+    while buf.startswith(b'\r\n',bp,bl): # will Fail if buffer (nearly) empty
       bp+=2
     start_1=bp
     if not buf.startswith(b'WARC/1.0\r\n',bp):
       if done and bl-bp==0:
         # really done
         return
-      raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp,
-          buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp))
+      raise ValueError("Not a WARC file? In %s at %s of %s (%s): %s[%s]"%(filename,
+          bp,bl,fpos,
+          (buf[bp:min(bl,bp+20)] if bp<bl else buf[bl-20:bl]).decode('latin-1'),
+          bl-bp))
     bp+=10
     wtype=None
     length=None
@@ -66,15 +68,15 @@
         elif buf.startswith(b'w',bp+11):
           wtype = INFO
         else:
-          raise ValueError("Unknown WARC-Type: %s at %s"%(
-              bytes(bufView[bp+11:eol-2]),
+          raise ValueError("Unknown WARC-Type: %s in %s at %s"%(
+              bytes(bufView[bp+11:eol-2]),filename,
               fpos-(bl-bp)))
       bp=eol
     bp=eol+2
     if done:
       if (bp+length)>bl:
-        raise ValueError("Done but need more! %s + %s > %s"%(bp,
-            length,bl))
+        raise ValueError("Done but need more! %s + %s > %s in %s"%(bp,
+            length,bl,filename))
     elif (bp+(length+hdrMax))>bl:
       # Need more data
       if wtype in types:
@@ -163,5 +165,3 @@
       L_start=rec_text.tell()
     bp+=length
     #print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr)
-    #while not buf.startswith(b'\r\n',bp):
-    continue