Mercurial > hg > cc > cirrus_work
diff bin/warc.py @ 51:c0b4359dd26a
working better, gets confused by 3-part response
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 07 Jul 2023 17:03:52 +0100 |
parents | 55943918794e |
children | 0dc144bd027c |
line wrap: on
line diff
```diff
--- a/bin/warc.py Fri Jul 07 13:39:23 2023 +0100
+++ b/bin/warc.py Fri Jul 07 17:03:52 2023 +0100
@@ -17,16 +17,15 @@
 hdrMax=16*1024
 buf=bytearray(bufSize)
 hdrBuf=memoryview(buf)[:hdrMax]
- fpos=0
- bl=stream.readinto(hdrBuf)
+ fpos=bl=stream.readinto(hdrBuf)
+ bob=0
 while True:
 bp=0
 while buf.startswith(b'\r\n',bp):
 bp+=2
 if not buf.startswith(b'WARC/1.0\r\n',bp):
- raise ValueError("Not a WARC file? At %s: %s[%s]"%(fpos,
+ raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp,
 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp))
- bob=bp # in case 1 or whole
 bp+=10
 wtype=None
 length=None
@@ -43,27 +42,29 @@
 elif buf.startswith(b'WARC-Type: ',bp):
 wtype = bytes(buf[bp+11:eol-2])
 bp=eol
- start_2=bp=eol+2
- # need to read more if bp+length>hdrMax
+ bp=eol+2
 if (wtype in types):
 if whole:
 pass # buf[bp:(bp:=bp+ln)]=l
 elif (parts & 1):
 print('cb')
- OUT=callback(wtype,buf[bob:eol],1)
- sys.stdout.flush()
+ OUT=callback(wtype,buf[bp:eol+length],1)
 if parts!=1:
 # everything from bv= goes here
+ # need to read more if eol+length>hdrMax
 pass
- print(wtype,fpos,bp,bp-bob,length)
- stream.seek(fpos:=fpos+(bp-bob)+length)
- print(fpos)
+ print(wtype,bob,eol,length)
+ jumpTo=bp+length
+ buf[0:hdrMax-jumpTo]=buf[jumpTo:hdrMax]
+ stream.seek(fpos:=fpos+jumpTo)
 if done:
- return
- buf[0:hdrMax-fpos]=buf[fpos:hdrMax]
- n=stream.readinto(memoryview(buf)[fpos:hdrMax])
- if n<hdrMax-fpos or n==0:
+ continue
+ n=stream.readinto(memoryview(buf)[hdrMax-jumpTo:hdrMax])
+ print('read',n)
+ if n<jumpTo or n==0:
+ print('done',n,jumpTo)
 done=True
+ bp=0
 #while not buf.startswith(b'\r\n',bp):
 OUT.write(b"=====\n")
 OUT.write(buf[0:100])
```