Mercurial > hg > cc > cirrus_work
changeset 49:699ef141af10
just barely working for 1, need to rethink buffering
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 06 Jul 2023 14:53:28 +0100 |
parents | d0d2fd9830d6 |
children | 55943918794e |
files | bin/warc.py |
diffstat | 1 files changed, 11 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/warc.py	Thu Jul 06 13:27:33 2023 +0100
+++ b/bin/warc.py	Thu Jul 06 14:53:28 2023 +0100
@@ -19,13 +19,14 @@
   hdrBuf=memoryview(buf)[:hdrMax]
   while not stream.closed:
     bp=0
+    fpos=stream.tell()
     bl=stream.readinto(hdrBuf)
     if bl==0:
       break
     while buf.startswith(b'\r\n',bp):
       bp+=2
     if not buf.startswith(b'WARC/1.0\r\n',bp):
-      raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp,
+      raise ValueError("Not a WARC file? At %s: %s[%s]"%(fpos,
                        buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp))
     bob=bp # in case 1 or whole
     bp+=10
@@ -34,7 +35,6 @@
     state=1
     tr=None # Was this record truncated?
     while not buf.startswith(b'\r\n',bp):
-      print('yes',)
       eol=buf.index(b'\r\n',bp)+2
       if buf.startswith(b"Content-Length: ",bp):
         length=wl=int(buf[bp+16:eol-2])
@@ -44,18 +44,22 @@
       elif buf.startswith(b'WARC-Type: ',bp):
         wtype = bytes(buf[bp+11:eol-2])
       bp=eol
-    start_2=eol
+    start_2=eol+2
+    # need to read more if bp+length>hdrMax
     if (wtype in types):
       if whole:
-        buf[bp:(bp:=bp+ln)]=l
+        pass # buf[bp:(bp:=bp+ln)]=l
       elif (parts & 1):
-        callback(wtype,buf[:start_2],1)
+        callback(wtype,buf[bob:start_2],1)
         if parts==1:
-          start_2=0
+          stream.seek(fpos+(bp-bob)+length)
+          continue
         else:
           start_2=bp
     else:
-      start_2=0
+      print(fpos,bp,bp-bob,length)
+      stream.seek(fpos+(bp-bob)+length)
+      continue
     bv=memoryview(buf)[start_2:start_2+length]
     ii=0
     while True and not stream.closed: