Mercurial > hg > cc > cirrus_work
changeset 48:d0d2fd9830d6
starting on conversion to direct-querying of buffer
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 06 Jul 2023 13:27:33 +0100 |
parents | b59f49909bda |
children | 699ef141af10 |
files | bin/warc.py |
diffstat | 1 files changed, 24 insertions(+), 23 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/warc.py Thu Jul 06 10:19:02 2023 +0100
+++ b/bin/warc.py Thu Jul 06 13:27:33 2023 +0100
@@ -13,37 +13,38 @@
  stream=igzip.IGzipFile(filename=filename)
  else:
  stream=open(filename,'rb',0)
- bufsize=2*1024*1024
- buf=bytearray(bufsize)
- l=b'\r\n'
+ bufSize=2*1024*1024
+ hdrMax=16*1024
+ buf=bytearray(bufSize)
+ hdrBuf=memoryview(buf)[:hdrMax]
  while not stream.closed:
  bp=0
- while l==b'\r\n':
- l=stream.readline()
- nb+=(ln:=len(l))
- if ln==0:
+ bl=stream.readinto(hdrBuf)
+ if bl==0:
  break
- if l!=b'WARC/1.0\r\n':
- raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l),
- l.decode('latin-1'),len(l)))
+ while buf.startswith(b'\r\n',bp):
+ bp+=2
+ if not buf.startswith(b'WARC/1.0\r\n',bp):
+ raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp,
+ buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp))
+ bob=bp # in case 1 or whole
+ bp+=10
  wtype=None
  length=None
  state=1
  tr=None # Was this record truncated?
- while l!=b'\r\n':
- # WARC header
- if parts & 1:
- buf[bp:(bp:=bp+ln)]=l
- l=stream.readline()
- nb+=(ln:=len(l))
- if l.startswith(b"Content-Length: "):
- length=wl=int(l[16:].rstrip())
- elif l.startswith(b"WARC-Truncated: "):
- tr=l[16:].rstrip()
+ while not buf.startswith(b'\r\n',bp):
+ print('yes',)
+ eol=buf.index(b'\r\n',bp)+2
+ if buf.startswith(b"Content-Length: ",bp):
+ length=wl=int(buf[bp+16:eol-2])
+ elif buf.startswith(b"WARC-Truncated: ",bp):
+ tr=l[bp+16:eol-2]
  tr="EMPTY" if tr=="" else tr
- elif l.startswith(b'WARC-Type: '):
- wtype = l[11:-2]
- start_2=bp
+ elif buf.startswith(b'WARC-Type: ',bp):
+ wtype = bytes(buf[bp+11:eol-2])
+ bp=eol
+ start_2=eol
  if (wtype in types):
  if whole:
  buf[bp:(bp:=bp+ln)]=l
```