Mercurial > hg > cc > cirrus_work
changeset 290:52c9d1875608
simple refill working?
| author | Henry S. Thompson <ht@inf.ed.ac.uk> |
|---|---|
| date | Wed, 09 Apr 2025 12:57:50 +0100 |
| parents | f17aef7ba4a7 |
| children | 70da637d1402 |
| files | lib/python/cc/warc.py |
| diffstat | 1 files changed, 20 insertions(+), 24 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/python/cc/warc.py Wed Apr 09 11:15:14 2025 +0100 +++ b/lib/python/cc/warc.py Wed Apr 09 12:57:50 2025 +0100 @@ -19,7 +19,7 @@ # to 5.5MiB for CC-MAIN-2025-13 (Mar) and thereafter HDRMAX: int = 0 # will grow -RECORDMAX int = 0 # will grow +RECORDMAX: int = 0 # will grow def warc(filename: str, callback: typing.Callable[[bytes, typing.ByteString, int], typing.BinaryIO], @@ -47,34 +47,31 @@ stream = igzip.IGzipFile(filename=filename) buf: char[::1] = bytearray(BUFSIZE) bufView: char[::1] = memoryview(buf) - fpos: long = 0 - bp: long = 0 - bl: long = stream.readinto(buf) + fpos: int = 0 + bp: int = 0 + bl: int = stream.readinto(buf) + n: int = 0 done: bool = bl < BUFSIZE + while buf.startswith(b'\r\n',bp): + bp+=2 while not (done and bl == bp): - while buf.startswith(b'\r\n',bp): - bp+=2 - start_1: long = bp - if (bp > bl): - breakpoint() + start_1: int = bp if not buf.startswith(b'WARC/1.0\r\n',bp): - breakpoint() raise ValueError("Not a WARC file? In %s at %s of %s (%s): %s[%s]"%(filename, bp,bl,fpos, (buf[bp:min(bl,bp+20)] if bp<bl else buf[bl-20:bl]).decode('latin-1'), bl-bp)) - bp+=10 - wtype: bytes = b'' + bp += 10 + n += 1 + wtype: int = -1 length: int = 0 tr: bytes = b'' # Was this record truncated? - if (bp > bl): - breakpoint() while not buf.startswith(b'\r\n',bp): # there should always be enough in the buffer to complete this loop, # because of the buffer update logic at the end eol = buf.index(b'\r\n', bp) if buf.startswith(b"Content-Length: ",bp): - length=wl=int(bufView[bp+16:eol-2]) + length=wl=int(bufView[bp+16:eol]) if buf.startswith(b"WARC-Truncated: ",bp): if bp+16==eol-2: tr = b"EMPTY" @@ -93,11 +90,8 @@ raise ValueError("Unknown WARC-Type: %s in %s at %s"%( bytes(bufView[bp+11:eol-2]),filename, fpos-(bl-bp))) - bp=eol + bp=eol+2 # record header done - bp=eol+2 - if (bp > bl): - breakpoint() if (hl:=(bp - start_1)) > HDRMAX: HDRMAX = hl #if done: @@ -112,14 +106,14 @@ continue elif (parts & 1): _out=callback(wtype,bufView[start_1:eol],1) + bp = eol + while buf.startswith(b'\r\n',bp): + bp+=2 if parts!=1: - while buf.startswith(b'\r\n',bp): - bp+=2 start_2=bp eob=bp+length while buf.startswith(b'\r\n',eob-2): eob-=2 - # Only output parts (2 = HTTP header, 4 = body) that are wanted if parts & 2: if wtype == META or wtype == INFO: @@ -144,6 +138,8 @@ bl = keepLen+nb done = bl < BUFSIZE bp = 0 - + while buf.startswith(b'\r\n',bp): + bp+=2 #print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr) - print('Max record: %d, Max header: %d + print('%d records, max record: %d, max header: %d'%(n, RECORDMAX, HDRMAX), + file=sys.stderr)
