changeset 288:d3fc7b5c73d0
park that, try fixed large buffer and large-enough min to ensure we always have a whole record in view
| author | Henry S. Thompson <ht@inf.ed.ac.uk> |
|---|---|
| date | Tue, 08 Apr 2025 16:06:33 +0100 |
| parents | fe78af4ea7c5 |
| children | f17aef7ba4a7 |
| files | lib/python/cc/warc.py |
| diffstat | 1 files changed, 20 insertions(+), 17 deletions(-) |
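
The substance of the change: BUFSIZE goes from 2 MiB to a fixed 16 MiB, a BUFMIN low-water mark is introduced, and the per-record `(length + HDRMAX)` headroom test is dropped, so the parse loop can assume a whole record is already in view whenever it starts on one. Below is a minimal sketch of that low-water-mark refill pattern; the helper name `ensure_in_view`, its signature, and the toy line-oriented records are illustrative assumptions, not code from warc.py.

```python
import io
import typing

# Values from the patch: a fixed large buffer and a minimum amount of
# unread data to keep in view (assumed to exceed the largest record).
BUFSIZE = 16 * 1024 * 1024
BUFMIN = 3 * 512 * 1024

def ensure_in_view(buf: bytearray, bp: int, stream: typing.BinaryIO,
                   bufsize: int = BUFSIZE,
                   bufmin: int = BUFMIN) -> tuple[bytearray, int, bool]:
    """Hypothetical helper, not the patch's refill(): if fewer than bufmin
    unread bytes remain past bp, shift the unread tail to the front of the
    buffer and top it back up, so the caller can assume the next record is
    wholly in view.  Returns (buffer, new parse position, stream exhausted)."""
    if len(buf) - bp >= bufmin:
        return buf, bp, False
    tail = bytearray(buf[bp:])
    more = stream.read(bufsize - len(tail))
    tail += more
    return tail, 0, len(more) == 0

if __name__ == '__main__':
    # Toy usage with small sizes so several refills actually happen;
    # "records" here are just CRLF-terminated lines, not WARC records.
    data = b''.join(b'record %d\r\n' % i for i in range(1000))
    stream = io.BytesIO(data)
    buf, bp, done = bytearray(), 0, False
    count = 0
    while True:
        if not done:
            buf, bp, done = ensure_in_view(buf, bp, stream, bufsize=4096, bufmin=256)
        eol = buf.find(b'\r\n', bp)
        if eol < 0:
            break
        count += 1
        bp = eol + 2
    print(count)  # 1000
```

The trade-off is a larger resident buffer in exchange for (almost) never needing the mid-record refill that the removed `elif (bp+(length+HDRMAX))>bl:` branch used to perform.
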
--- a/lib/python/cc/warc.py	Mon Apr 07 16:34:31 2025 +0100
+++ b/lib/python/cc/warc.py	Tue Apr 08 16:06:33 2025 +0100
@@ -14,15 +14,18 @@
 META: bytes = b'metadata'
 INFO: bytes = b'warcinfo'
 
-BUFSIZE = cython.declare(cython.long, 2*1024*1024)
+BUFSIZE = cython.declare(cython.long, 16 * 1024 * 1024)
+BUFMIN: int = 3 * 512 * 1024  # 1.5MiB, will need to be increased
+                              #  to 5.5MiB for CC-MAIN-2025-13 (Mar) and thereafter
 
-HDRMAX: int = 32*1024 # Not really max, there are some enormous ones, see below
+HDRMAX: int = 0   # will grow
+ITEMMAX: int = 0  # will grow
 
 def refill(buf: typing.ByteString, bufView: typing.ByteString, stream: typing.BinaryIO,
            start_1: int, bl: int, bp: int, eol: int,
            length: int, needed: bool) -> tuple[int, int, int, bytes, int, typing.ByteString, bool]:
-  global BUFSIZE
+  global BUFSIZE, BUFMIN, HDRMAX, ITEMMAX
   whole: int
   xBuf: char[::1]
   #if (stream.tell() > 5766470000): # 82535
@@ -76,7 +79,12 @@
      1 for warc header;
      2 for req/resp HTTP header, warcinfo/metadata features;
      4 for req/resp body'''
-  global BUFSIZE, HDRMAX
+  # Not currently trying to depend on this, but I believe that
+  #  warcinfo: warc-headers+1bl+crawl-headers+2bl
+  #  request: warc-headers+1bl+HTTP-headers+3bl
+  #  response: warc-headers+1bl+HTTP-headers+[1bl or 2bl]+HTTP-body+1bl
+  #  metadata: warc-headers+1bl+metadata-headers+3bl
+  global BUFSIZE, HDRMAX
   # should do some sanity checking wrt parts and types
   if filename.endswith(".gz"):
     stream: typing.BinaryIO = igzip.IGzipFile(filename=filename)
@@ -99,13 +107,13 @@
         if clh_end > bl:
           raise ValueError
         length = wl = int(bufView[clh_begin+18:clh_end])
-        # There are some enormous TargetURIs which overflow HDRMAX
-        #  so check whether we can see to the _end_ of the Warc-header
+        # Check whether we can see to the _end_ of the Warc-header
        eowh = buf.index(b'\r\n\r\n', clh_end)
         if eowh > bl:
           raise ValueError
       except ValueError:
-        # No!  So we do an emergency buffer shift, forcing the restart
+        # We can't see to the end of this item
+        #  So we do a buffer shift, forcing the restart
         #  because skipping won't work as we're not at the end of the WARC
         #  header yet
         start_1, bp, _, buf, bl, bufView, done = refill(buf, bufView, stream,
@@ -124,10 +132,10 @@
                                (buf[bp:min(bl,bp+20)] if bp<bl else buf[bl-20:bl]).decode('latin-1'),
                                bl-bp))
     bp+=10
-    wtype: cython.bytes = b''
+    wtype: bytes = b''
     length: int = 0
     state: int = 1
-    tr: cython.bytes = b''  # Was this record truncated?
+    tr: bytes = b''  # Was this record truncated?
     if (bp > bl):
       breakpoint()
     while not buf.startswith(b'\r\n',bp):
@@ -154,20 +162,15 @@
                                             fpos-(bl-bp)))
         bp=eol
       bp=eol+2
-    OUT: typing.BinaryIO
     if (bp > bl):
       breakpoint()
+    if (hl:=(bp - start_1)) > HDRMAX:
+      HDRMAX = hl
+    OUT: typing.BinaryIO
     if done:
       if (bp+length)>bl:
         raise ValueError("Done but need more! %s + %s > %s in %s"%(bp, length,bl,filename))
-    elif (bp+(length+HDRMAX))>bl:
-      # Need more data
-      start_1, bp, eol, buf, bl, bufView, done = refill(buf, bufView, stream,
-                                                        start_1, bl, bp, eol,
-                                                        length, wtype in types)
-      if wtype not in types:
-        continue
     if (wtype in types):
       # Output whole or part 1 as required
       if whole:
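
The try/except that the third hunk touches guards the same invariant from the other side: before trusting a record, it checks that both the Content-Length value and the blank line terminating the WARC header lie within the `bl` bytes currently valid in the buffer, and any `index` miss or overrun raises ValueError and forces the buffer shift. A stripped-down illustration of that visibility check follows; `warc_header_complete` and its offsets are hypothetical simplifications (warc.py's `clh_begin+18` differs, presumably because its match includes the preceding CRLF).

```python
# Sketch of the "can we see the whole WARC header?" check; the function
# name and offsets are illustrative, not taken from warc.py.
def warc_header_complete(buf: bytes, bl: int, clh_begin: int) -> tuple[int, int]:
    """Return (Content-Length value, offset of the blank line ending the
    WARC header), raising ValueError if either lies beyond the bl bytes
    currently valid in buf."""
    clh_end = buf.index(b'\r\n', clh_begin)        # end of Content-Length line
    if clh_end > bl:
        raise ValueError
    length = int(buf[clh_begin + len(b'Content-Length: '):clh_end])
    eowh = buf.index(b'\r\n\r\n', clh_end)         # blank line ends the header
    if eowh > bl:
        raise ValueError
    return length, eowh

hdr = (b'WARC/1.0\r\n'
       b'WARC-Type: response\r\n'
       b'Content-Length: 1234\r\n'
       b'\r\n')
print(warc_header_complete(hdr, len(hdr), hdr.index(b'Content-Length: ')))  # (1234, 51)
```
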
