changeset 288:d3fc7b5c73d0

park that, try fixed large buffer and large-enough min to ensure we always have a whole record in view
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 08 Apr 2025 16:06:33 +0100
parents fe78af4ea7c5
children f17aef7ba4a7
files lib/python/cc/warc.py
diffstat 1 files changed, 20 insertions(+), 17 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/warc.py	Mon Apr 07 16:34:31 2025 +0100
+++ b/lib/python/cc/warc.py	Tue Apr 08 16:06:33 2025 +0100
@@ -14,15 +14,18 @@
 META: bytes =  b'metadata'
 INFO: bytes =  b'warcinfo'
 
-BUFSIZE = cython.declare(cython.long, 2*1024*1024)
+BUFSIZE = cython.declare(cython.long, 16 * 1024 * 1024)
+BUFMIN: int = 3 * 512 * 1024 # 1.5MiB, will need to be increased
+                             # to 5.5MiB for CC-MAIN-2025-13 (Mar) and thereafter
 
-HDRMAX: int = 32*1024  # Not really max, there are some enormous ones, see below
+HDRMAX: int = 0  # will grow
+ITEMMAX: int = 0  # will grow
 
 def refill(buf: typing.ByteString, bufView: typing.ByteString, stream: typing.BinaryIO,
            start_1: int, bl: int, bp: int, eol: int,
         length: int, needed: bool) -> tuple[int, int, int, bytes,
                                             int, typing.ByteString, bool]:
-  global BUFSIZE
+  global BUFSIZE, BUFMIN, HDRMAX, ITEMMAX
   whole: int
   xBuf: char[::1]
   #if (stream.tell() > 5766470000): # 82535
@@ -76,7 +79,12 @@
      1 for warc header;
      2 for req/resp HTTP header, warcinfo/metadata features;
      4 for req/resp body'''
-  global BUFSIZE, HDRMAX
+  # Not currently trying to depend on this, but I believe that
+  #   warcinfo: warc-headers+1bl+crawl-headers+2bl
+  #   request: warc-headers+1bl+HTTP-headers+3bl
+  #   response: warc-headers+1bl+HTTP-headers+[1bl or 2bl]+HTTP-body+1bl
+  #   metadata: warc-headers+1bl+metadata-headers+3bl
+  global BUFSIZE, HDRMAX
   # should do some sanity checking wrt parts and types
   if filename.endswith(".gz"):
     stream: typing.BinaryIO = igzip.IGzipFile(filename=filename)
@@ -99,13 +107,13 @@
       if clh_end > bl:
         raise ValueError
       length = wl = int(bufView[clh_begin+18:clh_end])
-      # There are some enormous TargetURIs which overflow HDRMAX
-      # so check whether we can see to the _end_ of the Warc-header
+      # Check whether we can see to the _end_ of the Warc-header
       eowh = buf.index(b'\r\n\r\n', clh_end)
       if eowh > bl:
         raise ValueError
     except ValueError:
-      # No! So we do an emergency buffer shift, forcing the restart
+      # We can't see to the end of this item
+      #  So we do a buffer shift, forcing the restart
       #  because skipping won't work as we're not at the end of the WARC
       #  header yet
       start_1, bp, _, buf, bl, bufView, done = refill(buf, bufView, stream,
@@ -124,10 +132,10 @@
          (buf[bp:min(bl,bp+20)] if bp<bl else buf[bl-20:bl]).decode('latin-1'),
                                                                      bl-bp))
     bp+=10
-    wtype: cython.bytes = b''
+    wtype: bytes = b''
     length: int = 0
     state: int = 1
-    tr: cython.bytes = b'' # Was this record truncated?
+    tr: bytes = b'' # Was this record truncated?
     if (bp > bl):
       breakpoint()
     while not buf.startswith(b'\r\n',bp):
@@ -154,20 +162,15 @@
                              fpos-(bl-bp)))
       bp=eol
     bp=eol+2
-    OUT: typing.BinaryIO
     if (bp > bl):
       breakpoint()
+    if (hl:=(bp - start_1)) > HDRMAX:
+      HDRMAX = hl
+    OUT: typing.BinaryIO
     if done:
       if (bp+length)>bl:
         raise ValueError("Done but need more! %s + %s > %s in %s"%(bp,
                          length,bl,filename))
-    elif (bp+(length+HDRMAX))>bl:
-      # Need more data
-      start_1, bp, eol, buf, bl, bufView, done = refill(buf, bufView, stream,
-                                                        start_1, bl, bp, eol,
-                                                        length, wtype in types)
-      if wtype not in types:
-        continue
     if (wtype in types):
       # Output whole or part 1 as required
       if whole: