changeset 290:52c9d1875608

simple refill working?
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 09 Apr 2025 12:57:50 +0100
parents f17aef7ba4a7
children 70da637d1402
files lib/python/cc/warc.py
diffstat 1 files changed, 20 insertions(+), 24 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/warc.py	Wed Apr 09 11:15:14 2025 +0100
+++ b/lib/python/cc/warc.py	Wed Apr 09 12:57:50 2025 +0100
@@ -19,7 +19,7 @@
                              # to 5.5MiB for CC-MAIN-2025-13 (Mar) and thereafter
 
 HDRMAX: int = 0  # will grow
-RECORDMAX int = 0  # will grow
+RECORDMAX: int = 0  # will grow
 
 def warc(filename: str,
          callback: typing.Callable[[bytes, typing.ByteString, int], typing.BinaryIO],
@@ -47,34 +47,31 @@
       stream = igzip.IGzipFile(filename=filename)
   buf: char[::1] = bytearray(BUFSIZE)
   bufView: char[::1] = memoryview(buf)
-  fpos: long = 0
-  bp: long = 0
-  bl: long = stream.readinto(buf)
+  fpos: int = 0
+  bp: int = 0
+  bl: int = stream.readinto(buf)
+  n: int = 0
   done: bool = bl < BUFSIZE 
+  while buf.startswith(b'\r\n',bp):
+    bp+=2
   while not (done and bl == bp):
-    while buf.startswith(b'\r\n',bp):
-      bp+=2
-    start_1: long = bp
-    if (bp > bl):
-      breakpoint()
+    start_1: int = bp
     if not buf.startswith(b'WARC/1.0\r\n',bp):
-      breakpoint()
       raise ValueError("Not a WARC file? In %s at %s of %s (%s): %s[%s]"%(filename,
                                                                    bp,bl,fpos,
          (buf[bp:min(bl,bp+20)] if bp<bl else buf[bl-20:bl]).decode('latin-1'),
                                                                      bl-bp))
-    bp+=10
-    wtype: bytes = b''
+    bp += 10
+    n += 1
+    wtype: int = -1
     length: int = 0
     tr: bytes = b'' # Was this record truncated?
-    if (bp > bl):
-      breakpoint()
     while not buf.startswith(b'\r\n',bp):
       # there should always be enough in the buffer to complete this loop,
       #  because of the buffer update logic at the end
       eol = buf.index(b'\r\n', bp)
       if buf.startswith(b"Content-Length: ",bp):
-        length=wl=int(bufView[bp+16:eol-2])
+        length=wl=int(bufView[bp+16:eol])
       if buf.startswith(b"WARC-Truncated: ",bp):
         if bp+16==eol-2:
           tr = b"EMPTY"
@@ -93,11 +90,8 @@
           raise ValueError("Unknown WARC-Type: %s in %s at %s"%(
                              bytes(bufView[bp+11:eol-2]),filename,
                              fpos-(bl-bp)))
-      bp=eol
+      bp=eol+2
     # record header done
-    bp=eol+2
-    if (bp > bl):
-      breakpoint()
     if (hl:=(bp - start_1)) > HDRMAX:
       HDRMAX = hl
     #if done:
@@ -112,14 +106,14 @@
         continue
       elif (parts & 1):
         _out=callback(wtype,bufView[start_1:eol],1)
+        bp = eol
+      while buf.startswith(b'\r\n',bp):
+        bp+=2
       if parts!=1:
-        while buf.startswith(b'\r\n',bp):
-          bp+=2
         start_2=bp
         eob=bp+length
         while buf.startswith(b'\r\n',eob-2):
           eob-=2
-        
         # Only output parts (2 = HTTP header, 4 = body) that are wanted
         if parts & 2:
           if wtype == META or wtype == INFO:
@@ -144,6 +138,8 @@
       bl = keepLen+nb
       done = bl < BUFSIZE 
       bp = 0
-      
+    while buf.startswith(b'\r\n',bp):
+      bp+=2
     #print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr)
-    print('Max record: %d, Max header: %d
+  print('%d records, max record: %d, max header: %d'%(n, RECORDMAX, HDRMAX),
+        file=sys.stderr)