changeset 56:f8c8f79b2532

Rework completely to refill as much as possible only when necessary; the basic loop is working again, but the refill is not.
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 10 Jul 2023 19:52:18 +0100
parents 11a886a84a49
children 61b0a1582af8
files bin/warc.py
diffstat 1 files changed, 41 insertions(+), 24 deletions(-) [+]
line wrap: on
line diff
--- a/bin/warc.py	Mon Jul 10 18:17:35 2023 +0100
+++ b/bin/warc.py	Mon Jul 10 19:52:18 2023 +0100
@@ -16,23 +16,28 @@
   bufSize=2*1024*1024
   hdrMax=16*1024
   buf=bytearray(bufSize)
-  with memoryview(buf)[:hdrMax] as hdrBuf:
-    fpos=bl=stream.readinto(hdrBuf)
+  #with memoryview(buf)[:hdrMax] as hdrBuf:
+  fpos=bl=stream.readinto(buf)
+  bp=0
+  done=False
   while True:
-    bp=0
-    while buf.startswith(b'\r\n',bp):
+    while buf.startswith(b'\r\n',bp): # will Fail if buffer (nearly) empty 
       bp+=2
-    bob=bp
+    start_1=bp
     if not buf.startswith(b'WARC/1.0\r\n',bp):
+      if done and bl-bp==0:
+        # really done
+        return
       raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp,
                        buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp))
     bp+=10
     wtype=None
     length=None
     state=1
-    done=False
     tr=None # Was this record truncated?
     while not buf.startswith(b'\r\n',bp):
+      # there should always be enough in the buffer to complete this loop,
+      #  because of the buffer update logic below
       eol=buf.index(b'\r\n',bp)+2
       if buf.startswith(b"Content-Length: ",bp):
         length=wl=int(buf[bp+16:eol-2])
@@ -43,33 +48,45 @@
         wtype = bytes(buf[bp+11:eol-2])
       bp=eol
     bp=eol+2
+    if (bp+length)>bl:
+      if done:
+        raise ValueError("Done but need more! %s + %s > %s",bp,length,bl)
+      # Need more data
+      if wtype in types:
+        # we need to keep from start_1 to bl
+        keepFrom=start_1
+        keepLen=bl-keepFrom
+        buf[0:keepLen]=buf[keepFrom,bl]
+      else:
+        # we can skip the rest of this part
+        keepLen=0
+        fpos=stream.seek(fpos+(pb+length-bl))
+      spaceToFill=bufMax-keepLen
+      with memoryview(buf)[keepLen:bufMax] as xBuf:
+        nb=stream.readinto(xBuf)
+      fpos+=nb
+      bp=keepLen
+      bl=keepLen+nb
+      if nb<spaceToFill:
+        done=True
+      if wtype not in types:
+        continue
     if (wtype in types):
       if whole:
-        pass # buf[bp:(bp:=bp+ln)]=l
+        pass # buf[bp:(bp:=bp+ln)]=l @fixme
       elif (parts & 1):
-        OUT=callback(wtype,buf[bob:eol],1)
+        OUT=callback(wtype,buf[start_1:eol],1)
       if parts!=1:
         # everything from bv= goes here
-        # need to read more if eol+length>hdrMax
         pass
-    print(wtype,bob,bp,eol,length,file=sys.stderr)
-    jumpTo=bp+length
-    buf[0:jumpTo]=buf[jumpTo:hdrMax]
-    _fpos=stream.seek(fpos:=fpos+jumpTo)
-    print('fp',_fpos,fpos,file=sys.stderr)
-    if done:
-      print('finished',file=sys.stderr)
-      break
-    with memoryview(buf) as mv:
-      n=stream.readinto(mv[hdrMax-jumpTo:hdrMax])
-    print('read',n,file=sys.stderr)
-    if n<jumpTo or n==0:
-      print('done',n,jumpTo,file=sys.stderr)
-      done=True
-    bp=0
+      else:
+        bp+=length
+    print('end of loop',wtype,start_1,bp,eol,length,file=sys.stderr)
     #while not buf.startswith(b'\r\n',bp):
     OUT.write(b"=====\n")
     OUT.write(buf[0:100])
+    if not buf[99]==10:
+      OUT.write(b"\n")
     continue
     return
     bv=memoryview(buf)[start_2:start_2+length]