changeset 50:55943918794e

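a bit better: track the file offset by hand (fpos) instead of calling stream.tell(), do the first header read before the record loop, and add debug prints (work in progress)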
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 07 Jul 2023 13:39:23 +0100
parents 699ef141af10
children c0b4359dd26a
files bin/warc.py
diffstat 1 files changed, 25 insertions(+), 16 deletions(-) [+]
line wrap: on
line diff
--- a/bin/warc.py	Thu Jul 06 14:53:28 2023 +0100
+++ b/bin/warc.py	Fri Jul 07 13:39:23 2023 +0100
@@ -17,12 +17,10 @@
   hdrMax=16*1024
   buf=bytearray(bufSize)
   hdrBuf=memoryview(buf)[:hdrMax]
-  while not stream.closed:
+  fpos=0
+  bl=stream.readinto(hdrBuf)
+  while True:
     bp=0
-    fpos=stream.tell()
-    bl=stream.readinto(hdrBuf)
-    if bl==0:
-      break
     while buf.startswith(b'\r\n',bp):
       bp+=2
     if not buf.startswith(b'WARC/1.0\r\n',bp):
@@ -33,6 +31,7 @@
     wtype=None
     length=None
     state=1
+    done=False
     tr=None # Was this record truncated?
     while not buf.startswith(b'\r\n',bp):
       eol=buf.index(b'\r\n',bp)+2
@@ -44,22 +43,32 @@
       elif buf.startswith(b'WARC-Type: ',bp):
         wtype = bytes(buf[bp+11:eol-2])
       bp=eol
-    start_2=eol+2
+    start_2=bp=eol+2
     # need to read more if bp+length>hdrMax
     if (wtype in types):
       if whole:
         pass # buf[bp:(bp:=bp+ln)]=l
       elif (parts & 1):
-        callback(wtype,buf[bob:start_2],1)
-      if parts==1:
-        stream.seek(fpos+(bp-bob)+length)
-        continue
-      else:
-        start_2=bp
-    else:
-      print(fpos,bp,bp-bob,length)
-      stream.seek(fpos+(bp-bob)+length)
-      continue
+        print('cb')
+        OUT=callback(wtype,buf[bob:eol],1)
+        sys.stdout.flush()
+      if parts!=1:
+        # everything from bv= goes here
+        pass
+    print(wtype,fpos,bp,bp-bob,length)
+    stream.seek(fpos:=fpos+(bp-bob)+length)
+    print(fpos)
+    if done:
+      return
+    buf[0:hdrMax-fpos]=buf[fpos:hdrMax]
+    n=stream.readinto(memoryview(buf)[fpos:hdrMax])
+    if n<hdrMax-fpos or n==0:
+      done=True
+    #while not buf.startswith(b'\r\n',bp):
+    OUT.write(b"=====\n")
+    OUT.write(buf[0:100])
+    continue
+    return
     bv=memoryview(buf)[start_2:start_2+length]
     ii=0
     while True and not stream.closed: