diff bin/warc.py @ 51:c0b4359dd26a

working better, but still gets confused by a 3-part response
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 07 Jul 2023 17:03:52 +0100
parents 55943918794e
children 0dc144bd027c
line wrap: on
line diff
--- a/bin/warc.py	Fri Jul 07 13:39:23 2023 +0100
+++ b/bin/warc.py	Fri Jul 07 17:03:52 2023 +0100
@@ -17,16 +17,15 @@
   hdrMax=16*1024
   buf=bytearray(bufSize)
   hdrBuf=memoryview(buf)[:hdrMax]
-  fpos=0
-  bl=stream.readinto(hdrBuf)
+  fpos=bl=stream.readinto(hdrBuf)
+  bob=0
   while True:
     bp=0
     while buf.startswith(b'\r\n',bp):
       bp+=2
     if not buf.startswith(b'WARC/1.0\r\n',bp):
-      raise ValueError("Not a WARC file? At %s: %s[%s]"%(fpos,
+      raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp,
                        buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp))
-    bob=bp # in case 1 or whole
     bp+=10
     wtype=None
     length=None
@@ -43,27 +42,29 @@
       elif buf.startswith(b'WARC-Type: ',bp):
         wtype = bytes(buf[bp+11:eol-2])
       bp=eol
-    start_2=bp=eol+2
-    # need to read more if bp+length>hdrMax
+    bp=eol+2
     if (wtype in types):
       if whole:
         pass # buf[bp:(bp:=bp+ln)]=l
       elif (parts & 1):
         print('cb')
-        OUT=callback(wtype,buf[bob:eol],1)
-        sys.stdout.flush()
+        OUT=callback(wtype,buf[bp:eol+length],1)
       if parts!=1:
         # everything from bv= goes here
+        # need to read more if eol+length>hdrMax
         pass
-    print(wtype,fpos,bp,bp-bob,length)
-    stream.seek(fpos:=fpos+(bp-bob)+length)
-    print(fpos)
+    print(wtype,bob,eol,length)
+    jumpTo=bp+length
+    buf[0:hdrMax-jumpTo]=buf[jumpTo:hdrMax]
+    stream.seek(fpos:=fpos+jumpTo)
     if done:
-      return
-    buf[0:hdrMax-fpos]=buf[fpos:hdrMax]
-    n=stream.readinto(memoryview(buf)[fpos:hdrMax])
-    if n<hdrMax-fpos or n==0:
+      continue
+    n=stream.readinto(memoryview(buf)[hdrMax-jumpTo:hdrMax])
+    print('read',n)
+    if n<jumpTo or n==0:
+      print('done',n,jumpTo)
       done=True
+    bp=0
     #while not buf.startswith(b'\r\n',bp):
     OUT.write(b"=====\n")
     OUT.write(buf[0:100])