changeset 49:699ef141af10

Just barely working for parts==1; need to rethink buffering
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 06 Jul 2023 14:53:28 +0100
parents d0d2fd9830d6
children 55943918794e
files bin/warc.py
diffstat 1 files changed, 11 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/bin/warc.py	Thu Jul 06 13:27:33 2023 +0100
+++ b/bin/warc.py	Thu Jul 06 14:53:28 2023 +0100
@@ -19,13 +19,14 @@
   hdrBuf=memoryview(buf)[:hdrMax]
   while not stream.closed:
     bp=0
+    fpos=stream.tell()
     bl=stream.readinto(hdrBuf)
     if bl==0:
       break
     while buf.startswith(b'\r\n',bp):
       bp+=2
     if not buf.startswith(b'WARC/1.0\r\n',bp):
-      raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp,
+      raise ValueError("Not a WARC file? At %s: %s[%s]"%(fpos,
                        buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp))
     bob=bp # in case 1 or whole
     bp+=10
@@ -34,7 +35,6 @@
     state=1
     tr=None # Was this record truncated?
     while not buf.startswith(b'\r\n',bp):
-      print('yes',)
       eol=buf.index(b'\r\n',bp)+2
       if buf.startswith(b"Content-Length: ",bp):
         length=wl=int(buf[bp+16:eol-2])
@@ -44,18 +44,22 @@
       elif buf.startswith(b'WARC-Type: ',bp):
         wtype = bytes(buf[bp+11:eol-2])
       bp=eol
-    start_2=eol
+    start_2=eol+2
+    # need to read more if bp+length>hdrMax
     if (wtype in types):
       if whole:
-        buf[bp:(bp:=bp+ln)]=l
+        pass # buf[bp:(bp:=bp+ln)]=l
       elif (parts & 1):
-        callback(wtype,buf[:start_2],1)
+        callback(wtype,buf[bob:start_2],1)
       if parts==1:
-        start_2=0
+        stream.seek(fpos+(bp-bob)+length)
+        continue
       else:
         start_2=bp
     else:
-      start_2=0
+      print(fpos,bp,bp-bob,length)
+      stream.seek(fpos+(bp-bob)+length)
+      continue
     bv=memoryview(buf)[start_2:start_2+length]
     ii=0
     while True and not stream.closed: