changeset 57:61b0a1582af8

works with all types, part=1
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 12 Jul 2023 18:48:27 +0100
parents f8c8f79b2532
children 299e3d0f2310
files bin/warc.py
diffstat 1 files changed, 15 insertions(+), 16 deletions(-) [+]
line wrap: on
line diff
--- a/bin/warc.py	Mon Jul 10 19:52:18 2023 +0100
+++ b/bin/warc.py	Wed Jul 12 18:48:27 2023 +0100
@@ -42,30 +42,34 @@
       if buf.startswith(b"Content-Length: ",bp):
         length=wl=int(buf[bp+16:eol-2])
       elif buf.startswith(b"WARC-Truncated: ",bp):
-        tr=l[bp+16:eol-2]
-        tr="EMPTY" if tr=="" else tr
+        tr=bytes(buf[bp+16:eol-2])
+        tr=b"EMPTY" if tr==b"" else tr
       elif buf.startswith(b'WARC-Type: ',bp):
         wtype = bytes(buf[bp+11:eol-2])
       bp=eol
     bp=eol+2
-    if (bp+length)>bl:
-      if done:
-        raise ValueError("Done but need more! %s + %s > %s",bp,length,bl)
+    if done:
+      if (bp+length)>bl:
+        raise ValueError("Done but need more! %s + %s > %s"%(bp,
+                         length,bl))
+    elif (bp+(length+hdrMax))>bl:
       # Need more data
       if wtype in types:
         # we need to keep from start_1 to bl
         keepFrom=start_1
         keepLen=bl-keepFrom
-        buf[0:keepLen]=buf[keepFrom,bl]
+        buf[0:keepLen]=buf[keepFrom:bl]
       else:
         # we can skip the rest of this part
         keepLen=0
-        fpos=stream.seek(fpos+(pb+length-bl))
-      spaceToFill=bufMax-keepLen
-      with memoryview(buf)[keepLen:bufMax] as xBuf:
+        fpos=stream.seek(fpos+(bp+length-bl))
+      spaceToFill=bufSize-keepLen
+      with memoryview(buf)[keepLen:bufSize] as xBuf:
         nb=stream.readinto(xBuf)
       fpos+=nb
-      bp=keepLen
+      eol=eol-start_1
+      start_1=0
+      bp=eol+2
       bl=keepLen+nb
       if nb<spaceToFill:
         done=True
@@ -81,14 +85,9 @@
         pass
       else:
         bp+=length
-    print('end of loop',wtype,start_1,bp,eol,length,file=sys.stderr)
+    print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr)
     #while not buf.startswith(b'\r\n',bp):
-    OUT.write(b"=====\n")
-    OUT.write(buf[0:100])
-    if not buf[99]==10:
-      OUT.write(b"\n")
     continue
-    return
     bv=memoryview(buf)[start_2:start_2+length]
     ii=0
     while True and not stream.closed: