diff bin/warc.py @ 67:b8d4a5ede7a3

Fix EOF bug, expand error messages
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 20 Jul 2023 10:32:55 +0100
parents 75f1d3bc60d9
children
line wrap: on
line diff
--- a/bin/warc.py	Wed Jul 19 13:20:46 2023 +0100
+++ b/bin/warc.py	Thu Jul 20 10:32:55 2023 +0100
@@ -31,15 +31,17 @@
   bp=0
   done=False
   while True:
-    while buf.startswith(b'\r\n',bp): # will Fail if buffer (nearly) empty 
+    while buf.startswith(b'\r\n',bp,bl): # will Fail if buffer (nearly) empty
       bp+=2
     start_1=bp
     if not buf.startswith(b'WARC/1.0\r\n',bp):
       if done and bl-bp==0:
         # really done
         return
-      raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp,
-                       buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp))
+      raise ValueError("Not a WARC file? In %s at %s of %s (%s): %s[%s]"%(filename,
+                                                                   bp,bl,fpos,
+         (buf[bp:min(bl,bp+20)] if bp<bl else buf[bl-20:bl]).decode('latin-1'),
+                                                                     bl-bp))
     bp+=10
     wtype=None
     length=None
@@ -66,15 +68,15 @@
         elif buf.startswith(b'w',bp+11):
           wtype = INFO
         else:
-          raise ValueError("Unknown WARC-Type: %s at %s"%(
-                             bytes(bufView[bp+11:eol-2]),
+          raise ValueError("Unknown WARC-Type: %s in %s at %s"%(
+                             bytes(bufView[bp+11:eol-2]),filename,
                              fpos-(bl-bp)))
       bp=eol
     bp=eol+2
     if done:
       if (bp+length)>bl:
-        raise ValueError("Done but need more! %s + %s > %s"%(bp,
-                         length,bl))
+        raise ValueError("Done but need more! %s + %s > %s in %s"%(bp,
+                         length,bl,filename))
     elif (bp+(length+hdrMax))>bl:
       # Need more data
       if wtype in types:
@@ -163,5 +165,3 @@
               L_start=rec_text.tell()
     bp+=length
     #print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr)
-    #while not buf.startswith(b'\r\n',bp):
-    continue