comparison bin/warc.py @ 67:b8d4a5ede7a3

fix eof bug, expand error messages
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 20 Jul 2023 10:32:55 +0100
parents 75f1d3bc60d9
children
comparison
equal deleted inserted replaced
66:75f1d3bc60d9 67:b8d4a5ede7a3
29 bufView=memoryview(buf) 29 bufView=memoryview(buf)
30 fpos=bl=stream.readinto(buf) 30 fpos=bl=stream.readinto(buf)
31 bp=0 31 bp=0
32 done=False 32 done=False
33 while True: 33 while True:
34 while buf.startswith(b'\r\n',bp): # will Fail if buffer (nearly) empty 34 while buf.startswith(b'\r\n',bp,bl): # will Fail if buffer (nearly) empty
35 bp+=2 35 bp+=2
36 start_1=bp 36 start_1=bp
37 if not buf.startswith(b'WARC/1.0\r\n',bp): 37 if not buf.startswith(b'WARC/1.0\r\n',bp):
38 if done and bl-bp==0: 38 if done and bl-bp==0:
39 # really done 39 # really done
40 return 40 return
41 raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp, 41 raise ValueError("Not a WARC file? In %s at %s of %s (%s): %s[%s]"%(filename,
42 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp)) 42 bp,bl,fpos,
43 (buf[bp:min(bl,bp+20)] if bp<bl else buf[bl-20:bl]).decode('latin-1'),
44 bl-bp))
43 bp+=10 45 bp+=10
44 wtype=None 46 wtype=None
45 length=None 47 length=None
46 state=1 48 state=1
47 tr=None # Was this record truncated? 49 tr=None # Was this record truncated?
64 elif buf.startswith(b'm',bp+11): 66 elif buf.startswith(b'm',bp+11):
65 wtype = META 67 wtype = META
66 elif buf.startswith(b'w',bp+11): 68 elif buf.startswith(b'w',bp+11):
67 wtype = INFO 69 wtype = INFO
68 else: 70 else:
69 raise ValueError("Unknown WARC-Type: %s at %s"%( 71 raise ValueError("Unknown WARC-Type: %s in %s at %s"%(
70 bytes(bufView[bp+11:eol-2]), 72 bytes(bufView[bp+11:eol-2]),filename,
71 fpos-(bl-bp))) 73 fpos-(bl-bp)))
72 bp=eol 74 bp=eol
73 bp=eol+2 75 bp=eol+2
74 if done: 76 if done:
75 if (bp+length)>bl: 77 if (bp+length)>bl:
76 raise ValueError("Done but need more! %s + %s > %s"%(bp, 78 raise ValueError("Done but need more! %s + %s > %s in %s"%(bp,
77 length,bl)) 79 length,bl,filename))
78 elif (bp+(length+hdrMax))>bl: 80 elif (bp+(length+hdrMax))>bl:
79 # Need more data 81 # Need more data
80 if wtype in types: 82 if wtype in types:
81 # we need to keep from start_1 to bl 83 # we need to keep from start_1 to bl
82 keepFrom=start_1 84 keepFrom=start_1
161 state=1 163 state=1
162 164
163 L_start=rec_text.tell() 165 L_start=rec_text.tell()
164 bp+=length 166 bp+=length
165 #print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr) 167 #print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr)
166 #while not buf.startswith(b'\r\n',bp):
167 continue