Mercurial > hg > cc > cirrus_work
comparison bin/warc.py @ 67:b8d4a5ede7a3
fix eof bug, expand error messages
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 20 Jul 2023 10:32:55 +0100 |
parents | 75f1d3bc60d9 |
children |
comparison
equal
deleted
inserted
replaced
66:75f1d3bc60d9 | 67:b8d4a5ede7a3 |
---|---|
29 bufView=memoryview(buf) | 29 bufView=memoryview(buf) |
30 fpos=bl=stream.readinto(buf) | 30 fpos=bl=stream.readinto(buf) |
31 bp=0 | 31 bp=0 |
32 done=False | 32 done=False |
33 while True: | 33 while True: |
34 while buf.startswith(b'\r\n',bp): # will Fail if buffer (nearly) empty | 34 while buf.startswith(b'\r\n',bp,bl): # will Fail if buffer (nearly) empty |
35 bp+=2 | 35 bp+=2 |
36 start_1=bp | 36 start_1=bp |
37 if not buf.startswith(b'WARC/1.0\r\n',bp): | 37 if not buf.startswith(b'WARC/1.0\r\n',bp): |
38 if done and bl-bp==0: | 38 if done and bl-bp==0: |
39 # really done | 39 # really done |
40 return | 40 return |
41 raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp, | 41 raise ValueError("Not a WARC file? In %s at %s of %s (%s): %s[%s]"%(filename, |
42 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp)) | 42 bp,bl,fpos, |
| | 43 (buf[bp:min(bl,bp+20)] if bp<bl else buf[bl-20:bl]).decode('latin-1'), |
| | 44 bl-bp)) |
43 bp+=10 | 45 bp+=10 |
44 wtype=None | 46 wtype=None |
45 length=None | 47 length=None |
46 state=1 | 48 state=1 |
47 tr=None # Was this record truncated? | 49 tr=None # Was this record truncated? |
64 elif buf.startswith(b'm',bp+11): | 66 elif buf.startswith(b'm',bp+11): |
65 wtype = META | 67 wtype = META |
66 elif buf.startswith(b'w',bp+11): | 68 elif buf.startswith(b'w',bp+11): |
67 wtype = INFO | 69 wtype = INFO |
68 else: | 70 else: |
69 raise ValueError("Unknown WARC-Type: %s at %s"%( | 71 raise ValueError("Unknown WARC-Type: %s in %s at %s"%( |
70 bytes(bufView[bp+11:eol-2]), | 72 bytes(bufView[bp+11:eol-2]),filename, |
71 fpos-(bl-bp))) | 73 fpos-(bl-bp))) |
72 bp=eol | 74 bp=eol |
73 bp=eol+2 | 75 bp=eol+2 |
74 if done: | 76 if done: |
75 if (bp+length)>bl: | 77 if (bp+length)>bl: |
76 raise ValueError("Done but need more! %s + %s > %s"%(bp, | 78 raise ValueError("Done but need more! %s + %s > %s in %s"%(bp, |
77 length,bl)) | 79 length,bl,filename)) |
78 elif (bp+(length+hdrMax))>bl: | 80 elif (bp+(length+hdrMax))>bl: |
79 # Need more data | 81 # Need more data |
80 if wtype in types: | 82 if wtype in types: |
81 # we need to keep from start_1 to bl | 83 # we need to keep from start_1 to bl |
82 keepFrom=start_1 | 84 keepFrom=start_1 |
161 state=1 | 163 state=1 |
162 | 164 |
163 L_start=rec_text.tell() | 165 L_start=rec_text.tell() |
164 bp+=length | 166 bp+=length |
165 #print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr) | 167 #print('end of loop',wtype,start_1,bp,eol,length,bl,file=sys.stderr) |
166 #while not buf.startswith(b'\r\n',bp): | |
167 continue |