comparison bin/warc.py @ 53:0dc144bd027c

made 1 mean 1, still losing after a while
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 07 Jul 2023 19:04:16 +0100
parents c0b4359dd26a
children 9c63039a9b6d
comparison
equal deleted inserted replaced
52:e20c64917805 53:0dc144bd027c
16 bufSize=2*1024*1024 16 bufSize=2*1024*1024
17 hdrMax=16*1024 17 hdrMax=16*1024
18 buf=bytearray(bufSize) 18 buf=bytearray(bufSize)
19 hdrBuf=memoryview(buf)[:hdrMax] 19 hdrBuf=memoryview(buf)[:hdrMax]
20 fpos=bl=stream.readinto(hdrBuf) 20 fpos=bl=stream.readinto(hdrBuf)
21 bob=0
22 while True: 21 while True:
23 bp=0 22 bp=0
24 while buf.startswith(b'\r\n',bp): 23 while buf.startswith(b'\r\n',bp):
25 bp+=2 24 bp+=2
25 bob=bp
26 if not buf.startswith(b'WARC/1.0\r\n',bp): 26 if not buf.startswith(b'WARC/1.0\r\n',bp):
27 raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp, 27 raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp,
28 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp)) 28 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp))
29 bp+=10 29 bp+=10
30 wtype=None 30 wtype=None
46 if (wtype in types): 46 if (wtype in types):
47 if whole: 47 if whole:
48 pass # buf[bp:(bp:=bp+ln)]=l 48 pass # buf[bp:(bp:=bp+ln)]=l
49 elif (parts & 1): 49 elif (parts & 1):
50 print('cb') 50 print('cb')
51 OUT=callback(wtype,buf[bp:eol+length],1) 51 OUT=callback(wtype,buf[bob:eol],1)
52 if parts!=1: 52 if parts!=1:
53 # everything from bv= goes here 53 # everything from bv= goes here
54 # need to read more if eol+length>hdrMax 54 # need to read more if eol+length>hdrMax
55 pass 55 pass
56 print(wtype,bob,eol,length) 56 print(wtype,bob,eol,length)