comparison bin/warc.py @ 49:699ef141af10

just barely working for 1, need to rethink buffering
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 06 Jul 2023 14:53:28 +0100
parents d0d2fd9830d6
children 55943918794e
comparison
equal deleted inserted replaced
48:d0d2fd9830d6 49:699ef141af10
17 hdrMax=16*1024 17 hdrMax=16*1024
18 buf=bytearray(bufSize) 18 buf=bytearray(bufSize)
19 hdrBuf=memoryview(buf)[:hdrMax] 19 hdrBuf=memoryview(buf)[:hdrMax]
20 while not stream.closed: 20 while not stream.closed:
21 bp=0 21 bp=0
22 fpos=stream.tell()
22 bl=stream.readinto(hdrBuf) 23 bl=stream.readinto(hdrBuf)
23 if bl==0: 24 if bl==0:
24 break 25 break
25 while buf.startswith(b'\r\n',bp): 26 while buf.startswith(b'\r\n',bp):
26 bp+=2 27 bp+=2
27 if not buf.startswith(b'WARC/1.0\r\n',bp): 28 if not buf.startswith(b'WARC/1.0\r\n',bp):
28 raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp, 29 raise ValueError("Not a WARC file? At %s: %s[%s]"%(fpos,
29 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp)) 30 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp))
30 bob=bp # in case 1 or whole 31 bob=bp # in case 1 or whole
31 bp+=10 32 bp+=10
32 wtype=None 33 wtype=None
33 length=None 34 length=None
34 state=1 35 state=1
35 tr=None # Was this record truncated? 36 tr=None # Was this record truncated?
36 while not buf.startswith(b'\r\n',bp): 37 while not buf.startswith(b'\r\n',bp):
37 print('yes',)
38 eol=buf.index(b'\r\n',bp)+2 38 eol=buf.index(b'\r\n',bp)+2
39 if buf.startswith(b"Content-Length: ",bp): 39 if buf.startswith(b"Content-Length: ",bp):
40 length=wl=int(buf[bp+16:eol-2]) 40 length=wl=int(buf[bp+16:eol-2])
41 elif buf.startswith(b"WARC-Truncated: ",bp): 41 elif buf.startswith(b"WARC-Truncated: ",bp):
42 tr=l[bp+16:eol-2] 42 tr=l[bp+16:eol-2]
43 tr="EMPTY" if tr=="" else tr 43 tr="EMPTY" if tr=="" else tr
44 elif buf.startswith(b'WARC-Type: ',bp): 44 elif buf.startswith(b'WARC-Type: ',bp):
45 wtype = bytes(buf[bp+11:eol-2]) 45 wtype = bytes(buf[bp+11:eol-2])
46 bp=eol 46 bp=eol
47 start_2=eol 47 start_2=eol+2
48 # need to read more if bp+length>hdrMax
48 if (wtype in types): 49 if (wtype in types):
49 if whole: 50 if whole:
50 buf[bp:(bp:=bp+ln)]=l 51 pass # buf[bp:(bp:=bp+ln)]=l
51 elif (parts & 1): 52 elif (parts & 1):
52 callback(wtype,buf[:start_2],1) 53 callback(wtype,buf[bob:start_2],1)
53 if parts==1: 54 if parts==1:
54 start_2=0 55 stream.seek(fpos+(bp-bob)+length)
56 continue
55 else: 57 else:
56 start_2=bp 58 start_2=bp
57 else: 59 else:
58 start_2=0 60 print(fpos,bp,bp-bob,length)
61 stream.seek(fpos+(bp-bob)+length)
62 continue
59 bv=memoryview(buf)[start_2:start_2+length] 63 bv=memoryview(buf)[start_2:start_2+length]
60 ii=0 64 ii=0
61 while True and not stream.closed: 65 while True and not stream.closed:
62 if (i:=stream.readinto(bv))==0: 66 if (i:=stream.readinto(bv))==0:
63 break 67 break