Mercurial > hg > cc > cirrus_work
comparison bin/warc.py @ 49:699ef141af10
just barely working for 1, need to rethink buffering
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 06 Jul 2023 14:53:28 +0100 |
parents | d0d2fd9830d6 |
children | 55943918794e |
Comparison legend:
- equal
- deleted
- inserted
- replaced
48:d0d2fd9830d6 | 49:699ef141af10 |
---|---|
17 hdrMax=16*1024 | 17 hdrMax=16*1024 |
18 buf=bytearray(bufSize) | 18 buf=bytearray(bufSize) |
19 hdrBuf=memoryview(buf)[:hdrMax] | 19 hdrBuf=memoryview(buf)[:hdrMax] |
20 while not stream.closed: | 20 while not stream.closed: |
21 bp=0 | 21 bp=0 |
22 fpos=stream.tell() | |
22 bl=stream.readinto(hdrBuf) | 23 bl=stream.readinto(hdrBuf) |
23 if bl==0: | 24 if bl==0: |
24 break | 25 break |
25 while buf.startswith(b'\r\n',bp): | 26 while buf.startswith(b'\r\n',bp): |
26 bp+=2 | 27 bp+=2 |
27 if not buf.startswith(b'WARC/1.0\r\n',bp): | 28 if not buf.startswith(b'WARC/1.0\r\n',bp): |
28 raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp, | 29 raise ValueError("Not a WARC file? At %s: %s[%s]"%(fpos, |
29 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp)) | 30 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp)) |
30 bob=bp # in case 1 or whole | 31 bob=bp # in case 1 or whole |
31 bp+=10 | 32 bp+=10 |
32 wtype=None | 33 wtype=None |
33 length=None | 34 length=None |
34 state=1 | 35 state=1 |
35 tr=None # Was this record truncated? | 36 tr=None # Was this record truncated? |
36 while not buf.startswith(b'\r\n',bp): | 37 while not buf.startswith(b'\r\n',bp): |
37 print('yes',) | |
38 eol=buf.index(b'\r\n',bp)+2 | 38 eol=buf.index(b'\r\n',bp)+2 |
39 if buf.startswith(b"Content-Length: ",bp): | 39 if buf.startswith(b"Content-Length: ",bp): |
40 length=wl=int(buf[bp+16:eol-2]) | 40 length=wl=int(buf[bp+16:eol-2]) |
41 elif buf.startswith(b"WARC-Truncated: ",bp): | 41 elif buf.startswith(b"WARC-Truncated: ",bp): |
42 tr=l[bp+16:eol-2] | 42 tr=l[bp+16:eol-2] |
43 tr="EMPTY" if tr=="" else tr | 43 tr="EMPTY" if tr=="" else tr |
44 elif buf.startswith(b'WARC-Type: ',bp): | 44 elif buf.startswith(b'WARC-Type: ',bp): |
45 wtype = bytes(buf[bp+11:eol-2]) | 45 wtype = bytes(buf[bp+11:eol-2]) |
46 bp=eol | 46 bp=eol |
47 start_2=eol | 47 start_2=eol+2 |
48 # need to read more if bp+length>hdrMax | |
48 if (wtype in types): | 49 if (wtype in types): |
49 if whole: | 50 if whole: |
50 buf[bp:(bp:=bp+ln)]=l | 51 pass # buf[bp:(bp:=bp+ln)]=l |
51 elif (parts & 1): | 52 elif (parts & 1): |
52 callback(wtype,buf[:start_2],1) | 53 callback(wtype,buf[bob:start_2],1) |
53 if parts==1: | 54 if parts==1: |
54 start_2=0 | 55 stream.seek(fpos+(bp-bob)+length) |
56 continue | |
55 else: | 57 else: |
56 start_2=bp | 58 start_2=bp |
57 else: | 59 else: |
58 start_2=0 | 60 print(fpos,bp,bp-bob,length) |
61 stream.seek(fpos+(bp-bob)+length) | |
62 continue | |
59 bv=memoryview(buf)[start_2:start_2+length] | 63 bv=memoryview(buf)[start_2:start_2+length] |
60 ii=0 | 64 ii=0 |
61 while True and not stream.closed: | 65 while True and not stream.closed: |
62 if (i:=stream.readinto(bv))==0: | 66 if (i:=stream.readinto(bv))==0: |
63 break | 67 break |