Mercurial > hg > cc > cirrus_work
comparison bin/warc.py @ 51:c0b4359dd26a
working better, gets confused by 3-part response
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 07 Jul 2023 17:03:52 +0100 |
parents | 55943918794e |
children | 0dc144bd027c |
comparison
equal
deleted
inserted
replaced
50:55943918794e | 51:c0b4359dd26a |
---|---|
15 stream=open(filename,'rb',0) | 15 stream=open(filename,'rb',0) |
16 bufSize=2*1024*1024 | 16 bufSize=2*1024*1024 |
17 hdrMax=16*1024 | 17 hdrMax=16*1024 |
18 buf=bytearray(bufSize) | 18 buf=bytearray(bufSize) |
19 hdrBuf=memoryview(buf)[:hdrMax] | 19 hdrBuf=memoryview(buf)[:hdrMax] |
20 fpos=0 | 20 fpos=bl=stream.readinto(hdrBuf) |
21 bl=stream.readinto(hdrBuf) | 21 bob=0 |
22 while True: | 22 while True: |
23 bp=0 | 23 bp=0 |
24 while buf.startswith(b'\r\n',bp): | 24 while buf.startswith(b'\r\n',bp): |
25 bp+=2 | 25 bp+=2 |
26 if not buf.startswith(b'WARC/1.0\r\n',bp): | 26 if not buf.startswith(b'WARC/1.0\r\n',bp): |
27 raise ValueError("Not a WARC file? At %s: %s[%s]"%(fpos, | 27 raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp, |
28 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp)) | 28 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp)) |
29 bob=bp # in case 1 or whole | |
30 bp+=10 | 29 bp+=10 |
31 wtype=None | 30 wtype=None |
32 length=None | 31 length=None |
33 state=1 | 32 state=1 |
34 done=False | 33 done=False |
41 tr=l[bp+16:eol-2] | 40 tr=l[bp+16:eol-2] |
42 tr="EMPTY" if tr=="" else tr | 41 tr="EMPTY" if tr=="" else tr |
43 elif buf.startswith(b'WARC-Type: ',bp): | 42 elif buf.startswith(b'WARC-Type: ',bp): |
44 wtype = bytes(buf[bp+11:eol-2]) | 43 wtype = bytes(buf[bp+11:eol-2]) |
45 bp=eol | 44 bp=eol |
46 start_2=bp=eol+2 | 45 bp=eol+2 |
47 # need to read more if bp+length>hdrMax | |
48 if (wtype in types): | 46 if (wtype in types): |
49 if whole: | 47 if whole: |
50 pass # buf[bp:(bp:=bp+ln)]=l | 48 pass # buf[bp:(bp:=bp+ln)]=l |
51 elif (parts & 1): | 49 elif (parts & 1): |
52 print('cb') | 50 print('cb') |
53 OUT=callback(wtype,buf[bob:eol],1) | 51 OUT=callback(wtype,buf[bp:eol+length],1) |
54 sys.stdout.flush() | |
55 if parts!=1: | 52 if parts!=1: |
56 # everything from bv= goes here | 53 # everything from bv= goes here |
54 # need to read more if eol+length>hdrMax | |
57 pass | 55 pass |
58 print(wtype,fpos,bp,bp-bob,length) | 56 print(wtype,bob,eol,length) |
59 stream.seek(fpos:=fpos+(bp-bob)+length) | 57 jumpTo=bp+length |
60 print(fpos) | 58 buf[0:hdrMax-jumpTo]=buf[jumpTo:hdrMax] |
59 stream.seek(fpos:=fpos+jumpTo) | |
61 if done: | 60 if done: |
62 return | 61 continue |
63 buf[0:hdrMax-fpos]=buf[fpos:hdrMax] | 62 n=stream.readinto(memoryview(buf)[hdrMax-jumpTo:hdrMax]) |
64 n=stream.readinto(memoryview(buf)[fpos:hdrMax]) | 63 print('read',n) |
65 if n<hdrMax-fpos or n==0: | 64 if n<jumpTo or n==0: |
65 print('done',n,jumpTo) | |
66 done=True | 66 done=True |
67 bp=0 | |
67 #while not buf.startswith(b'\r\n',bp): | 68 #while not buf.startswith(b'\r\n',bp): |
68 OUT.write(b"=====\n") | 69 OUT.write(b"=====\n") |
69 OUT.write(buf[0:100]) | 70 OUT.write(buf[0:100]) |
70 continue | 71 continue |
71 return | 72 return |