Mercurial > hg > cc > cirrus_work
comparison bin/warc.py @ 50:55943918794e
a bit better
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 07 Jul 2023 13:39:23 +0100 |
parents | 699ef141af10 |
children | c0b4359dd26a |
comparison
equal
deleted
inserted
replaced
49:699ef141af10 | 50:55943918794e |
---|---|
15 stream=open(filename,'rb',0) | 15 stream=open(filename,'rb',0) |
16 bufSize=2*1024*1024 | 16 bufSize=2*1024*1024 |
17 hdrMax=16*1024 | 17 hdrMax=16*1024 |
18 buf=bytearray(bufSize) | 18 buf=bytearray(bufSize) |
19 hdrBuf=memoryview(buf)[:hdrMax] | 19 hdrBuf=memoryview(buf)[:hdrMax] |
20 while not stream.closed: | 20 fpos=0 |
21 bl=stream.readinto(hdrBuf) | |
22 while True: | |
21 bp=0 | 23 bp=0 |
22 fpos=stream.tell() | |
23 bl=stream.readinto(hdrBuf) | |
24 if bl==0: | |
25 break | |
26 while buf.startswith(b'\r\n',bp): | 24 while buf.startswith(b'\r\n',bp): |
27 bp+=2 | 25 bp+=2 |
28 if not buf.startswith(b'WARC/1.0\r\n',bp): | 26 if not buf.startswith(b'WARC/1.0\r\n',bp): |
29 raise ValueError("Not a WARC file? At %s: %s[%s]"%(fpos, | 27 raise ValueError("Not a WARC file? At %s: %s[%s]"%(fpos, |
30 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp)) | 28 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp)) |
31 bob=bp # in case 1 or whole | 29 bob=bp # in case 1 or whole |
32 bp+=10 | 30 bp+=10 |
33 wtype=None | 31 wtype=None |
34 length=None | 32 length=None |
35 state=1 | 33 state=1 |
34 done=False | |
36 tr=None # Was this record truncated? | 35 tr=None # Was this record truncated? |
37 while not buf.startswith(b'\r\n',bp): | 36 while not buf.startswith(b'\r\n',bp): |
38 eol=buf.index(b'\r\n',bp)+2 | 37 eol=buf.index(b'\r\n',bp)+2 |
39 if buf.startswith(b"Content-Length: ",bp): | 38 if buf.startswith(b"Content-Length: ",bp): |
40 length=wl=int(buf[bp+16:eol-2]) | 39 length=wl=int(buf[bp+16:eol-2]) |
42 tr=l[bp+16:eol-2] | 41 tr=l[bp+16:eol-2] |
43 tr="EMPTY" if tr=="" else tr | 42 tr="EMPTY" if tr=="" else tr |
44 elif buf.startswith(b'WARC-Type: ',bp): | 43 elif buf.startswith(b'WARC-Type: ',bp): |
45 wtype = bytes(buf[bp+11:eol-2]) | 44 wtype = bytes(buf[bp+11:eol-2]) |
46 bp=eol | 45 bp=eol |
47 start_2=eol+2 | 46 start_2=bp=eol+2 |
48 # need to read more if bp+length>hdrMax | 47 # need to read more if bp+length>hdrMax |
49 if (wtype in types): | 48 if (wtype in types): |
50 if whole: | 49 if whole: |
51 pass # buf[bp:(bp:=bp+ln)]=l | 50 pass # buf[bp:(bp:=bp+ln)]=l |
52 elif (parts & 1): | 51 elif (parts & 1): |
53 callback(wtype,buf[bob:start_2],1) | 52 print('cb') |
54 if parts==1: | 53 OUT=callback(wtype,buf[bob:eol],1) |
55 stream.seek(fpos+(bp-bob)+length) | 54 sys.stdout.flush() |
56 continue | 55 if parts!=1: |
57 else: | 56 # everything from bv= goes here |
58 start_2=bp | 57 pass |
59 else: | 58 print(wtype,fpos,bp,bp-bob,length) |
60 print(fpos,bp,bp-bob,length) | 59 stream.seek(fpos:=fpos+(bp-bob)+length) |
61 stream.seek(fpos+(bp-bob)+length) | 60 print(fpos) |
62 continue | 61 if done: |
62 return | |
63 buf[0:hdrMax-fpos]=buf[fpos:hdrMax] | |
64 n=stream.readinto(memoryview(buf)[fpos:hdrMax]) | |
65 if n<hdrMax-fpos or n==0: | |
66 done=True | |
67 #while not buf.startswith(b'\r\n',bp): | |
68 OUT.write(b"=====\n") | |
69 OUT.write(buf[0:100]) | |
70 continue | |
71 return | |
63 bv=memoryview(buf)[start_2:start_2+length] | 72 bv=memoryview(buf)[start_2:start_2+length] |
64 ii=0 | 73 ii=0 |
65 while True and not stream.closed: | 74 while True and not stream.closed: |
66 if (i:=stream.readinto(bv))==0: | 75 if (i:=stream.readinto(bv))==0: |
67 break | 76 break |