Mercurial > hg > cc > cirrus_work
comparison bin/warc.py @ 48:d0d2fd9830d6
starting on conversion to direct-querying of buffer
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 06 Jul 2023 13:27:33 +0100 |
parents | 44d3a4f4ea51 |
children | 699ef141af10 |
comparison
equal
deleted
inserted
replaced
47:b59f49909bda | 48:d0d2fd9830d6 |
---|---|
11 nb=0 | 11 nb=0 |
12 if filename.endswith(".gz"): | 12 if filename.endswith(".gz"): |
13 stream=igzip.IGzipFile(filename=filename) | 13 stream=igzip.IGzipFile(filename=filename) |
14 else: | 14 else: |
15 stream=open(filename,'rb',0) | 15 stream=open(filename,'rb',0) |
16 bufsize=2*1024*1024 | 16 bufSize=2*1024*1024 |
17 buf=bytearray(bufsize) | 17 hdrMax=16*1024 |
18 l=b'\r\n' | 18 buf=bytearray(bufSize) |
19 hdrBuf=memoryview(buf)[:hdrMax] | |
19 while not stream.closed: | 20 while not stream.closed: |
20 bp=0 | 21 bp=0 |
21 while l==b'\r\n': | 22 bl=stream.readinto(hdrBuf) |
22 l=stream.readline() | 23 if bl==0: |
23 nb+=(ln:=len(l)) | |
24 if ln==0: | |
25 break | 24 break |
26 if l!=b'WARC/1.0\r\n': | 25 while buf.startswith(b'\r\n',bp): |
27 raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l), | 26 bp+=2 |
28 l.decode('latin-1'),len(l))) | 27 if not buf.startswith(b'WARC/1.0\r\n',bp): |
28 raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp, | |
29 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp)) | |
30 bob=bp # in case 1 or whole | |
31 bp+=10 | |
29 wtype=None | 32 wtype=None |
30 length=None | 33 length=None |
31 state=1 | 34 state=1 |
32 tr=None # Was this record truncated? | 35 tr=None # Was this record truncated? |
33 while l!=b'\r\n': | 36 while not buf.startswith(b'\r\n',bp): |
34 # WARC header | 37 print('yes',) |
35 if parts & 1: | 38 eol=buf.index(b'\r\n',bp)+2 |
36 buf[bp:(bp:=bp+ln)]=l | 39 if buf.startswith(b"Content-Length: ",bp): |
37 l=stream.readline() | 40 length=wl=int(buf[bp+16:eol-2]) |
38 nb+=(ln:=len(l)) | 41 elif buf.startswith(b"WARC-Truncated: ",bp): |
39 if l.startswith(b"Content-Length: "): | 42 tr=l[bp+16:eol-2] |
40 length=wl=int(l[16:].rstrip()) | |
41 elif l.startswith(b"WARC-Truncated: "): | |
42 tr=l[16:].rstrip() | |
43 tr="EMPTY" if tr=="" else tr | 43 tr="EMPTY" if tr=="" else tr |
44 elif l.startswith(b'WARC-Type: '): | 44 elif buf.startswith(b'WARC-Type: ',bp): |
45 wtype = l[11:-2] | 45 wtype = bytes(buf[bp+11:eol-2]) |
46 start_2=bp | 46 bp=eol |
47 start_2=eol | |
47 if (wtype in types): | 48 if (wtype in types): |
48 if whole: | 49 if whole: |
49 buf[bp:(bp:=bp+ln)]=l | 50 buf[bp:(bp:=bp+ln)]=l |
50 elif (parts & 1): | 51 elif (parts & 1): |
51 callback(wtype,buf[:start_2],1) | 52 callback(wtype,buf[:start_2],1) |