Mercurial > hg > cc > cirrus_work
changeset 62:11cbaee8bbc8
Test 2 works with parts=1,2,3.
Tests 3 and 4 work;
Test 1 works with parts=1, gives correct output for warcinfo and metadata with parts=1,2,3.
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 14 Jul 2023 17:38:54 +0100 |
parents | f182d09ad1cd |
children | 9837840f3328 |
files | bin/warc.py |
diffstat | 1 files changed, 52 insertions(+), 54 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/warc.py Fri Jul 14 12:08:09 2023 +0100 +++ b/bin/warc.py Fri Jul 14 17:38:54 2023 +0100 @@ -6,11 +6,17 @@ import sys,io from isal import igzip +RESP = b'response' +REQ = b'request' +META = b'metadata' +INFO = b'warcinfo' + def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False): '''parts is a bit-mask: 1 for warc header; 2 for req/resp HTTP header, warcinfo/metadata features; 4 for req/resp body''' + # should do some sanity checking wrt parts and types types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types] nb=0 if filename.endswith(".gz"): @@ -52,13 +58,13 @@ tr=bytes(bufView[bp+16:eol-2]) elif buf.startswith(b'WARC-Type: ',bp): if buf.startswith(b's',bp+13): - wtype = b'response' + wtype = RESP elif buf.startswith(b'q',bp+13): - wtype = b'request' + wtype = REQ elif buf.startswith(b'm',bp+11): - wtype = b'metadata' + wtype = META elif buf.startswith(b'w',bp+11): - wtype = b'warcinfo' + wtype = INFO else: raise ValueError("Unknown WARC-Type: %s at %s"%( bytes(bufView[bp+11:eol-2]), @@ -101,6 +107,7 @@ if wtype not in types: continue if (wtype in types): + # Output whole or part 1 as required if whole: bp+=length OUT=callback(wtype,bufView[start_1:bp],7) @@ -108,56 +115,47 @@ elif (parts & 1): OUT=callback(wtype,bufView[start_1:eol],1) if parts!=1: - bv=bufView[start_2:start_2+length] - ii=0 - while True and not stream.closed: - if (i:=stream.readinto(bv))==0: - break - ii+=i - if ii>=length: - break - bv=memoryview(buf)[start_2+ii:start_2+length] - if ii!=length: - raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length)) - nb+=length - if wtype in types: - if whole: - callback(wtype,bufView[0:start_2+length],7) - continue - # Only output parts (1 = WARC header, 2 = HTTP header, 4 = body) that are wanted - bl=None # for HTTP Content-Length for the length of the body? - L_start=start_2 - state=2 - bv=memoryview(buf)[start_2:start_2+length] - with io.BytesIO(bv) as rec_text: - for L in rec_text: - if state==2: - # HTTP header - wl -= len(L) - if not (L==b"" or L.startswith(b"\r")): - # Non-empty, it's (a continuation of) a header - if bl is None and L.startswith(b"Content-Length: "): - bl=int(L[16:].rstrip()) - else: - # Blank line, HTTP header is finished - if parts & 2: - callback(wtype,bufView[start_2:start_2+L_start],2) - state=4 - # The above is just for sanity, because we do _not_ - # continue with the outer loop, - # since we can now block-output the entire rest of the - # input buffer. - if bl is not None: - if bl!=wl: - print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ - (length,offset,filename,wl,bl,tr),file=sys.stderr) - # HTTP body - balance=start_2+rec_text.tell() - #print(balance,bl,wl,ll,ll-balance,file=sys.stderr) - # Output whatever is left - if parts & 4: - callback(wtype,bufView[balance:balance+wl],4) - state=1 + while buf.startswith(b'\r\n',bp): + bp+=2 + start_2=bp + eob=bp+length + while buf.startswith(b'\r\n',eob-2): + eob-=2 + bv=bufView[start_2:eob] + # Only output parts (2 = HTTP header, 4 = body) that are wanted + if parts & 2: + if wtype is META or wtype is INFO: + # rest of the part + OUT=callback(wtype,bv,2) + if parts & 4: + for L in rec_text: + if state==2: + # HTTP header + wl -= len(L) + if not (L==b"" or L.startswith(b"\r")): + # Non-empty, it's (a continuation of) a header + if bl is None and L.startswith(b"Content-Length: "): + bl=int(L[16:].rstrip()) + else: + # Blank line, HTTP header is finished + if parts & 2: + callback(wtype,bufView[start_2:start_2+L_start],2) + state=4 + # The above is just for sanity, because we do _not_ + # continue with the outer loop, + # since we can now block-output the entire rest of the + # input buffer. + if bl is not None: + if bl!=wl: + print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ + (length,offset,filename,wl,bl,tr),file=sys.stderr) + # HTTP body + balance=start_2+rec_text.tell() + #print(balance,bl,wl,ll,ll-balance,file=sys.stderr) + # Output whatever is left + if parts & 4: + callback(wtype,bufView[balance:balance+wl],4) + state=1 L_start=rec_text.tell() bp+=length