Mercurial > hg > cc > cirrus_home
changeset 175:d123ef7fdb82
Working on implementing types and parts:
1, 2 and 4 are working; 3 is not.
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 03 Jul 2023 18:16:14 +0100 |
parents | bfe9085a1d39 |
children | 97137f5bbe0f |
files | bin/warc.py |
diffstat | 1 files changed, 84 insertions(+), 88 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/warc.py Tue Jan 10 17:48:26 2023 +0000 +++ b/bin/warc.py Mon Jul 03 18:16:14 2023 +0100 @@ -1,126 +1,122 @@ #!/usr/bin/env python3 -'''Stream a warc format file, invoking a callback on each part. -Callback can be limited by WARC-Type''' -import sys,os +'''Stream a warc format file, invoking a callback on each record. +Callback can be limited by WARC-Type, record part''' +import sys,os,io -def warc(callback,types=['response']): +if (debug:=(sys.argv[1]=='-d')): + sys.argv.pop(1) + +def warc(callback,types=['response'],parts=7): + types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types] nb=0 stream=open(sys.argv[1],'rb',0) bufsize=128*1024*1024 buf=bytearray(128*1024*1024) l=b'\r\n' while True: + bp=0 while l==b'\r\n': l=stream.readline() - nb+=len(l) + nb+=(ln:=len(l)) if l!=b'WARC/1.0\r\n': - if l==b'': - return raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l), l.decode('latin-1'),len(l))) wtype=None length=None + state=1 + tr=None # Was this record truncated? 
while l!=b'\r\n': + if parts & 1: + buf[bp:(bp:=bp+ln)]=l l=stream.readline() - nb+=len(l) - if l.startswith(b'WARC-Type: '): + nb+=(ln:=len(l)) + # WARC header + if l.startswith(b"Content-Length: "): + length=wl=int(l[16:].rstrip()) + elif l.startswith(b"WARC-Truncated: "): + tr=l[16:].rstrip() + tr="EMPTY" if tr=="" else tr + elif l.startswith(b'WARC-Type: '): wtype = l[11:-2] - elif l.startswith(b'Content-Length: '): - length = int(l[16:]) - bv=memoryview(buf)[:length] + start_2=bp + if (wtype in types) and (parts & 1): + if parts!=1: + buf[bp:(bp:=bp+ln)]=l + start_2=bp + if parts!=7: + callback(wtype,buf[:start_2],1) + else: + start_2=0 + bv=memoryview(buf)[start_2:start_2+length] ii=0 - while True: - i=stream.readinto(bv) + while True and not stream.closed: + if (i:=stream.readinto(bv))==0: + break ii+=i if ii>=length: break - bv=memoryview(buf)[ii:length] + bv=memoryview(buf)[start_2+ii:start_2+length] if ii!=length: raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length)) nb+=length + bv=memoryview(buf)[start_2:start_2+length] if wtype in types: - callback(wtype,memoryview(buf[:length])) - if whole and options.zipped: - _output(bv) - return - gzip_chunk = io.BytesIO(bv) - uv=memoryview(buf)[length:] - with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin: - ll=0 - while True: - l=gzip_fin.readinto(uv) - if not l: - break - ll+=l - cb=memoryview(uv)[:ll] - if whole: - _output(cb) - return - # Only output parts (0 = WARC header, 1 = HTTP header, 2 = body) that are wanted - state=0 - tr=None # Was this record truncated? - bl=None # for HTTP Content-Length for the length of the body? 
- with io.BytesIO(cb) as clear_text: - for L in clear_text: - if state==0: - # WARC header - if L.startswith(b"Content-Length: "): - wl=int(L[16:].rstrip()) - elif L.startswith(b"WARC-Truncated: "): - tr=L[16:].rstrip() - tr="EMPTY" if tr=="" else tr - elif L==b"" or L.startswith(b"\r"): # for idempotency - # Blank line, WARC header is finished - if not (options.headers or options.body): - return - state=1 - # Note we preserve the empty line - if options.warc: - _output(L) + if parts==7: + callback(wtype,memoryview(buf)[0:start_2+length],7) continue - if state==1: - # HTTP header - wl -= len(L) - if not (L==b"" or L.startswith(b"\r")): - # Non-blank, it's a header - if bl is None and L.startswith(b"Content-Length: "): - bl=int(L[16:].rstrip()) - if options.headers: - _output(L) - else: - # Blank line, HTTP header is finished - if not options.body: - return - if options.headers: - _output(L) - state=2 - # The above is just for sanity, because we do _not_ - # continue with the outer loop, - # since we can now block-output the entire rest of the - # input buffer. - if bl is not None: - if bl!=wl: - print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ - (length,offset,filename,wl,bl,tr),file=sys.stderr) - # HTTP body - balance=clear_text.tell() - #print(balance,bl,wl,ll,ll-balance,file=sys.stderr) - # Output whatever is left - _output(cb[balance:balance+wl]) - return - + # Only output parts (1 = WARC header, 2 = HTTP header, 4 = body) that are wanted + bl=None # for HTTP Content-Length for the length of the body? 
+ L_start=0 + state=2 + with io.BytesIO(bv) as rec_text: + for L in rec_text: + if state==2: + # HTTP header + wl -= len(L) + if not (L==b"" or L.startswith(b"\r")): + # Non-empty, it's (a continuation of) a header + if bl is None and L.startswith(b"Content-Length: "): + bl=int(L[16:].rstrip()) + else: + # Blank line, HTTP header is finished + if parts & 2: + callback(wtype,bv[start_2:L_start],2) + state=4 + # The above is just for sanity, because we do _not_ + # continue with the outer loop, + # since we can now block-output the entire rest of the + # input buffer. + if bl is not None: + if bl!=wl: + print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ + (length,offset,filename,wl,bl,tr),file=sys.stderr) + # HTTP body + balance=rec_text.tell() + #print(balance,bl,wl,ll,ll-balance,file=sys.stderr) + # Output whatever is left + if parts & 4: + callback(wtype,bv[balance:balance+wl],4) + state=1 + + L_start=rec_text.tell() OUT=open(sys.stdout.fileno(),'wb') import re LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) -def showmeLMH(wtype,buf): +def showmeLMH(wtype,buf,part=2): m=LMPAT.search(buf.tobytes(order='A')) if m: OUT.write(m[1]) OUT.write(b'\n') -def showme(wtype,buf): - OUT.write(buf) +def showme(wtype,buf,part): + if debug: + breakpoint() + OUT.write(b"%d\n%b"%(part,buf)) -warc(showmeLMH,[b'response']) +#warc(showmeLMH,[b'response'],2) + +#warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2])) + +warc(showme,[b'response'],int(sys.argv[2]))