Mercurial > hg > cc > cirrus_home
changeset 138:9ea12f7b304b
just barely working
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 23 Jul 2021 16:23:46 +0000 |
parents | bb0153be65b5 |
children | e96d444b0f84 |
files | bin/warc.py |
diffstat | 1 files changed, 117 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python3
'''Stream a warc format file, invoking a callback on each part.
Callback can be limited by WARC-Type'''
import sys


def warc(callback, types=('response',), stream=None, bufsize=128*1024*1024):
    '''Read WARC/1.0 records from a binary stream, one at a time.

    Each record is expected to be the line b'WARC/1.0\\r\\n', header lines,
    a blank line, and then exactly Content-Length bytes of content.  Blank
    lines between records are skipped.

    callback -- called as callback(wtype, body) for every record whose
                WARC-Type is in types.  wtype is the header value as bytes;
                body is a memoryview over the record's content (headers are
                not included).  The underlying buffer is reused for the next
                record, so copy body if it must outlive the call.
    types    -- iterable of WARC-Type values to report, as bytes or str.
    stream   -- binary file object providing readline() and readinto();
                defaults to standard input.
    bufsize  -- initial size in bytes of the reusable content buffer; it is
                replaced by a larger one if a record needs more.

    Returns None once the input is exhausted at a record boundary.
    Raises ValueError if a record does not start with the WARC/1.0 line, if
    a header is cut short or has no usable Content-Length, or if the input
    ends before Content-Length bytes have been read.
    '''
    if stream is None:
        stream = sys.stdin.buffer
    # Header values are parsed as bytes, so compare against bytes
    wanted = {t.encode('latin-1') if isinstance(t, str) else bytes(t)
              for t in types}
    buf = bytearray(bufsize)
    nb = 0  # bytes consumed so far, used for error reporting only
    while True:
        # Skip the blank line(s) that separate records
        l = b'\r\n'
        while l == b'\r\n':
            l = stream.readline()
            nb += len(l)
        if not l:
            # readline gives b'' at end of input: a clean finish
            return
        if l != b'WARC/1.0\r\n':
            raise ValueError("Not a WARC file? At %s: %s[%s]" % (
                nb-len(l), l.decode('latin-1'), len(l)))
        wtype = None
        length = None
        while True:
            l = stream.readline()
            if not l:
                # Without this check an input that ends inside the header
                # would loop forever on empty reads
                raise ValueError("Truncated WARC header at %s" % nb)
            nb += len(l)
            if l == b'\r\n':
                break
            if l.startswith(b'WARC-Type: '):
                wtype = l[11:].rstrip(b'\r\n')
            elif l.startswith(b'Content-Length: '):
                length = int(l[16:])
        if length is None or length < 0:
            raise ValueError(
                "Missing or bad Content-Length in header ending at %s" % nb)
        if length > len(buf):
            # Allocate afresh rather than resizing: a memoryview handed to
            # an earlier callback may still be exporting the old buffer
            buf = bytearray(length)
        body = memoryview(buf)[:length]
        ii = 0
        while ii < length:
            # A single readinto may deliver only part of what was asked for
            i = stream.readinto(body[ii:])
            if not i:
                # End of input (or nothing available): give up, don't spin
                break
            ii += i
        if ii != length:
            raise ValueError("Chunk read losing, from %s got %s expected %s" % (
                nb, ii, length))
        nb += length
        if wtype in wanted:
            # Always pass the whole content, however many reads it took
            callback(wtype, body)


def showme(wtype, buf):
    '''Callback for warc: copy the record content to standard output.'''
    sys.stdout.buffer.write(buf)


if __name__ == '__main__':
    warc(showme, [b'metadata'])