Mercurial > hg > cc > cirrus_home
view bin/warc.py @ 143:ddff993994be
too clever by half, keys won't work in parallel for e.g. media types
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 20 Oct 2021 15:47:55 +0000 |
parents | e96d444b0f84 |
children | d123ef7fdb82 |
line wrap: on
line source
#!/usr/bin/env python3
'''Stream a warc format file, invoking a callback on each part.
Callback can be limited by WARC-Type

Run as a script, the file named by the first command-line argument is
scanned and the Last-Modified header value of every response record is
written to standard output, one per line.
'''
import os
import re
import sys

# Initial size of the reusable record buffer; it grows if a record is larger.
DEFAULT_BUFSIZE = 128 * 1024 * 1024


def _wanted_types(types):
    '''Return the requested WARC-Type names as a frozenset of bytes.

    Header values are read from the file as bytes, so str names are
    encoded here; otherwise a str name could never match a record.
    A single bare name is treated as a one-element collection.
    '''
    if isinstance(types, (str, bytes, bytearray)):
        types = (types,)
    return frozenset(
        t.encode('ascii') if isinstance(t, str) else bytes(t) for t in types
    )


def _scan(stream, callback, wanted, bufsize):
    '''Read WARC records from stream, calling callback for wanted types.'''
    nb = 0  # Bytes consumed so far, used only in error messages
    buf = bytearray(bufsize)
    while True:
        # Skip the blank lines that separate records
        l = b'\r\n'
        while l == b'\r\n':
            l = stream.readline()
            nb += len(l)
        if l != b'WARC/1.0\r\n':
            if l == b'':
                # Clean end of input
                return
            raise ValueError("Not a WARC file? At %s: %s[%s]" % (
                nb - len(l), l.decode('latin-1'), len(l)))

        # WARC header: runs up to the first empty line
        wtype = None
        length = None
        while True:
            l = stream.readline()
            nb += len(l)
            if l == b'\r\n':
                break
            if l == b'':
                # Without this check a truncated header would loop forever
                raise ValueError("Truncated WARC header at %s" % nb)
            if l.startswith(b'WARC-Type: '):
                wtype = l[11:].rstrip(b'\r\n')
            elif l.startswith(b'Content-Length: '):
                length = int(l[16:])
        if length is None:
            raise ValueError(
                "No Content-Length in WARC header ending at %s" % nb)

        # Record body: exactly length bytes, read into the reusable buffer
        if length > len(buf):
            buf = bytearray(length)
        view = memoryview(buf)
        got = 0
        while got < length:
            n = stream.readinto(view[got:length])
            if not n:
                # End of input before the whole record arrived
                raise ValueError(
                    "Chunk read losing, from %s got %s expected %s" % (
                        nb, got, length))
            got += n
        nb += length

        if wtype in wanted:
            # The slice copies the record, so the callback may keep the view
            # after the buffer has been reused for the next record.
            callback(wtype, memoryview(buf[:length]))


def warc(callback, types=('response',), source=None, bufsize=DEFAULT_BUFSIZE):
    '''Stream a WARC file, invoking callback on each selected record.

    Args:
      callback: called as callback(wtype, buf), where wtype is the record's
        WARC-Type as bytes and buf is a memoryview over a private copy of
        the record's content block.
      types: WARC-Type names to pass to callback; str or bytes are accepted.
      source: a path, or an open binary file object providing readline()
        and readinto().  Defaults to the first command-line argument.
        A file object passed in is left open.
      bufsize: initial size in bytes of the record buffer.

    Raises:
      ValueError: if the input does not look like WARC/1.0, a record has no
        Content-Length, or the input ends part-way through a record.
    '''
    wanted = _wanted_types(types)
    if source is None:
        source = sys.argv[1]
    if isinstance(source, (str, bytes, os.PathLike)):
        # Buffered reads keep the many small header readline() calls cheap
        with open(source, 'rb') as stream:
            _scan(stream, callback, wanted, bufsize)
    else:
        _scan(source, callback, wanted, bufsize)


# Binary view of standard output, shared by the callbacks below
OUT = sys.stdout.buffer

LMPAT = re.compile(b'^Last-Modified: (.*?)\r', re.MULTILINE)


def showmeLMH(wtype, buf):
    '''Write the first Last-Modified header value found in buf, if any.'''
    m = LMPAT.search(buf.tobytes())
    if m:
        OUT.write(m[1])
        OUT.write(b'\n')


def showme(wtype, buf):
    '''Write the whole record content to standard output.'''
    OUT.write(buf)


if __name__ == '__main__':
    warc(showmeLMH, [b'response'])