# HG changeset patch # User Henry S. Thompson # Date 1619450903 0 # Node ID 3119bca711814d6abfa017d9cd4ad5cca31cba5d # Parent f148c2366faa686a624c71e2dc05cabe3b3e30db warc and headers parts working diff -r f148c2366faa -r 3119bca71181 bin/ix.py --- a/bin/ix.py Thu Apr 22 21:31:03 2021 +0000 +++ b/bin/ix.py Mon Apr 26 15:28:23 2021 +0000 @@ -39,26 +39,80 @@ break outfile.write(memoryview(buf)[:l]) file=open(rfn,'rb',0) - if whole: - # try external unzip using Popen - file.seek(offset) - bv=memoryview(buf)[:length] - nb=file.readinto(bv) - file.close() - if nb!=length: - print("losing",file.name,length,nb,file=sys.stderr) - if options.zipped: - BINOUT.write(bv) - else: - gzip_chunk = io.BytesIO(bv) - uv=memoryview(buf)[length:] - #clear_bytes=io.BytesIO(uv) - with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin: - while True: - l=gzip_fin.readinto(uv) - if not l: - break - BINOUT.write(memoryview(uv)[:l]) + file.seek(offset) + bv=memoryview(buf)[:length] + nb=file.readinto(bv) + file.close() + if nb!=length: + print("losing",file.name,length,nb,file=sys.stderr) + if whole and options.zipped: + BINOUT.write(bv) + return + gzip_chunk = io.BytesIO(bv) + uv=memoryview(buf)[length:] + with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin: + ll=0 + while True: + l=gzip_fin.readinto(uv) + if not l: + break + ll+=l + cb=memoryview(uv)[:ll] + if whole: + BINOUT.write(cb) + return + # only parts wanted + # Note that _unlike the above_ this strips the ^M from the output lines + # so we are _not_ idempotent + state=0 + tr=None + with io.TextIOWrapper(io.BytesIO(cb),encoding='iso-8859-1', + newline='\r\n') as clear_text: + for L in clear_text: + if state==0: + # WARC header + if L.startswith("Content-Length: "): + wl=int(L[16:].rstrip()) + elif L.startswith("WARC-Truncated: "): + tr=L[16:].rstrip() + tr="EMPTY" if tr=="" else tr + elif L.startswith("\r"): # make us idempotent + if not (options.headers or options.body): + return + state=1 + bl=None + if options.warc: + # preserve the empty line + print() + continue + if options.warc: + print(L.rstrip()) + continue + if state==1: + # HTTP header + wl -= len(L) + if L.startswith("Content-Length: "): + bl=int(L[16:].rstrip()) + elif L=="" or L.startswith("\r"): + if not options.body: + return + state=2 + if options.headers: + # preserve the empty line + print() + if bl is not None: + if bl!=wl: + print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ + (length,offset,filename,wl,bl,tr),file=sys.stderr) + continue + if options.headers: + print(L.rstrip()) + continue + # HTTP body + if options.body: + sys.stdout.flush() + BINOUT.write(cb[clear_text.tell():]) + return def main(): parser = argparse.ArgumentParser(