# HG changeset patch # User Henry S. Thompson # Date 1619457509 0 # Node ID 6467024cd0723c8f130e46a3c862a588c713e7c7 # Parent 1d6fde73789d31fce648adea50cf221e259b6926 all parts working, idempotency achieved diff -r 1d6fde73789d -r 6467024cd072 bin/ix.py --- a/bin/ix.py Mon Apr 26 17:17:58 2021 +0000 +++ b/bin/ix.py Mon Apr 26 17:18:29 2021 +0000 @@ -62,57 +62,47 @@ BINOUT.write(cb) return # only parts wanted - # Note that _unlike the above_ this strips the ^M from the output lines - # so we are _not_ idempotent state=0 tr=None - with io.TextIOWrapper(io.BytesIO(cb),encoding='iso-8859-1', - newline='\r\n') as clear_text: + with io.BytesIO(cb) as clear_text: for L in clear_text: if state==0: # WARC header - if L.startswith("Content-Length: "): + if L.startswith(b"Content-Length: "): wl=int(L[16:].rstrip()) - elif L.startswith("WARC-Truncated: "): + elif L.startswith(b"WARC-Truncated: "): tr=L[16:].rstrip() tr="EMPTY" if tr=="" else tr - elif L.startswith("\r"): # make us idempotent + elif L=='' or L.startswith(b"\r"): # for idempotency if not (options.headers or options.body): return state=1 bl=None - if options.warc: - # preserve the empty line - print() - continue + # Note we preserve the empty line if options.warc: - print(L.rstrip()) + BINOUT.write(L) continue if state==1: # HTTP header wl -= len(L) - if L.startswith("Content-Length: "): + if L.startswith(b"Content-Length: "): bl=int(L[16:].rstrip()) - elif L=="" or L.startswith("\r"): + elif L==b"" or L.startswith(b"\r"): if not options.body: return state=2 - if options.headers: - # preserve the empty line - print() if bl is not None: if bl!=wl: print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ (length,offset,filename,wl,bl,tr),file=sys.stderr) - continue + # HTTP body + if options.body: + balance=clear_text.tell() + # Go this line with whatever is left in the buffer... + BINOUT.write(cb[balance-2:]) + return if options.headers: - print(L.rstrip()) - continue - # HTTP body - if options.body: - sys.stdout.flush() - BINOUT.write(cb[clear_text.tell():]) - return + BINOUT.write(L) def main(): parser = argparse.ArgumentParser(