Mercurial > hg > cc > cirrus_home
changeset 114:6467024cd072
all parts working, idempotency achieved
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 26 Apr 2021 17:18:29 +0000 |
parents | 1d6fde73789d |
children | 2bcf31c52c14 |
files | bin/ix.py |
diffstat | 1 files changed, 15 insertions(+), 25 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/ix.py Mon Apr 26 17:17:58 2021 +0000 +++ b/bin/ix.py Mon Apr 26 17:18:29 2021 +0000 @@ -62,57 +62,47 @@ BINOUT.write(cb) return # only parts wanted - # Note that _unlike the above_ this strips the ^M from the output lines - # so we are _not_ idempotent state=0 tr=None - with io.TextIOWrapper(io.BytesIO(cb),encoding='iso-8859-1', - newline='\r\n') as clear_text: + with io.BytesIO(cb) as clear_text: for L in clear_text: if state==0: # WARC header - if L.startswith("Content-Length: "): + if L.startswith(b"Content-Length: "): wl=int(L[16:].rstrip()) - elif L.startswith("WARC-Truncated: "): + elif L.startswith(b"WARC-Truncated: "): tr=L[16:].rstrip() tr="EMPTY" if tr=="" else tr - elif L.startswith("\r"): # make us idempotent + elif L=='' or L.startswith(b"\r"): # for idempotency if not (options.headers or options.body): return state=1 bl=None - if options.warc: - # preserve the empty line - print() - continue + # Note we preserve the empty line if options.warc: - print(L.rstrip()) + BINOUT.write(L) continue if state==1: # HTTP header wl -= len(L) - if L.startswith("Content-Length: "): + if L.startswith(b"Content-Length: "): bl=int(L[16:].rstrip()) - elif L=="" or L.startswith("\r"): + elif L==b"" or L.startswith(b"\r"): if not options.body: return state=2 - if options.headers: - # preserve the empty line - print() if bl is not None: if bl!=wl: print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ (length,offset,filename,wl,bl,tr),file=sys.stderr) - continue + # HTTP body + if options.body: + balance=clear_text.tell() + # Go this line with whatever is left in the buffer... + BINOUT.write(cb[balance-2:]) + return if options.headers: - print(L.rstrip()) - continue - # HTTP body - if options.body: - sys.stdout.flush() - BINOUT.write(cb[clear_text.tell():]) - return + BINOUT.write(L) def main(): parser = argparse.ArgumentParser(