# HG changeset patch # User Henry S. Thompson # Date 1623780274 0 # Node ID 63898fde9751fedba2e51899cccf4a19b8ddf1d3 # Parent 3314f46a782f7b861a894b8173c3f083e0747232 refactor final processing loop, change basis for body dump, maybe too aggressive... diff -r 3314f46a782f -r 63898fde9751 bin/ix.py --- a/bin/ix.py Tue Jun 15 16:58:31 2021 +0000 +++ b/bin/ix.py Tue Jun 15 18:04:34 2021 +0000 @@ -27,6 +27,7 @@ def process(options,buf,root,filename,offset,length,whole): rfn=root+filename if root!="/beegfs/common_crawl": + # Support using ramdisk or other local disk as a faster cache if not os.path.exists(rfn): if not os.path.exists(os.path.dirname(rfn)): os.makedirs(os.path.dirname(rfn)) @@ -61,9 +62,10 @@ if whole: BINOUT.write(cb) return - # only parts wanted + # Only output parts (0 = WARC header, 1 = HTTP header, 2 = body) that are wanted state=0 - tr=None + tr=None # Was this record truncated? + bl=None # HTTP Content-Length value (declared length of the body), if seen with io.BytesIO(cb) as clear_text: for L in clear_text: if state==0: @@ -73,11 +75,11 @@ elif L.startswith(b"WARC-Truncated: "): tr=L[16:].rstrip() tr="EMPTY" if tr=="" else tr - elif L=='' or L.startswith(b"\r"): # for idempotency + elif L==b"" or L.startswith(b"\r"): # for idempotency + # Blank line, WARC header is finished if not (options.headers or options.body): return state=1 - bl=None # Note we preserve the empty line if options.warc: BINOUT.write(L) @@ -85,24 +87,33 @@ if state==1: # HTTP header wl -= len(L) - if L.startswith(b"Content-Length: "): - bl=int(L[16:].rstrip()) - elif L==b"" or L.startswith(b"\r"): + if not (L==b"" or L.startswith(b"\r")): + # Non-blank, it's a header + if bl is None and L.startswith(b"Content-Length: "): + bl=int(L[16:].rstrip()) + if options.headers: + BINOUT.write(L) + else: + # Blank line, HTTP header is finished if not options.body: return + if options.headers: + BINOUT.write(L) state=2 + # The above is just for sanity, because we do _not_ + # continue with the outer 
loop, + # since we can now block-output the entire rest of the + # input buffer. if bl is not None: if bl!=wl: print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ (length,offset,filename,wl,bl,tr),file=sys.stderr) # HTTP body - if options.body: - balance=clear_text.tell() - # Go this line with whatever is left in the buffer... - BINOUT.write(cb[balance-2:]) + balance=clear_text.tell() + #print(balance,bl,wl,ll,ll-balance,file=sys.stderr) + # Output whatever is left + BINOUT.write(cb[balance:balance+wl]) return - if options.headers: - BINOUT.write(L) def main(): parser = argparse.ArgumentParser(