changeset 117:63898fde9751

refactor final processing loop, change basis for body dump, maybe too aggressive...
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 15 Jun 2021 18:04:34 +0000
parents 3314f46a782f
children 551ff1de13d8
files bin/ix.py
diffstat 1 files changed, 24 insertions(+), 13 deletions(-) [+]
line wrap: on
line diff
--- a/bin/ix.py	Tue Jun 15 16:58:31 2021 +0000
+++ b/bin/ix.py	Tue Jun 15 18:04:34 2021 +0000
@@ -27,6 +27,7 @@
 def process(options,buf,root,filename,offset,length,whole):
   rfn=root+filename
   if root!="/beegfs/common_crawl":
+    # Support using ramdisk or other local disk as a faster cache
     if not os.path.exists(rfn):
       if not os.path.exists(os.path.dirname(rfn)):
         os.makedirs(os.path.dirname(rfn))
@@ -61,9 +62,10 @@
     if whole:
       BINOUT.write(cb)
       return
-  # only parts wanted
+  # Only output parts (0 = WARC header, 1 = HTTP header, 2 = body) that are wanted
   state=0
-  tr=None
+  tr=None # Was this record truncated?
+  bl=None # Body length as given by the HTTP Content-Length header, if any
   with io.BytesIO(cb) as clear_text:
     for L in clear_text:
       if state==0:
@@ -73,11 +75,11 @@
         elif L.startswith(b"WARC-Truncated: "):
           tr=L[16:].rstrip()
           tr="EMPTY" if tr=="" else tr
-        elif L=='' or L.startswith(b"\r"): # for idempotency
+        elif L==b"" or L.startswith(b"\r"): # for idempotency
+          # Blank line, WARC header is finished
           if not (options.headers or options.body):
             return
           state=1
-          bl=None
           # Note we preserve the empty line
         if options.warc:
           BINOUT.write(L)
@@ -85,24 +87,33 @@
       if state==1:
         # HTTP header
         wl -= len(L)
-        if L.startswith(b"Content-Length: "):
-          bl=int(L[16:].rstrip())
-        elif L==b"" or L.startswith(b"\r"):
+        if not (L==b"" or L.startswith(b"\r")):
+          # Non-blank, it's a header
+          if bl is None and L.startswith(b"Content-Length: "):
+            bl=int(L[16:].rstrip())
+          if options.headers:
+            BINOUT.write(L)
+        else:
+          # Blank line, HTTP header is finished
           if not options.body:
             return
+          if options.headers:
+            BINOUT.write(L)
           state=2
+          # The above is just for sanity, because we do _not_
+          #  continue with the outer loop,
+          #  since we can now block-output the entire rest of the
+          #  input buffer.
           if bl is not None:
             if bl!=wl:
               print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
                     (length,offset,filename,wl,bl,tr),file=sys.stderr)
           # HTTP body
-          if options.body:
-            balance=clear_text.tell() 
-            # Go this line with whatever is left in the buffer...
-            BINOUT.write(cb[balance-2:])
+          balance=clear_text.tell()
+          #print(balance,bl,wl,ll,ll-balance,file=sys.stderr)
+          # Output whatever is left
+          BINOUT.write(cb[balance:balance+wl])
           return
-        if options.headers:
-          BINOUT.write(L)
 
 def main():
   parser = argparse.ArgumentParser(