changeset 114:6467024cd072

all parts working, idempotency achieved
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 26 Apr 2021 17:18:29 +0000
parents 1d6fde73789d
children 2bcf31c52c14
files bin/ix.py
diffstat 1 files changed, 15 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/bin/ix.py	Mon Apr 26 17:17:58 2021 +0000
+++ b/bin/ix.py	Mon Apr 26 17:18:29 2021 +0000
@@ -62,57 +62,47 @@
       BINOUT.write(cb)
       return
   # only parts wanted
-  # Note that _unlike the above_ this strips the ^M from the output lines
-  #  so we are _not_ idempotent
   state=0
   tr=None
-  with io.TextIOWrapper(io.BytesIO(cb),encoding='iso-8859-1',
-                        newline='\r\n') as clear_text:
+  with io.BytesIO(cb) as clear_text:
     for L in clear_text:
       if state==0:
         # WARC header
-        if L.startswith("Content-Length: "):
+        if L.startswith(b"Content-Length: "):
           wl=int(L[16:].rstrip())
-        elif L.startswith("WARC-Truncated: "):
+        elif L.startswith(b"WARC-Truncated: "):
           tr=L[16:].rstrip()
           tr="EMPTY" if tr=="" else tr
-        elif L.startswith("\r"): # make us idempotent
+        elif L=='' or L.startswith(b"\r"): # for idempotency
           if not (options.headers or options.body):
             return
           state=1
           bl=None
-          if options.warc:
-            # preserve the empty line
-            print()
-            continue
+          # Note we preserve the empty line
         if options.warc:
-          print(L.rstrip())
+          BINOUT.write(L)
         continue
       if state==1:
         # HTTP header
         wl -= len(L)
-        if L.startswith("Content-Length: "):
+        if L.startswith(b"Content-Length: "):
           bl=int(L[16:].rstrip())
-        elif L=="" or L.startswith("\r"):
+        elif L==b"" or L.startswith(b"\r"):
           if not options.body:
             return
           state=2
-          if options.headers:
-            # preserve the empty line
-            print()
           if bl is not None:
             if bl!=wl:
               print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
                     (length,offset,filename,wl,bl,tr),file=sys.stderr)
-          continue
+          # HTTP body
+          if options.body:
+            balance=clear_text.tell() 
+            # Write this (blank) line together with whatever is left in the buffer...
+            BINOUT.write(cb[balance-2:])
+          return
         if options.headers:
-          print(L.rstrip())
-        continue
-      # HTTP body
-      if options.body:
-        sys.stdout.flush()
-        BINOUT.write(cb[clear_text.tell():])
-      return
+          BINOUT.write(L)
 
 def main():
   parser = argparse.ArgumentParser(