changeset 111:3119bca71181

warc and headers parts working
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 26 Apr 2021 15:28:23 +0000
parents f148c2366faa
children bcea0410143c
files bin/ix.py
diffstat 1 files changed, 74 insertions(+), 20 deletions(-) [+]
line wrap: on
line diff
--- a/bin/ix.py	Thu Apr 22 21:31:03 2021 +0000
+++ b/bin/ix.py	Mon Apr 26 15:28:23 2021 +0000
@@ -39,26 +39,80 @@
             break
           outfile.write(memoryview(buf)[:l])
   file=open(rfn,'rb',0)
-  if whole:
-    # try external unzip using Popen
-    file.seek(offset)
-    bv=memoryview(buf)[:length]
-    nb=file.readinto(bv)
-    file.close()
-    if nb!=length:
-      print("losing",file.name,length,nb,file=sys.stderr)
-    if options.zipped:
-      BINOUT.write(bv)
-    else:
-      gzip_chunk = io.BytesIO(bv)
-      uv=memoryview(buf)[length:]
-      #clear_bytes=io.BytesIO(uv)
-      with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin:
-        while True:
-          l=gzip_fin.readinto(uv)
-          if not l:
-            break
-          BINOUT.write(memoryview(uv)[:l])
+  file.seek(offset)
+  bv=memoryview(buf)[:length]
+  nb=file.readinto(bv)
+  file.close()
+  if nb!=length:
+    print("losing",file.name,length,nb,file=sys.stderr)
+  if whole and options.zipped:
+    BINOUT.write(bv)
+    return
+  gzip_chunk = io.BytesIO(bv)
+  uv=memoryview(buf)[length:]
+  with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin:
+    ll=0
+    while True:
+      l=gzip_fin.readinto(uv)
+      if not l:
+        break
+      ll+=l
+    cb=memoryview(uv)[:ll]
+    if whole:
+      BINOUT.write(cb)
+      return
+  # only parts wanted
+  # Note that _unlike the above_ this strips the ^M from the output lines
+  #  so we are _not_ idempotent
+  state=0
+  tr=None
+  with io.TextIOWrapper(io.BytesIO(cb),encoding='iso-8859-1',
+                        newline='\r\n') as clear_text:
+    for L in clear_text:
+      if state==0:
+        # WARC header
+        if L.startswith("Content-Length: "):
+          wl=int(L[16:].rstrip())
+        elif L.startswith("WARC-Truncated: "):
+          tr=L[16:].rstrip()
+          tr="EMPTY" if tr=="" else tr
+        elif L.startswith("\r"): # make us idempotent
+          if not (options.headers or options.body):
+            return
+          state=1
+          bl=None
+          if options.warc:
+            # preserve the empty line
+            print()
+            continue
+        if options.warc:
+          print(L.rstrip())
+        continue
+      if state==1:
+        # HTTP header
+        wl -= len(L)
+        if L.startswith("Content-Length: "):
+          bl=int(L[16:].rstrip())
+        elif L=="" or L.startswith("\r"):
+          if not options.body:
+            return
+          state=2
+          if options.headers:
+            # preserve the empty line
+            print()
+          if bl is not None:
+            if bl!=wl:
+              print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
+                    (length,offset,filename,wl,bl,tr),file=sys.stderr)
+          continue
+        if options.headers:
+          print(L.rstrip())
+        continue
+      # HTTP body
+      if options.body:
+        sys.stdout.flush()
+        BINOUT.write(cb[clear_text.tell():])
+      return
 
 def main():
   parser = argparse.ArgumentParser(