changeset 104:61122560ae0c

-x barely working
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 19 Apr 2021 18:09:51 +0000
parents d354e29c91a5
children baf56ff538f8
files bin/ix.py
diffstat 1 files changed, 16 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/bin/ix.py	Mon Apr 19 18:09:25 2021 +0000
+++ b/bin/ix.py	Mon Apr 19 18:09:51 2021 +0000
@@ -18,16 +18,16 @@
     return HACK_USAGE.sub('\n             [ ( -x | length offset filename ) ]',
                           FOO)
 
-def process(options,buf,file,offset,length):
-  whole=not (options.warc or options.headers or options.body)
+def process(options,buf,file,offset,length,whole):
   if whole:
     file.seek(offset)
     bv=memoryview(buf)[:length]
     nb=file.readinto(bv)
     if nb!=length:
-      print("losing",file.name,name,length,nb,file=sys.stderr)
+      print("losing",file.name,length,nb,file=sys.stderr)
       exit(1)
     sys.stdout.buffer.write(bv)
+  file.close()
 
 def main():
   parser = argparse.ArgumentParser(
@@ -70,7 +70,7 @@
   parser._action_groups.sort(key=lambda g:g.title)
   #parser.print_help()
   pa=parser.parse_args(sys.argv[1:])
-  print(pa,file=sys.stderr)
+  #print(pa,file=sys.stderr)
   if pa.length is not None:
     # We have to enforce our own check..
     if pa.offset is None or pa.filename is None:
@@ -78,9 +78,19 @@
  
   buf=bytearray(1024*1024)
 
+  whole=not (pa.warc or pa.headers or pa.body)
   if pa.length is not None:
-    process(pa,buf,pa.filename,pa.offset,pa.length)
+    process(pa,buf,pa.filename,pa.offset,pa.length,whole)
     exit(0)
-
+  if pa.index:
+    CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"')
+    for l in sys.stdin:
+      m=CDX.search(l)
+      if m is None:
+        print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr)
+        exit(2)
+      f="%s/%s/%s"%(m[3],m[4],m[5])
+      process(pa,buf,open(f,'rb',0),int(m[2]),int(m[1]),whole)
+    exit(0)
 if __name__ == "__main__":
     main()