changeset 105:baf56ff538f8

convert to rich directory structure per 2019-35
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 20 Apr 2021 11:12:35 +0000
parents 61122560ae0c
children 815b33c3254a
files bin/ix.py bin/ix.sh
diffstat 2 files changed, 19 insertions(+), 14 deletions(-) [+]
line wrap: on
line diff
--- a/bin/ix.py	Mon Apr 19 18:09:51 2021 +0000
+++ b/bin/ix.py	Tue Apr 20 11:12:35 2021 +0000
@@ -1,15 +1,18 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 '''Extract request records from Common Crawl WARC-format files
-given length, offset and file triples.
+given length, offset and filename triples.
 Input one triple on command line, or
 triples from stdin as tab-delimited lines
 or complete cdx index lines.
+In all cases by 'filename' is meant crawlid/segmentid/filename
 
 Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.'''
 
 import sys, argparse, regex
 
 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
+FPAT="/beegfs/common_crawl/%s/%s/orig/warc/%s"
+BINOUT=sys.stdout.buffer
 
 class HackFormat(argparse.RawDescriptionHelpFormatter):
   def format_help(self):
@@ -18,15 +21,15 @@
     return HACK_USAGE.sub('\n             [ ( -x | length offset filename ) ]',
                           FOO)
 
-def process(options,buf,file,offset,length,whole):
+def process(options,buf,filename,offset,length,whole):
+  file=open(filename,'rb',0)
   if whole:
     file.seek(offset)
     bv=memoryview(buf)[:length]
     nb=file.readinto(bv)
     if nb!=length:
       print("losing",file.name,length,nb,file=sys.stderr)
-      exit(1)
-    sys.stdout.buffer.write(bv)
+    BINOUT.write(bv)
   file.close()
 
 def main():
@@ -34,7 +37,8 @@
     description='''Extract records from warc files given length, offset and file triples.
   Input one triple on command line, or
   triples from stdin as tab-delimited lines
-  or complete cdx index lines.''',
+  or complete cdx index lines.
+  In all cases by 'filename' is meant crawlid/segmentid/filename''',
     epilog='''Note that if no output flag(s) is/are given,
   the whole WARC record will be output, more efficiently than
   would be the case if all three flags were given.''',
@@ -63,24 +67,23 @@
                       help='start position in bytes of gzipped record',
                       nargs='?')
   parser.add_argument('filename',
-                      help='name of gzipped Common Crawl WARC-format file',
-                      nargs='?',
-                      type=argparse.FileType('rb',0))
+                      help='pathname of gzipped Common Crawl WARC-format file',
+                      nargs='?')
   # Hack the order of optional and positional in the help output
   parser._action_groups.sort(key=lambda g:g.title)
   #parser.print_help()
   pa=parser.parse_args(sys.argv[1:])
-  #print(pa,file=sys.stderr)
+  print(pa,file=sys.stderr)
   if pa.length is not None:
     # We have to enforce our own check..
     if pa.offset is None or pa.filename is None:
       parser.error("length, offset and filename must all be supplied together")
  
-  buf=bytearray(1024*1024)
+  buf=bytearray(2024*1024)
 
   whole=not (pa.warc or pa.headers or pa.body)
   if pa.length is not None:
-    process(pa,buf,pa.filename,pa.offset,pa.length,whole)
+    process(pa,buf,FPAT%tuple(pa.filename.split('/')),pa.offset,pa.length,whole)
     exit(0)
   if pa.index:
     CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"')
@@ -89,7 +92,7 @@
       if m is None:
         print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr)
         exit(2)
-      f="%s/%s/%s"%(m[3],m[4],m[5])
+      f=FPAT%(m[3:6])
       process(pa,buf,open(f,'rb',0),int(m[2]),int(m[1]),whole)
     exit(0)
 if __name__ == "__main__":
--- a/bin/ix.sh	Mon Apr 19 18:09:51 2021 +0000
+++ b/bin/ix.sh	Tue Apr 20 11:12:35 2021 +0000
@@ -52,8 +52,10 @@
 else
     cat
 fi | \
-while { IFS=$'\t' read l o f; }
+while { IFS=$'\t' read l o wf; }
 do
+  ff=($(echo $wf | tr '/' ' '))
+  f="/beegfs/common_crawl/${ff[0]}/${ff[1]}/orig/warc/${ff[2]}"
   if [ -z "$d" ]
   then
       dd if="$f" of=/dev/stdout skip=$o count=$l \