# HG changeset patch # User Henry S. Thompson # Date 1618917155 0 # Node ID baf56ff538f8e3728755481df5bf5444efdc5136 # Parent 61122560ae0c42ffd53b51c7b0c502a8548ffdd5 convert to rich directory structure per 2019-35 diff -r 61122560ae0c -r baf56ff538f8 bin/ix.py --- a/bin/ix.py Mon Apr 19 18:09:51 2021 +0000 +++ b/bin/ix.py Tue Apr 20 11:12:35 2021 +0000 @@ -1,15 +1,18 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 '''Extract request records from Common Crawl WARC-format files -given length, offset and file triples. +given length, offset and filename triples. Input one triple on command line, or triples from stdin as tab-delimited lines or complete cdx index lines. +In all cases by 'filename' is meant crawlid/segmentid/filename Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if -whb is given.''' import sys, argparse, regex HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]') +FPAT="/beegfs/common_crawl/%s/%s/orig/warc/%s" +BINOUT=sys.stdout.buffer class HackFormat(argparse.RawDescriptionHelpFormatter): def format_help(self): @@ -18,15 +21,15 @@ return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]', FOO) -def process(options,buf,file,offset,length,whole): +def process(options,buf,filename,offset,length,whole): + file=open(filename,'rb',0) if whole: file.seek(offset) bv=memoryview(buf)[:length] nb=file.readinto(bv) if nb!=length: print("losing",file.name,length,nb,file=sys.stderr) - exit(1) - sys.stdout.buffer.write(bv) + BINOUT.write(bv) file.close() def main(): @@ -34,7 +37,8 @@ description='''Extract records from warc files given length, offset and file triples. Input one triple on command line, or triples from stdin as tab-delimited lines - or complete cdx index lines.''', + or complete cdx index lines. + In all cases by 'filename' is meant crawlid/segmentid/filename''', epilog='''Note that if no output flag(s) is/are given, the whole WARC record will be output, more efficiently than would be the case if all three flags were given.''', @@ -63,24 +67,23 @@ help='start position in bytes of gzipped record', nargs='?') parser.add_argument('filename', - help='name of gzipped Common Crawl WARC-format file', - nargs='?', - type=argparse.FileType('rb',0)) + help='pathname of gzipped Common Crawl WARC-format file', + nargs='?') # Hack the order of optional and positional in the help output parser._action_groups.sort(key=lambda g:g.title) #parser.print_help() pa=parser.parse_args(sys.argv[1:]) - #print(pa,file=sys.stderr) + print(pa,file=sys.stderr) if pa.length is not None: # We have to enforce our own check.. if pa.offset is None or pa.filename is None: parser.error("length, offset and filename must all be supplied together") - buf=bytearray(1024*1024) + buf=bytearray(2024*1024) whole=not (pa.warc or pa.headers or pa.body) if pa.length is not None: - process(pa,buf,pa.filename,pa.offset,pa.length,whole) + process(pa,buf,FPAT%tuple(pa.filename.split('/')),pa.offset,pa.length,whole) exit(0) if pa.index: CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"') @@ -89,7 +92,7 @@ if m is None: print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr) exit(2) - f="%s/%s/%s"%(m[3],m[4],m[5]) + f=FPAT%(m[3:6]) process(pa,buf,open(f,'rb',0),int(m[2]),int(m[1]),whole) exit(0) if __name__ == "__main__": diff -r 61122560ae0c -r baf56ff538f8 bin/ix.sh --- a/bin/ix.sh Mon Apr 19 18:09:51 2021 +0000 +++ b/bin/ix.sh Tue Apr 20 11:12:35 2021 +0000 @@ -52,8 +52,10 @@ else cat fi | \ -while { IFS=$'\t' read l o f; } +while { IFS=$'\t' read l o wf; } do + ff=($(echo $wf | tr '/' ' ')) + f="/beegfs/common_crawl/${ff[0]}/${ff[1]}/orig/warc/${ff[2]}" if [ -z "$d" ] then dd if="$f" of=/dev/stdout skip=$o count=$l \