Mercurial > hg > cc > cirrus_home
changeset 104:61122560ae0c
-x barely working
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 19 Apr 2021 18:09:51 +0000 |
parents | d354e29c91a5 |
children | baf56ff538f8 |
files | bin/ix.py |
diffstat | 1 files changed, 16 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/ix.py Mon Apr 19 18:09:25 2021 +0000
+++ b/bin/ix.py Mon Apr 19 18:09:51 2021 +0000
@@ -18,16 +18,16 @@
   return HACK_USAGE.sub('\n [ ( -x | length offset filename ) ]',
                         FOO)
 
-def process(options,buf,file,offset,length):
-  whole=not (options.warc or options.headers or options.body)
+def process(options,buf,file,offset,length,whole):
   if whole:
     file.seek(offset)
     bv=memoryview(buf)[:length]
     nb=file.readinto(bv)
     if nb!=length:
-      print("losing",file.name,name,length,nb,file=sys.stderr)
+      print("losing",file.name,length,nb,file=sys.stderr)
       exit(1)
     sys.stdout.buffer.write(bv)
+  file.close()
 
 def main():
   parser = argparse.ArgumentParser(
@@ -70,7 +70,7 @@
   parser._action_groups.sort(key=lambda g:g.title)
   #parser.print_help()
   pa=parser.parse_args(sys.argv[1:])
-  print(pa,file=sys.stderr)
+  #print(pa,file=sys.stderr)
   if pa.length is not None:
     # We have to enforce our own check..
     if pa.offset is None or pa.filename is None:
@@ -78,9 +78,19 @@
 
   buf=bytearray(1024*1024)
 
+  whole=not (pa.warc or pa.headers or pa.body)
   if pa.length is not None:
-    process(pa,buf,pa.filename,pa.offset,pa.length)
+    process(pa,buf,pa.filename,pa.offset,pa.length,whole)
     exit(0)
-
+  if pa.index:
+    CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"')
+    for l in sys.stdin:
+      m=CDX.search(l)
+      if m is None:
+        print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr)
+        exit(2)
+      f="%s/%s/%s"%(m[3],m[4],m[5])
+      process(pa,buf,open(f,'rb',0),int(m[2]),int(m[1]),whole)
+    exit(0)
 if __name__ == "__main__":
   main()
```