# HG changeset patch
# User Henry S. Thompson
# Date 1624900575 0
# Node ID 5b0ec642ee9be024e28ac4390629e1c121b0ef58
# Parent 9de06ae73372a345612e8f87dc9255c5d1d4d6a4
silently skip robotstxt

diff -r 9de06ae73372 -r 5b0ec642ee9b bin/ix.py
--- a/bin/ix.py	Mon Jun 28 17:15:19 2021 +0000
+++ b/bin/ix.py	Mon Jun 28 17:16:15 2021 +0000
@@ -15,7 +15,7 @@
 
 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
 BINOUT=sys.stdout.buffer
-FPAT="/%s/%s/orig/warc/%s"
+FPAT="/%s/%s/orig/%s/%s"
 
 class HackFormat(argparse.RawDescriptionHelpFormatter):
   def format_help(self):
@@ -200,13 +200,15 @@
   else:
     _output = _output_stdout
   if pa.index:
-    CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"')
+    CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics)/(.*\.gz)"') # no robotstxt yet...
     for l in sys.stdin:
       m=CDX.search(l)
       if m is None:
+        if l.find('/robotstxt/')>-1:
+          continue
         print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr)
         exit(2)
-      f=FPAT%(m[3:6])
+      f=FPAT%(m[3:7])
       process(pa,buf,f,
               int(m[2]),int(m[1]),whole)
   elif pa.length is not None: