Mercurial > hg > cc > cirrus_home
changeset 123:5b0ec642ee9b
silently skip robotstxt
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 28 Jun 2021 17:16:15 +0000 |
parents | 9de06ae73372 |
children | 906b3b0b028d |
files | bin/ix.py |
diffstat | 1 file changed, 5 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/ix.py Mon Jun 28 17:15:19 2021 +0000
+++ b/bin/ix.py Mon Jun 28 17:16:15 2021 +0000
@@ -15,7 +15,7 @@
 
 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
 BINOUT=sys.stdout.buffer
-FPAT="/%s/%s/orig/warc/%s"
+FPAT="/%s/%s/orig/%s/%s"
 
 class HackFormat(argparse.RawDescriptionHelpFormatter):
   def format_help(self):
@@ -200,13 +200,15 @@
   else:
     _output = _output_stdout
   if pa.index:
-    CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"')
+    CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics)/(.*\.gz)"') # no robotstxt yet...
     for l in sys.stdin:
       m=CDX.search(l)
       if m is None:
+        if l.find('/robotstxt/')>-1:
+          continue
         print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr)
         exit(2)
-      f=FPAT%(m[3:6])
+      f=FPAT%(m[3:7])
       process(pa,buf,f,
               int(m[2]),int(m[1]),whole)
   elif pa.length is not None: