changeset 123:5b0ec642ee9b

Silently skip robotstxt index lines; accept crawldiagnostics as well as warc paths
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 28 Jun 2021 17:16:15 +0000
parents 9de06ae73372
children 906b3b0b028d
files bin/ix.py
diffstat 1 files changed, 5 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/bin/ix.py	Mon Jun 28 17:15:19 2021 +0000
+++ b/bin/ix.py	Mon Jun 28 17:16:15 2021 +0000
@@ -15,7 +15,7 @@
 
 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]')
 BINOUT=sys.stdout.buffer
-FPAT="/%s/%s/orig/warc/%s"
+FPAT="/%s/%s/orig/%s/%s"
 
 class HackFormat(argparse.RawDescriptionHelpFormatter):
   def format_help(self):
@@ -200,13 +200,15 @@
   else:
     _output = _output_stdout
   if pa.index:
-    CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"')
+    CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics)/(.*\.gz)"') # no robotstxt yet...
     for l in sys.stdin:
       m=CDX.search(l)
       if m is None:
+        if l.find('/robotstxt/')>-1:
+          continue
         print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr)
         exit(2)
-      f=FPAT%(m[3:6])
+      f=FPAT%(m[3:7])
       process(pa,buf,f,
               int(m[2]),int(m[1]),whole)
   elif pa.length is not None: