changeset 298:fdec28613df3 default

robotstxt now working?
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 05 May 2025 20:57:46 +0100
parents 5e08e6db47ad
children 83c7ecd61ecf
files lib/python/cc/ix.py
diffstat 1 files changed, 1 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/ix.py	Mon May 05 20:57:30 2025 +0100
+++ b/lib/python/cc/ix.py	Mon May 05 20:57:46 2025 +0100
@@ -284,7 +284,7 @@
       launch(pa.cmd)
   # three different ways to process
   if pa.index:
-    CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics)/(.*\.gz)"') # no robotstxt yet...
+    CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics|robotstxt)/(.*\.gz)"') # robotstxt works?
     for l in sys.stdin:
       m=CDX.search(l)
       if m is None: