Mercurial > hg > cc > cirrus_home
comparison bin/ix.py @ 123:5b0ec642ee9b
silently skip robotstxt
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 28 Jun 2021 17:16:15 +0000 |
parents | bc958b776fb8 |
children | b51d65ed6c89 |
comparison
equal
deleted
inserted
replaced
122:9de06ae73372 | 123:5b0ec642ee9b |
---|---|
13 from subprocess import Popen, PIPE | 13 from subprocess import Popen, PIPE |
14 #import asyncio | 14 #import asyncio |
15 | 15 |
16 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]') | 16 HACK_USAGE=regex.compile('\[-x\]\n\s*\[length\] \[offset\] \[filename\]') |
17 BINOUT=sys.stdout.buffer | 17 BINOUT=sys.stdout.buffer |
18 FPAT="/%s/%s/orig/warc/%s" | 18 FPAT="/%s/%s/orig/%s/%s" |
19 | 19 |
20 class HackFormat(argparse.RawDescriptionHelpFormatter): | 20 class HackFormat(argparse.RawDescriptionHelpFormatter): |
21 def format_help(self): | 21 def format_help(self): |
22 global FOO | 22 global FOO |
23 FOO=argparse.RawDescriptionHelpFormatter.format_help(self) | 23 FOO=argparse.RawDescriptionHelpFormatter.format_help(self) |
198 if pa.cmd: | 198 if pa.cmd: |
199 _output = _output_subproc | 199 _output = _output_subproc |
200 else: | 200 else: |
201 _output = _output_stdout | 201 _output = _output_stdout |
202 if pa.index: | 202 if pa.index: |
203 CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/warc/(.*\.gz)"') | 203 CDX=regex.compile('length": "([0-9]*)", "offset": "([0-9]*)", "filename": "crawl-data/([^/]*)/segments/([^/]*)/(warc|crawldiagnostics)/(.*\.gz)"') # no robotstxt yet... |
204 for l in sys.stdin: | 204 for l in sys.stdin: |
205 m=CDX.search(l) | 205 m=CDX.search(l) |
206 if m is None: | 206 if m is None: |
 | 207 if l.find('/robotstxt/')>-1: |
 | 208 continue |
207 print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr) | 209 print("index line problem: \"%s\""%l.lstrip(),file=sys.stderr) |
208 exit(2) | 210 exit(2) |
209 f=FPAT%(m[3:6]) | 211 f=FPAT%(m[3:7]) |
210 process(pa,buf,f, | 212 process(pa,buf,f, |
211 int(m[2]),int(m[1]),whole) | 213 int(m[2]),int(m[1]),whole) |
212 elif pa.length is not None: | 214 elif pa.length is not None: |
213 print(pa.filename,file=sys.stderr) | 215 print(pa.filename,file=sys.stderr) |
214 process(pa,buf,FPAT%tuple(pa.filename.split('/')), | 216 process(pa,buf,FPAT%tuple(pa.filename.split('/')), |