Mercurial > hg > cc > cirrus_work
changeset 297:5e08e6db47ad
add another digit or two (segment #) to key for r_t
| author | Henry S. Thompson <ht@inf.ed.ac.uk> |
|---|---|
| date | Mon, 05 May 2025 20:57:30 +0100 |
| parents | 9608fe0628b0 |
| children | fdec28613df3 |
| files | lib/python/cc/lmh/warc2cdb.py |
| diffstat | 1 files changed, 21 insertions(+), 12 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/python/cc/lmh/warc2cdb.py Mon May 05 20:39:16 2025 +0100 +++ b/lib/python/cc/lmh/warc2cdb.py Mon May 05 20:57:30 2025 +0100 @@ -20,6 +20,8 @@ URI: bytes DATE: bytes OUT: typing.BinaryIO +SEG: bytes +R_T: boolean = False WIN: int = 0 LOSE: int = 0 @@ -39,11 +41,12 @@ codecs.register_error('java_unicode',java_unicode_encode) -def LMHline(wtype: int, buf: memoryview , part: int) -> None: +def LMHline(wtype: int, buf: memoryview, part: int) -> None: global URI, DATE, TUPAT, DPAT, LMPAT, FFPAT, DTAB, DDEL, URI, DATE, OUT, WIN, LOSE - global N, UERRS + global N, UERRS, SEG, R_T m: typing.Match[cython.bytes] | None mm: typing.Match[cython.bytes] | None + lmi: cython.bytes if part==1: if (m:=TUPAT.search(buf)): URI=m[1] @@ -65,7 +68,7 @@ try: lmi = b'%d'%int(email.utils.parsedate_to_datetime(dateTime.decode('utf8')).timestamp()) except OverflowError: - lmi = 32535215999 + lmi = b'32535215999' except (TypeError,IndexError,ValueError) as e: print(dateTime.rstrip(),e,sep='\t',file=sys.stderr) LOSE += 1 @@ -78,26 +81,26 @@ UERRS += 1 # Try just fixing the non-ASCII: URI = URI.decode('utf-8').encode('ascii', errors='java_unicode') - l = len(lmi) - if wtype == warc.REVISIT: - whereami = b" need to fill this in some how" - l += len(whereami) + l: int = len(lmi) + kl: int = (len(DATE)+len(URI)+(len(SEG) if R_T else 0)) OUT.write(b'+') - OUT.write(b'%d'%(len(DATE)+len(URI))) + OUT.write(b'%d'%kl) OUT.write(b',') OUT.write(b'%d'%l) OUT.write(b':') OUT.write(DATE) + if R_T: + OUT.write(SEG) OUT.write(URI) OUT.write(b'->') OUT.write(lmi) - if wtype == warc.REVISIT: - OUT.write(whereami) OUT.write(b'\n') def main(CCdate, segment, outdir, subdir = 'warc', fpat = None): - global OUT, N, WIN, LOSE, UERRS, URI, DATE + global OUT, N, WIN, LOSE, UERRS, URI, DATE, SEG, R_T + SEG = segment.encode('utf8') + R_T = (subdir == 'robotstxt') infile_pat='bash -c "ls /beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*00%s.warc.gz | sort -k8"'%( CCdate, segment, subdir, ("???" if fpat is None else ( (("{%s..%s}"%tuple(fpat.split(','))) if ',' in fpat else fpat)))) @@ -107,7 +110,13 @@ stdout=subprocess.PIPE).stdout.decode('utf8').split(): print(infile_name,file=sys.stderr) WIN = LOSE = N = UERRS = 0 - warc.warc(infile_name,LMHline,[warc.RESP, warc.REVISIT],parts=3) + if subdir in ['warc','robotstxt']: + warc.warc(infile_name,LMHline,[warc.RESP],parts=3) + elif subdir == 'crawldiagnostics': + warc.warc(infile_name,LMHline,[warc.RESP, warc.REVISIT],parts=3) + else: + print('bogus type %s'%subdir,file=sys.stderr) + exit(1) print('%d LM headers, %d win, %d lose, %d non-ASCII URIs'%(N,WIN,LOSE,UERRS), file=sys.stderr) OUT.write(b'\n')
