Mercurial > hg > cc > cirrus_work
changeset 299:83c7ecd61ecf trim
try trimming various more-or-less constant bits of the key and value
| author | Henry S. Thompson <ht@inf.ed.ac.uk> |
|---|---|
| date | Tue, 06 May 2025 16:52:32 +0100 |
| parents | fdec28613df3 |
| children | 1c11117bb01b |
| files | lib/python/cc/lmh/warc2cdb.py |
| diffstat | 1 file changed, 34 insertions(+), 9 deletions(-) [+] |
line wrap: on
line diff
--- a/lib/python/cc/lmh/warc2cdb.py Mon May 05 20:57:46 2025 +0100 +++ b/lib/python/cc/lmh/warc2cdb.py Tue May 06 16:52:32 2025 +0100 @@ -17,16 +17,20 @@ DTAB: bytearray = bytearray(range(256)) DDEL: bytes = b'TZ-:' +OUT: typing.BinaryIO +SEG: bytes +R_T: bool = False + URI: bytes DATE: bytes -OUT: typing.BinaryIO -SEG: bytes -R_T: boolean = False WIN: int = 0 LOSE: int = 0 N: int = 0 UERRS: int = 0 +NON_HTTP: int = 0 +NON_MONTH: int = 0 +NON_ERA: int = 0 def _u_esc(c): if c<65536: @@ -42,8 +46,9 @@ codecs.register_error('java_unicode',java_unicode_encode) def LMHline(wtype: int, buf: memoryview, part: int) -> None: - global URI, DATE, TUPAT, DPAT, LMPAT, FFPAT, DTAB, DDEL, URI, DATE, OUT, WIN, LOSE - global N, UERRS, SEG, R_T + global TUPAT, DPAT, LMPAT, FFPAT, DTAB, DDEL, OUT, WIN, LOSE, NON_HTTP, NON_MONTH + global N, UERRS, SEG, R_T, LM_ERA, LM_ERA_L, C_MONTH, C_MONTH_L, NON_ERA + global DATE, URI m: typing.Match[cython.bytes] | None mm: typing.Match[cython.bytes] | None lmi: cython.bytes @@ -67,6 +72,11 @@ try: try: lmi = b'%d'%int(email.utils.parsedate_to_datetime(dateTime.decode('utf8')).timestamp()) + if LM_ERA: + if lmi.startswith(LM_ERA): # save 2 bytes in ~80% of cases + lmi=b'0'+lmi[LM_ERA_L:] + else: + NON_ERA += 1 except OverflowError: lmi = b'32535215999' except (TypeError,IndexError,ValueError) as e: @@ -74,6 +84,11 @@ LOSE += 1 return DATE=(DATE.translate(DTAB,DDEL)) + if C_MONTH: + if DATE.startswith(C_MONTH): + DATE=DATE[C_MONTH_L:] + else: + NON_MONTH += 1 WIN += 1 try: URI.decode('ascii') @@ -81,6 +96,11 @@ UERRS += 1 # Try just fixing the non-ASCII: URI = URI.decode('utf-8').encode('ascii', errors='java_unicode') + # Could just assume http, but let's check + if URI.startswith(b'http'): + URI=URI[4:] + else: + NON_HTTP += 1 l: int = len(lmi) kl: int = (len(DATE)+len(URI)+(len(SEG) if R_T else 0)) OUT.write(b'+') @@ -96,11 +116,16 @@ OUT.write(lmi) OUT.write(b'\n') -def main(CCdate, segment, outdir, subdir = 'warc', fpat = None): - global OUT, 
N, WIN, LOSE, UERRS, URI, DATE, SEG, R_T +def main(CCdate, segment, outdir, subdir = 'warc', fpat = None, era = '', month = '' ): + global OUT, N, WIN, LOSE, UERRS, SEG, R_T + global NON_HTTP, C_MONTH, C_MONTH_L, NON_MONTH, LM_ERA, LM_ERA_L, NON_ERA SEG = segment.encode('utf8') R_T = (subdir == 'robotstxt') + LM_ERA = era.encode('utf8') + LM_ERA_L = len(era) + C_MONTH = month.encode('utf8') + C_MONTH_L = len(month) infile_pat='bash -c "ls /beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*00%s.warc.gz | sort -k8"'%( CCdate, segment, subdir, ("???" if fpat is None else ( (("{%s..%s}"%tuple(fpat.split(','))) if ',' in fpat else fpat)))) @@ -109,7 +134,7 @@ for infile_name in subprocess.run(infile_pat, shell=True, stdout=subprocess.PIPE).stdout.decode('utf8').split(): print(infile_name,file=sys.stderr) - WIN = LOSE = N = UERRS = 0 + WIN = LOSE = N = UERRS = NON_HTTP = NON_MONTH = NON_ERA = 0 if subdir in ['warc','robotstxt']: warc.warc(infile_name,LMHline,[warc.RESP],parts=3) elif subdir == 'crawldiagnostics': @@ -117,7 +142,7 @@ else: print('bogus type %s'%subdir,file=sys.stderr) exit(1) - print('%d LM headers, %d win, %d lose, %d non-ASCII URIs'%(N,WIN,LOSE,UERRS), + print('%d LM headers, %d win, %d lose, %d non-ASCII URIs, %d non-close, %d dodgy schemes, %d dodgy WARC dates'%(N,WIN,LOSE,UERRS,NON_ERA,NON_HTTP,NON_MONTH), file=sys.stderr) OUT.write(b'\n')
