Mercurial > hg > cc > cirrus_work
changeset 284:e461601592dd
try to do the whole thing in one go
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 07 Mar 2025 21:17:47 +0000 |
parents | 6739e08d19ff |
children | 0ec17b2aab72 |
files | lib/python/cc/lmh/warc2cdb.py |
diffstat | 1 files changed, 78 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python3
# Source path (from the changeset diff header): lib/python/cc/lmh/warc2cdb.py
'''Produce cdb_input-style files from warc responses with lmh header value

    Usage: warc2cdb.py CC-date segment nFiles output-dir

    nFiles is required positionally by main() but is currently unused.'''

import re,warc,sys,glob,codecs,os.path
import cython, typing
import email.utils

# Header-extraction patterns.  All of them are bytes patterns, so every
# subject and replacement string used with them must be bytes as well.
TUPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
DPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
LMPAT: typing.Pattern[cython.bytes] = re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
# Matches a trailing 'GMT' that is glued to the preceding character.
FFPAT: typing.Pattern[cython.bytes] = re.compile(b'([^ ])GMT$')

# Identity translation table plus the set of bytes to delete: together they
# strip 'T', 'Z', '-' and ':' from the WARC-Date value.
DTAB: cython.bytes = bytearray(range(256))
DDEL: cython.bytes = b'TZ-:'

# Per-record state shared between successive LMHline calls, and the output
# stream opened by main().
URI: cython.bytes
DATE: cython.bytes
OUT: typing.BinaryIO


# NOTE(review): the bare name `char` in the annotation below is not defined
# in plain Python, so defining this function under a plain interpreter would
# raise NameError -- presumably the file is meant to be compiled with Cython;
# confirm whether `cython.char[::1]` was intended.
def LMHline(wtype: cython.bytes, buf: char[::1] , part: int) -> None:
    '''Callback handed to warc.warc: emit one cdb-input line per record
    that has a parseable Last-Modified header.

    When part is 1, the WARC-Target-URI and WARC-Date values found in buf
    are remembered in the globals URI and DATE.  For any other part, buf is
    searched for a Last-Modified header; if one is found and can be parsed,
    a line of the form

        +<keylen>,<datalen>:<DATE><URI>-><timestamp>

    is written to OUT, where DATE has had 'T', 'Z', '-' and ':' removed and
    timestamp is the Last-Modified time as integer seconds since the epoch.
    Unparseable dates are reported on stderr and skipped.

    wtype is not used here.  (Presumably part 1 is the WARC header and the
    other part is the HTTP header of the same record -- confirm against
    the warc module.)

    Raises ValueError if a part-1 buffer lacks a target URI or a date.'''
    global URI, DATE
    m: typing.Match[cython.bytes] | None
    mm: typing.Match[cython.bytes] | None
    if part==1:
        if (m:=TUPAT.search(buf)):
            URI=m[1]
        else:
            raise ValueError(b"No target URI in %s ??"%buf)
        if (md:=DPAT.search(buf)):
            DATE=md[1]
        else:
            raise ValueError(b"No date in %s ??"%buf)
        return
    mm=LMPAT.search(buf)
    if not mm:
        # No Last-Modified header: nothing to record
        return
    dateTime=mm[1]
    if dateTime.endswith(b'GMT'):
        # Insert the missing space before a glued-on 'GMT'.  The replacement
        # must be bytes: a str template with a bytes pattern raises
        # TypeError as soon as the pattern actually matches.
        dateTime = FFPAT.sub(b'\\1 GMT',dateTime)
    try:
        try:
            lmi = b'%d'%int(email.utils.parsedate_to_datetime(dateTime.decode('utf8')).timestamp())
        except OverflowError:
            # Fallback value for dates too large to convert; kept as bytes
            # so that len() and the write below work as on the normal path.
            lmi = b'32535215999'
    except (TypeError,IndexError,ValueError) as e:
        print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
        return
    DATE=(DATE.translate(DTAB,DDEL))
    # One write per record: '+klen,dlen:key->data\n'
    OUT.write(b'+%d,%d:%s%s->%s\n'%(len(DATE)+len(URI),len(lmi),DATE,URI,lmi))


def main(CCdate, segment, nFiles, outdir):
    '''Process every matching warc file of one segment into
    <outdir>/<segment>/lmh.cdb_in and print that file name on stdout.

    CCdate and segment are substituted into the input glob pattern;
    nFiles is accepted but not used.  Each input file name is echoed to
    stderr before it is processed.  The output is terminated by an empty
    line after the last record.'''
    global OUT

    infile_pat='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/warc/*00???.warc.gz'%(
        CCdate, segment)
    outfile_name="%s/%s/lmh.cdb_in"%(outdir,segment)

    with open(outfile_name,'wb') as OUT:
        for infile_name in glob.glob(infile_pat):
            print(infile_name,file=sys.stderr)
            # Presumably calls LMHline for the selected parts of each
            # 'response' record -- see the warc module for what parts=3 means.
            warc.warc(infile_name,LMHline,[b'response'],parts=3)
        OUT.write(b'\n')
    print(outfile_name)


if __name__ == '__main__':
    sys.exit(main(*sys.argv[1:]))