Mercurial > hg > cc > cirrus_work
changeset 286:147f648e4e5e
trying to recover from partial, not-ordered, run of segs 0--7
| author | Henry S. Thompson <ht@inf.ed.ac.uk> |
|---|---|
| date | Mon, 24 Mar 2025 14:30:32 +0000 |
| parents | 0ec17b2aab72 |
| children | fe78af4ea7c5 |
| files | lib/python/cc/lmh/hack.py lib/python/cc/lmh/warc2cdb.py |
| diffstat | 2 files changed, 33 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/python3
"""Rewrite 'NaM[,K]' hunks read from stdin as one brace expression on stdout.

Source path: lib/python/cc/lmh/hack.py

Input is a sequence of hunks.  Each hunk is a header line that matches ``p``
at its start, followed by one or more payload lines.  The first four
characters of every payload line are discarded, as is its newline.

* ``NaM``   consumes one payload line and contributes its text as-is.
* ``NaM,K`` consumes K-M+1 payload lines and contributes ``{first..last}``
  (or just ``first`` when K == M).

The contributions are joined with commas and wrapped in braces, e.g.
``{123,{130..132}}``.  No trailing newline is written.  Any malformed input
is reported on stderr and the exit status is 1.
"""
import re
import sys

# Header of one hunk: group 2 is M, group 3 is the optional ',K' tail and
# group 4 is K.  Group 1 (N) is matched but never used.
p = re.compile('([0-9]*)a([0-9]*)(,([0-9]*))?')


class HunkError(ValueError):
    """Input was not a well-formed sequence of hunks.

    ``args`` holds the pieces of the message that ``main`` prints.
    """


def _payload(lines, header):
    """Return the next payload line without its 4-char prefix and newline.

    Raises HunkError if the input ends before the line promised by *header*.
    """
    raw = next(lines, '')
    if not raw:
        raise HunkError('truncated input after', header)
    # rstrip rather than [:-1]: a final line with no newline keeps its last
    # character.
    return raw[4:].rstrip('\n')


def brace_expression(lines):
    """Return the brace expression for an iterable of input lines.

    Args:
        lines: iterable of text lines (e.g. ``sys.stdin``), headers and
            payload lines interleaved as described in the module docstring.

    Returns:
        The expression as a string.

    Raises:
        HunkError: a header does not match ``p``, a range is empty-valued
            or runs backwards, or the input ends in the middle of a hunk.
    """
    lines = iter(lines)
    items = []
    for header in lines:
        m = p.match(header)
        if not m:
            # Same three values the original printed for this case.
            raise HunkError('no match', header, m)
        first = _payload(lines, header)
        if not m[3]:
            items.append(first)
            continue
        try:
            extra = int(m[4]) - int(m[2])
        except ValueError:
            # The pattern allows empty digit groups, e.g. 'a,'.
            raise HunkError('bad range', header) from None
        if extra < 0:
            # Counting down from a negative value would never reach zero.
            raise HunkError('bad range', header)
        last = first
        for _ in range(extra):
            # Intermediate lines are consumed; only the last one is kept.
            last = _payload(lines, header)
        items.append('{%s..%s}' % (first, last) if extra else first)
    # NOTE(review): with no hunks at all this yields a lone '}', exactly as
    # the original script did -- confirm whether that is wanted.
    return ('{' if items else '') + ','.join(items) + '}'


def main():
    """Filter stdin to stdout; on bad input print to stderr and exit 1."""
    try:
        sys.stdout.write(brace_expression(sys.stdin))
    except HunkError as err:
        print(*err.args, file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
--- a/lib/python/cc/lmh/warc2cdb.py Sat Mar 08 22:31:14 2025 +0000 +++ b/lib/python/cc/lmh/warc2cdb.py Mon Mar 24 14:30:32 2025 +0000 @@ -7,6 +7,7 @@ import cython, typing import email.utils from urllib.parse import quote +import subprocess TUPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) DPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE) @@ -64,14 +65,15 @@ OUT.write(lmi) OUT.write(b'\n') -def main(CCdate, segment, nFiles, outdir): +def main(CCdate, segment, outdir, fpat="???"): global OUT - infile_pat='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/warc/*00???.warc.gz'%( - CCdate, segment) + infile_pat='bash -c "ls /beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/warc/*00%s.warc.gz | sort -k8"'%( + CCdate, segment, fpat) with open((outfile_name:="%s/%s/lmh.cdb_in"%(outdir,segment)),'wb') as OUT: - for infile_name in glob.glob(infile_pat): + for infile_name in subprocess.run(infile_pat, shell=True, + stdout=subprocess.PIPE).stdout.decode('utf8').split(): print(infile_name,file=sys.stderr) warc.warc(infile_name,LMHline,[b'response'],parts=3) OUT.write(b'\n')
