Mercurial > hg > cc > cirrus_home
view lib/python/cdx_segment.py @ 86:b5fef78cbb26
working for -t 2 -c 2
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 15 Mar 2021 14:26:42 +0000 |
parents | |
children | b6a5999d8e06 |
line wrap: on
line source
#!/usr/bin/python3
'''Split out an alphabetical cdx file by segment

Usage: cdx_segment.py archive segment-prefix idx_in

archive is e.g. 2019-35, assuming /beegfs/common_crawl/CC-MAIN-2019-35
 has sub-directories for cdx/warc
 [all segments, all and only those paths matching segment-prefix*.{0..99}]
idx_in is an alphabetically ordered index fragment (one of cdx/warc/...gz),
 relative to archive/cdx/warc

Every index line whose "filename" field names a segment and a record type
 (warc, robotstxt or crawldiagnostics) is appended to
   <archive directory>/<segment>/orig/cdx/<record type>/cdx
Lines in which no such "filename" field is found are echoed to stderr and
 counted as bogus.  Start and finish messages, with per-type line counts,
 the bogus count and the elapsed time, also go to stderr.
'''
import gzip
from contextlib import ExitStack
from os import listdir, makedirs
from datetime import datetime
import sys,re

if len(sys.argv)<4:
    # Show the usage text instead of failing with a bare IndexError below
    print(__doc__,file=sys.stderr)
    sys.exit(2)

archive="CC-MAIN-%s"%sys.argv[1]
adir="/beegfs/common_crawl/%s"%archive
apref="crawl-data/%s"%archive
pref=sys.argv[2]
afn=sys.argv[3]

# Segment directory names: the prefix, optional further digits, a dot and
#  a one- or two-digit suffix
SPAT=re.compile("%s[0-9]*\\.[0-9]{1,2}$"%pref)
# Group 1 is the segment name, group 2 the record type sub-directory
IPAT=re.compile('"filename": "%s/segments/([0-9.]*)/([a-z]*)/'%apref)

segdirs=[d for d in listdir(adir) if SPAT.match(d)]
ss={}   # record type -> {segment name -> open output file}
n={}    # record type -> number of index lines written
e=0     # number of index lines with no recognisable filename field
idir="%s/cdx/warc"%adir

# One output file per (record type, segment) pair is held open for the
#  whole run; the ExitStack guarantees they are all closed (and so
#  flushed) even if reading or writing fails part-way through.
with ExitStack() as outputs:
    for r in ("warc","robotstxt","crawldiagnostics"):
        ss[r]=rd=dict()
        n[r]=0
        for s in segdirs:
            rdir="%s/%s/orig/cdx/%s"%(adir,s,r)
            makedirs(rdir,0o755,exist_ok=True)
            # Opened for append, so existing contents are kept
            rd[s]=outputs.enter_context(open("%s/cdx"%rdir,'at'))

    st=datetime.now()
    print(st,"starting",afn,file=sys.stderr)
    with gzip.open("%s/%s"%(idir,afn),'rt') as f:
        for l in f:
            m=IPAT.search(l)
            if m:
                r=m[2]
                # NOTE(review): a line naming a record type or segment for
                #  which no output file was opened above raises KeyError
                #  here -- confirm whether such lines should instead be
                #  skipped or counted as bogus.
                ss[r][m[1]].write(l)
                n[r]+=1
            else:
                # file.write() accepts only a single string, so use print
                #  to report the several fields; the line's own newline is
                #  dropped because print supplies one.
                print("bogus: ",afn,l.rstrip("\n"),file=sys.stderr)
                e+=1

et=datetime.now()
# total_seconds() covers the whole interval, whereas the .seconds
#  attribute leaves out any whole days
print(et,"finished",afn,
      "%s ok, %d bogus, %d seconds elapsed"%(':'.join(map(str,n.values())),
                                             e,
                                             int((et-st).total_seconds())),
      file=sys.stderr)