view lib/python/cdx_segment.py @ 86:b5fef78cbb26

working for -t 2 -c 2
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 15 Mar 2021 14:26:42 +0000
parents
children b6a5999d8e06
line wrap: on
line source

#!/usr/bin/python3
'''Split out an alphabetical cdx file by segment
Usage: cdx_segment.py archive segment-prefix idx_in
 archive is e.g. 2019-35, assuming /beegfs/common_crawl/CC-MAIN-2019-35 has sub-directories for
  cdx/warc
  [all segments, all and only those paths matching segment-prefix*.{0..99}]
 idx_in is an alphabetically ordered index fragment (one of cdx/warc/...gz), relative to archive/cdx/warc
'''
import gzip
from os import listdir, makedirs
from datetime import datetime
import sys,re

# Positional arguments, as described in the module docstring:
#   1: archive id (e.g. 2019-35), 2: segment-name prefix, 3: index fragment
archive=f"CC-MAIN-{sys.argv[1]}"
# Directory holding this archive's segment directories and its cdx/warc index
adir=f"/beegfs/common_crawl/{archive}"
# Leading part of the "filename" paths that IPAT looks for in index lines
apref=f"crawl-data/{archive}"
pref=sys.argv[2]

afn=sys.argv[3]

# Directory names to process: pref, any further digits, a dot, one or two digits
SPAT=re.compile(f"{pref}[0-9]*\\.[0-9]{{1,2}}$")
# From an index line's "filename" field, capture the segment id (group 1)
# and the path component that follows it (group 2)
IPAT=re.compile(f'"filename": "{apref}/segments/([0-9.]*)/([a-z]*)/')

# Entries of the archive directory whose names match the segment pattern
segdirs=list(filter(SPAT.match,listdir(adir)))
# ss: record type -> {segment directory name -> open output file}
ss={}
# n: record type -> number of index lines written so far
n={}
for rtype in ("warc","robotstxt","crawldiagnostics"):
  ss[rtype]=outs={}
  n[rtype]=0
  for seg in segdirs:
    outdir="/".join((adir,seg,"orig/cdx",rtype))
    makedirs(outdir,mode=0o755,exist_ok=True)
    # Append mode: lines from successive index fragments accumulate in one file
    outs[seg]=open(outdir+"/cdx",mode='at')

# Directory containing the gzipped index fragments for this archive
idir="%s/cdx/warc"%adir

# Number of input lines in which IPAT found no match
e=0

st=datetime.now()
print(st,"starting",afn,file=sys.stderr)

# Copy every index line to the cdx file for its segment and record type.
# NOTE(review): a matched line whose segment or record type has no entry in
# ss raises KeyError here -- confirm that stopping is the intended behaviour.
with gzip.open("%s/%s"%(idir,afn),'rt') as f:
  for line in f:
    m=IPAT.search(line)
    if m:
      r=m[2]
      ss[r][m[1]].write(line)
      n[r]+=1
    else:
      # print() rather than sys.stderr.write(): write() accepts exactly one
      # string, so passing three arguments raised TypeError instead of
      # reporting the line.
      print("bogus:",afn,line.rstrip("\n"),file=sys.stderr)
      e+=1

# Input exhausted: close every per-type, per-segment output file
for gg in ss.values():
  for g in gg.values():
    g.close()

et=datetime.now()
# Summary on stderr: per-type line counts joined by ':', unmatched-line count,
# and elapsed wall-clock time.  total_seconds() covers the whole interval,
# whereas timedelta.seconds is only the sub-day remainder and would
# under-report a run lasting more than 24 hours.
print(et,"finished",afn,"%s ok, %d bogus, %d seconds elapsed"%(':'.join(map(str,n.values())),
                                                               e,int((et-st).total_seconds())),file=sys.stderr)