#!/usr/bin/python3
'''Split out an alphabetical cdx file by segment

Usage: cdx_segment.py archive segment-prefix idx_in

  archive is e.g. 2019-35, assuming /beegfs/common_crawl/CC-MAIN-2019-35
   has sub-directories for cdx/warc
   [all segments, all and only those paths matching segment-prefix*.{0..99}]
  idx_in is an alphabetically ordered index fragment (one of cdx/warc/...gz),
   relative to archive/cdx/warc

Every index line is appended to
  <archive dir>/<segment>/orig/cdx/<resource>/cdx
where <segment> and <resource> are taken from the "filename" field of the
line.  Lines without a recognisable "filename" field are reported on stderr
and counted as bogus.
'''
import gzip
from contextlib import ExitStack
from os import listdir, makedirs
from datetime import datetime
import sys,re

# Directory holding one CC-MAIN-* sub-directory per archive
ROOT="/beegfs/common_crawl"

# Resource sub-directories an index line may point into; the order is also
#  the order of the colon-separated counts in the final summary line
RESOURCES=("warc","robotstxt","crawldiagnostics")


def segment_pattern(pref):
    '''Return the compiled pattern selecting segment directory names.

    pref is interpolated as-is, so it is treated as a regular expression
    fragment, not as a literal string.  The pattern is meant to be used
    with .match(), i.e. anchored at the start of the name.
    '''
    return re.compile("%s[0-9]*\\.[0-9]{1,2}$"%pref)


def index_pattern(apref):
    '''Return the compiled pattern which extracts (segment, resource)
    from the "filename" field of an index line.

    apref is the crawl-data/CC-MAIN-... prefix of the archive.
    '''
    return re.compile('"filename": "%s/segments/([0-9.]*)/([a-z]*)/'%apref)


def open_outputs(stack,adir,segdirs):
    '''Open, for appending, one cdx file per resource per segment.

    Missing orig/cdx/<resource> directories are created.  Every file is
    registered with stack (a contextlib.ExitStack), so all of them get
    closed when the stack unwinds, including when a later open fails.

    Returns a dict: resource -> segment -> open text file.
    '''
    outputs={}
    for r in RESOURCES:
        outputs[r]=rd={}
        for s in segdirs:
            rdir="%s/%s/orig/cdx/%s"%(adir,s,r)
            makedirs(rdir,0o755,exist_ok=True)
            rd[s]=stack.enter_context(open("%s/cdx"%rdir,'at'))
    return outputs


def split_index(lines,ipat,outputs,counts,afn):
    '''Route each of lines to the output for its segment and resource.

    lines is an iterable of index lines (with their line ends), ipat the
    result of index_pattern, outputs the result of open_outputs.
    counts (resource -> int) is updated in place.  afn is only used to
    label error messages.

    Returns the number of bogus lines, i.e. those ipat did not match.
    Raises KeyError for a line which names a resource or segment for
    which there is no output.
    '''
    bogus=0
    for l in lines:
        m=ipat.search(l)
        if m:
            r=m[2]
            outputs[r][m[1]].write(l)
            counts[r]+=1
        else:
            # write() accepts exactly one string; l still has its newline
            sys.stderr.write("bogus: %s %s"%(afn,l))
            bogus+=1
    return bogus


def main(argv=None,root=ROOT):
    '''Run the split as described in the module docstring.

    argv defaults to sys.argv; root is the directory containing the
    CC-MAIN-* archive directories.
    '''
    if argv is None:
        argv=sys.argv
    if len(argv)<4:
        sys.exit("Usage: cdx_segment.py archive segment-prefix idx_in")

    archive="CC-MAIN-%s"%argv[1]
    adir="%s/%s"%(root,archive)
    apref="crawl-data/%s"%archive
    pref=argv[2]
    afn=argv[3]

    spat=segment_pattern(pref)
    ipat=index_pattern(apref)

    segdirs=[d for d in listdir(adir) if spat.match(d)]
    counts=dict.fromkeys(RESOURCES,0)
    idir="%s/cdx/warc"%adir

    # The stack closes every output file however this block is left
    with ExitStack() as stack:
        outputs=open_outputs(stack,adir,segdirs)

        st=datetime.now()
        print(st,"starting",afn,file=sys.stderr)

        with gzip.open("%s/%s"%(idir,afn),'rt') as f:
            bogus=split_index(f,ipat,outputs,counts,afn)

    et=datetime.now()
    # total_seconds() rather than .seconds, which drops whole days
    print(et,"finished",afn,
          "%s ok, %d bogus, %d seconds elapsed"%(':'.join(map(str,counts.values())),
                                                 bogus,
                                                 int((et-st).total_seconds())),
          file=sys.stderr)


if __name__=="__main__":
    main()