comparison lib/python/cdx_segment.py @ 86:b5fef78cbb26

working for -t 2 -c 2
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 15 Mar 2021 14:26:42 +0000
parents
children b6a5999d8e06
comparison
equal deleted inserted replaced
85:e5d5958bf3fe 86:b5fef78cbb26
1 #!/usr/bin/python3
2 '''Split out a alphabetical cdx file by segment
3 Usage: cdx_segment.py archive segment-prefix idx_in
4 archive is e.g. 2019-35, assuming /beegfs/common_crawl/CC-MAIN-2019-35 has sub-directories for
5 cdx/warc
6 [all segments, all and only those paths matching segment-prefix*.{0..99}]
7 idx_in is an alphabetically ordered index fragment (one of cdx/warc/...gz), relative to archive/cdx/warc
8 '''
9 import gzip
10 from os import listdir, makedirs
11 from datetime import datetime
12 import sys,re
13
14 archive="CC-MAIN-%s"%sys.argv[1]
15 adir="/beegfs/common_crawl/%s"%archive
16 apref="crawl-data/%s"%archive
17 pref=sys.argv[2]
18
19 afn=sys.argv[3]
20
21 SPAT=re.compile("%s[0-9]*\\.[0-9]{1,2}$"%pref)
22 IPAT=re.compile('"filename": "%s/segments/([0-9.]*)/([a-z]*)/'%apref)
23
24 segdirs=[d for d in listdir(adir) if SPAT.match(d)]
25 ss={}
26 n={}
27 for r in ("warc","robotstxt","crawldiagnostics"):
28 ss[r]=rd=dict()
29 n[r]=0
30 for s in segdirs:
31 rdir="%s/%s/orig/cdx/%s"%(adir,s,r)
32 makedirs(rdir,0o755,exist_ok=True)
33 rd[s]=open("%s/cdx"%rdir,'at')
34
35 idir="%s/cdx/warc"%adir
36
37 e=0
38
39 st=datetime.now()
40 print(st,"starting",afn,file=sys.stderr)
41
42 with gzip.open("%s/%s"%(idir,afn),'rt') as f:
43 for l in f:
44 m=IPAT.search(l)
45 if m:
46 r=m[2]
47 ss[r][m[1]].write(l)
48 n[r]+=1
49 else:
50 sys.stderr.write("bogus: ",afn,l)
51 e+=1
52
53 for gg in ss.values():
54 for g in gg.values():
55 g.close()
56
57 et=datetime.now()
58 print(et,"finished",afn,"%s ok, %d bogus, %d seconds elapsed"%(':'.join(map(str,n.values())),
59 e,(et-st).seconds),file=sys.stderr)
60