Mercurial > hg > cc > cirrus_home
comparison lib/python/cdx_segment.py @ 86:b5fef78cbb26
working for -t 2 -c 2
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 15 Mar 2021 14:26:42 +0000 |
parents | |
children | b6a5999d8e06 |
comparison
equal
deleted
inserted
replaced
85:e5d5958bf3fe | 86:b5fef78cbb26 |
---|---|
#!/usr/bin/python3
'''Split out an alphabetical cdx file by segment
 Usage: cdx_segment.py archive segment-prefix idx_in
 archive is e.g. 2019-35, assuming /beegfs/common_crawl/CC-MAIN-2019-35 has sub-directories for
   cdx/warc
   [all segments, all and only those paths matching segment-prefix*.{0..99}]
 idx_in is an alphabetically ordered index fragment (one of cdx/warc/...gz), relative to archive/cdx/warc

 Each input line is appended to
   <archive dir>/<segment>/orig/cdx/<type>/cdx
 where <segment> and <type> (warc, robotstxt or crawldiagnostics) are read
 from the line's "filename" field.  Lines with no recognisable "filename"
 field are reported on stderr and counted as bogus.
'''
import gzip
from contextlib import ExitStack
from os import listdir, makedirs
from datetime import datetime
import sys,re

# Directory holding one CC-MAIN-* sub-directory per archive
ROOT = "/beegfs/common_crawl"
# Record types: the path component that follows the segment id in "filename"
RTYPES = ("warc", "robotstxt", "crawldiagnostics")


def split_index(archive_id, pref, afn, root=ROOT):
    '''Distribute the lines of one gzipped index fragment over per-segment files.

    archive_id -- archive label without the CC-MAIN- prefix, e.g. "2019-35"
    pref       -- leading characters of the segment directory names to use
                  (interpolated into a regular expression, as before)
    afn        -- name of the fragment, relative to <archive dir>/cdx/warc
    root       -- directory containing the CC-MAIN-* archive directories

    Returns (counts, bogus): counts maps each record type in RTYPES to the
    number of lines written for it, bogus is the number of lines that had no
    recognisable "filename" field.

    Raises KeyError if a line names a segment that was not selected by pref
    or a record type outside RTYPES.
    '''
    archive = "CC-MAIN-%s" % archive_id
    adir = "%s/%s" % (root, archive)
    apref = "crawl-data/%s" % archive

    spat = re.compile("%s[0-9]*\\.[0-9]{1,2}$" % pref)
    ipat = re.compile('"filename": "%s/segments/([0-9.]*)/([a-z]*)/' % apref)

    segdirs = [d for d in listdir(adir) if spat.match(d)]
    counts = dict.fromkeys(RTYPES, 0)
    bogus = 0

    # The ExitStack closes every output file on the way out, including when
    # reading the fragment or looking up a segment raises.
    with ExitStack() as stack:
        outs = {}
        for r in RTYPES:
            outs[r] = rd = {}
            for s in segdirs:
                rdir = "%s/%s/orig/cdx/%s" % (adir, s, r)
                makedirs(rdir, 0o755, exist_ok=True)
                # Append mode: several fragments contribute to the same file
                rd[s] = stack.enter_context(open("%s/cdx" % rdir, 'at'))

        with gzip.open("%s/cdx/warc/%s" % (adir, afn), 'rt') as f:
            for l in f:
                m = ipat.search(l)
                if m:
                    r = m[2]
                    outs[r][m[1]].write(l)
                    counts[r] += 1
                else:
                    # write() takes a single string; l normally carries its
                    # own trailing newline
                    sys.stderr.write("bogus: %s %s" % (afn, l))
                    bogus += 1
    return counts, bogus


def main(argv):
    '''Command-line entry point: argv is [prog, archive, segment-prefix, idx_in].'''
    if len(argv) < 4:
        sys.exit(__doc__)
    archive_id, pref, afn = argv[1:4]

    st = datetime.now()
    print(st, "starting", afn, file=sys.stderr)

    counts, bogus = split_index(archive_id, pref, afn)

    et = datetime.now()
    # total_seconds() rather than .seconds, which wraps after 24 hours
    print(et, "finished", afn,
          "%s ok, %d bogus, %d seconds elapsed" % (
              ':'.join(map(str, counts.values())),
              bogus, int((et - st).total_seconds())),
          file=sys.stderr)


if __name__ == "__main__":
    main(sys.argv)