annotate lib/python/cdx_segment.py @ 86:b5fef78cbb26

working for -t 2 -c 2
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 15 Mar 2021 14:26:42 +0000
parents
children b6a5999d8e06
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
86
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/python3
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 '''Split out an alphabetical cdx file by segment
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 Usage: cdx_segment.py archive segment-prefix idx_in
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 archive is e.g. 2019-35, assuming /beegfs/common_crawl/CC-MAIN-2019-35 has sub-directories for
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 cdx/warc
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 [all segments, all and only those paths matching segment-prefix*.{0..99}]
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 idx_in is an alphabetically ordered index fragment (one of cdx/warc/...gz), relative to archive/cdx/warc
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 '''
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9 import gzip
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10 from os import listdir, makedirs
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 from datetime import datetime
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12 import sys,re
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
13
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Command-line arguments: archive id (e.g. 2019-35), segment-name prefix and
# the index fragment to split.  Exit with a usage line instead of a bare
# IndexError when any of them is missing; extra arguments are still ignored.
if len(sys.argv) < 4:
    sys.exit("Usage: cdx_segment.py archive segment-prefix idx_in")

archive = "CC-MAIN-%s" % sys.argv[1]
adir = "/beegfs/common_crawl/%s" % archive   # archive root directory
apref = "crawl-data/%s" % archive            # path prefix expected in index "filename" fields
pref = sys.argv[2]

afn = sys.argv[3]

# Directory names accepted as segments: pref, optional digits, a '.', then a
# one- or two-digit suffix.  NOTE: pref is interpolated as a regex fragment,
# not as a literal string.
SPAT = re.compile(r"%s[0-9]*\.[0-9]{1,2}$" % pref)
# Captures (1) the segment name and (2) the lower-case subdirectory name from
# the "filename" field of an index line.  apref is escaped so it can only
# match literally.
IPAT = re.compile('"filename": "%s/segments/([0-9.]*)/([a-z]*)/' % re.escape(apref))
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
23
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Entries of the archive directory whose names SPAT accepts as segments.
segdirs = list(filter(SPAT.match, listdir(adir)))

# ss maps subdirectory name -> {segment name -> open append-mode text file};
# n maps subdirectory name -> count of index lines written so far.
ss = {}
n = {}
for kind in ("warc", "robotstxt", "crawldiagnostics"):
    handles = ss[kind] = {}
    n[kind] = 0
    for seg in segdirs:
        out_dir = "%s/%s/orig/cdx/%s" % (adir, seg, kind)
        # Create the per-segment output directory if it is not there yet.
        makedirs(out_dir, mode=0o755, exist_ok=True)
        # Append mode: an existing cdx file is extended, not truncated.
        handles[seg] = open("%s/cdx" % out_dir, 'at')
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
34
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Directory holding the gzipped index fragments for this archive.
idir = "%s/cdx/warc" % adir

# Number of index lines that did not match IPAT.
e = 0

st = datetime.now()
print(st, "starting", afn, file=sys.stderr)

try:
    with gzip.open("%s/%s" % (idir, afn), 'rt') as f:
        for l in f:
            m = IPAT.search(l)
            if m:
                # m[1] is the segment name, m[2] the subdirectory name.
                # A segment or subdirectory with no open output file raises
                # KeyError here; that is deliberately not silenced.
                r = m[2]
                ss[r][m[1]].write(l)
                n[r] += 1
            else:
                # write() accepts exactly one string, so build the message
                # first; l already ends with its own newline.
                sys.stderr.write("bogus: %s %s" % (afn, l))
                e += 1
finally:
    # Close every output file even if reading or dispatching failed, so
    # that whatever was written so far is flushed.
    for gg in ss.values():
        for g in gg.values():
            g.close()
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
56
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Final report on stderr: per-subdirectory line counts joined with ':', the
# number of unmatched lines, and the elapsed wall-clock time.
et = datetime.now()
# total_seconds() covers the whole interval; the .seconds attribute drops
# the days component and would wrap for runs longer than 24 hours.
print(et, "finished", afn,
      "%s ok, %d bogus, %d seconds elapsed" % (':'.join(map(str, n.values())),
                                               e, (et - st).total_seconds()),
      file=sys.stderr)
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
60