annotate lib/python/cdx_segment.py @ 87:b6a5999d8e06

working with locking and copying
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 16 Mar 2021 16:20:02 +0000
parents b5fef78cbb26
children 464d2dfb99c9
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
86
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/python3
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 '''Split out an alphabetical cdx file by segment
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 Usage: cdx_segment.py archive segment-prefix idx_in
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 archive is e.g. 2019-35, assuming /beegfs/common_crawl/CC-MAIN-2019-35 has sub-directories for
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 cdx/warc
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 [all segments, all and only those paths matching segment-prefix*.{0..99}]
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 idx_in is an alphabetically ordered index fragment (one of cdx/warc/...gz), relative to archive/cdx/warc
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 '''
87
b6a5999d8e06 working with locking and copying
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 86
diff changeset
9 from os import listdir, makedirs, lseek, SEEK_END, SEEK_SET, read, write, fsync, system
86
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10 from datetime import datetime
87
b6a5999d8e06 working with locking and copying
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 86
diff changeset
11 from random import sample
b6a5999d8e06 working with locking and copying
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 86
diff changeset
12 from lock import AtomicOpen
b6a5999d8e06 working with locking and copying
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 86
diff changeset
13 import sys,re,gzip
86
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
14
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Command line: archive id (e.g. 2019-35), segment-name prefix, index fragment.
archive = "CC-MAIN-" + sys.argv[1]
adir = "/".join(("/beegfs/common_crawl", archive))
apref = "/".join(("crawl-data", archive))
pref = sys.argv[2]

afn = sys.argv[3]
# Everything before the first '.' of the fragment name; used below as the
# name of a local scratch directory.
ifn = afn.partition('.')[0]

# Segment directory names: the prefix, optional further digits, then a dot
# and a one- or two-digit suffix.
SPAT = re.compile(pref + "[0-9]*\\.[0-9]{1,2}$")
# Index lines: capture the segment id and the sub-directory that follows it
# in the "filename" field.
IPAT = re.compile('"filename": "' + apref + '/segments/([0-9.]*)/([a-z]*)/')

# Entries of the archive directory whose names match SPAT from the start.
segdirs = list(filter(SPAT.match, listdir(adir)))
87
b6a5999d8e06 working with locking and copying
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 86
diff changeset
# The three sub-directory names an index line's "filename" may point into.
rr = ("warc", "robotstxt", "crawldiagnostics")
# n: number of index lines routed so far, per sub-directory name.
n = dict.fromkeys(rr, 0)
# ss: sub-directory name -> {segment directory -> open scratch file}.
ss = {}
for r in rr:
    per_segment = ss[r] = {}
    for s in segdirs:
        # Scratch tree lives under the current directory, rooted at ifn.
        scratch_dir = "/".join((ifn, s, "orig/cdx", r))
        makedirs(scratch_dir, mode=0o755)
        # 'w+' so the same handle can later be read back for copying.
        per_segment[s] = open(scratch_dir + "/cdx", 'w+')
86
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
37
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Directory holding the gzipped index fragments for this archive.
idir = "%s/cdx/warc" % adir

# e: number of index lines that did not match IPAT.
e = 0

st = datetime.now()
print(st, "starting", afn, file=sys.stderr)

# Stream the fragment line by line, appending each line to the scratch file
# selected by the sub-directory name (group 2) and segment id (group 1)
# found in its "filename" field.
with gzip.open("%s/%s" % (idir, afn), 'rt') as f:
    for l in f:
        m = IPAT.search(l)
        if m:
            r = m[2]
            ss[r][m[1]].write(l)
            n[r] += 1
        else:
            # write() takes exactly one string; the original three-argument
            # call raised TypeError on the first unmatched line.  l keeps its
            # trailing newline, so no extra terminator is added.
            sys.stderr.write("bogus: %s %s" % (afn, l))
            e += 1
b5fef78cbb26 working for -t 2 -c 2
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
55
87
b6a5999d8e06 working with locking and copying
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 86
diff changeset
mt = datetime.now()
print(mt, "copying", ifn,
      "%s ok, %d bogus, %d seconds so far" % (':'.join(map(str, n.values())),
                                              e, (mt - st).seconds),
      file=sys.stderr)
# Randomise to try to avoid contention.
# Shuffle *all* selected segments: a hard-coded sample size of 100 raised
# ValueError with fewer matching directories and skipped some with more.
for s in sample(segdirs, len(segdirs)):
    for r in rr:
        of = ss[r][s]
        # Push buffered text down to the OS and on to disk before reading
        # the file back through its raw descriptor.
        of.flush()
        o = of.fileno()
        fsync(o)
        # After writing, the descriptor's offset sits at end-of-file;
        # rewind so the read loop actually sees the data.
        lseek(o, 0, SEEK_SET)
        with AtomicOpen("%s/%s/orig/cdx/%s/cdx" % (adir, s, r), "rb+") as df:
            d = df.fileno()
            # "rb+" starts at offset 0; move to the end so this fragment's
            # lines are appended rather than overwriting existing content.
            # NOTE(review): assumes AtomicOpen does not itself position the
            # file -- confirm against lock.py.
            lseek(d, 0, SEEK_END)
            while True:
                data = read(o, 131072)
                if data == b'':  # end of file reached
                    break
                write(d, data)
        of.close()
b6a5999d8e06 working with locking and copying
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 86
diff changeset
74
b6a5999d8e06 working with locking and copying
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 86
diff changeset
# Remove the scratch directory tree named after the index fragment; res is
# the status value returned by os.system.
res = system("rm -r %s" % (ifn,))

et = datetime.now()
elapsed = (et - st).seconds
print(et, "finished", ifn, res, "%d seconds total" % elapsed, file=sys.stderr)