annotate lib/python/cdx_segment.py @ 134:d3ef00af2064

add usage/help info
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 14 Jul 2021 16:49:54 +0000
parents 464d2dfb99c9
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
#!/usr/bin/python3
'''Split out an alphabetical cdx file by segment

 Usage: cdx_segment.py archive segment-prefix idx_in
 archive is e.g. 2019-35, assuming /beegfs/common_crawl/CC-MAIN-2019-35 has sub-directories for
   cdx/warc
   [all segments, all and only those paths matching segment-prefix*.{0..99}]
 idx_in is an alphabetically ordered index fragment (one of cdx/warc/...gz), relative to archive/cdx/warc

 Each index line is appended to STEM/SEGMENT/orig/cdx/TYPE/cdx, relative to the
 current directory, where STEM is idx_in up to its first '.', SEGMENT is the
 segment named in the line's "filename" field and TYPE is one of
 warc, robotstxt or crawldiagnostics.
 Progress and a final summary are written to stderr.
'''
from os import listdir, makedirs, lseek, SEEK_END, SEEK_SET, read, write, fsync, system
from datetime import datetime
from random import sample
from lock import AtomicOpen
import sys,re,gzip

# Show the usage text rather than an IndexError traceback when called wrongly
if len(sys.argv)!=4:
    sys.stderr.write(__doc__)
    sys.exit(2)

# When False (the only mode that works, see the note in the copy branch
# below) the per-segment files are just closed and left in the local working
# directory, to be copied to /beegfs at shell level
COPY_TO_BEEGFS=False

archive="CC-MAIN-%s"%sys.argv[1]
adir="/beegfs/common_crawl/%s"%archive
apref="crawl-data/%s"%archive
pref=sys.argv[2]

afn=sys.argv[3]
# Local working directory: the index fragment's name up to its first '.'
ifn=afn.split('.')[0]

# Segment directory names: the prefix, optional further digits, '.', 1-2 digits
SPAT=re.compile(r"%s[0-9]*\.[0-9]{1,2}$"%pref)
# Group 1 is the segment name, group 2 the record type sub-directory
IPAT=re.compile('"filename": "%s/segments/([0-9.]*)/([a-z]*)/'%apref)

segdirs=[d for d in listdir(adir) if SPAT.match(d)]
rr=("warc","robotstxt","crawldiagnostics")
ss={}   # record type -> {segment name -> open output file}
n={}    # record type -> number of index lines written
for r in rr:
    ss[r]=rd=dict()
    n[r]=0
    for s in segdirs:
        rdir="%s/%s/orig/cdx/%s"%(ifn,s,r)
        # No exist_ok: a left-over working directory makes this raise
        # FileExistsError instead of silently overwriting earlier output
        makedirs(rdir,0o755)
        rd[s]=open("%s/cdx"%rdir,'w+')

idir="%s/cdx/warc"%adir

e=0     # number of index lines that did not match IPAT

st=datetime.now()
print(st,"starting",afn,file=sys.stderr)

with gzip.open("%s/%s"%(idir,afn),'rt') as f:
    for l in f:
        m=IPAT.search(l)
        if m:
            r=m[2]
            # NOTE(review): raises KeyError if the line names a record type
            # not in rr or a segment not in segdirs -- confirm that every
            # index fragment only mentions segments matching segment-prefix
            ss[r][m[1]].write(l)
            n[r]+=1
        else:
            # Was sys.stderr.write("bogus: ",afn,l), which raises TypeError
            # (write takes a single string) on the first bad line
            print("bogus:",afn,l.rstrip("\n"),file=sys.stderr)
            e+=1

if not COPY_TO_BEEGFS:
    # See note below, will have to copy entire result to /beegfs at shell level
    for outs in ss.values():
        for s in outs.values():
            s.close()
else:
    # The following fails, in that there are occasional small gaps in the result
    # I've given up trying to figure out why...
    # Randomise to try to avoid contention
    mt=datetime.now()
    print(mt,"copying",ifn,"%s ok, %d bogus, %d seconds so far"%(':'.join(map(str,n.values())),
                                                               e,(mt-st).total_seconds()),file=sys.stderr)

    # Visit every segment once, in random order (a fixed sample size of 100
    # would raise ValueError for any other number of segments)
    for s in sample(segdirs,len(segdirs)):
        for r in rr:
            of=ss[r][s]
            # Push buffered text to the OS, then rewind the raw descriptor
            of.flush()
            o=of.fileno()
            fsync(o)
            opos=lseek(o,0,SEEK_SET)
            with AtomicOpen("%s/%s/orig/cdx/%s/cdx"%(adir,s,r),"rb+") as df:
                d=df.fileno()
                # Append after whatever is already in the destination file
                dpos=lseek(d,0,SEEK_END)
                print(of.name,opos,df.name,dpos,file=sys.stderr)
                while True:
                    data = read(o,131072)
                    if data == b'': # end of file reached
                        break
                    write(d,data)
            of.close()

    res=0 #system("rm -r %s"%ifn)

et=datetime.now()
# total_seconds() rather than .seconds, which silently drops whole days
print(et,"finished",ifn,"%s ok, %d bogus, %d seconds total"%(':'.join(map(str,n.values())),
                                                           e,(et-st).total_seconds()),file=sys.stderr)