annotate bin/getcc_multi.aws @ 158:5d1c3359e210

resurrect parallel fetch
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 24 Oct 2023 16:58:44 +0100
parents
children d0dbfefd6fc0
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
158
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 # courtesy wwaites, yossi
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 # Usage: getcc_multi.aws [archive, e.g. CC-MAIN-2019-35] [-p nthreads] [file listing segment numbers]
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 ARCHIVE="$1"
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 shift
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 if [ "$1" = -p ]
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 then
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 shift
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 nthreads=$1
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9 shift
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10 else
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 nthreads=2
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12 fi
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
13 SEGS="${1-all_segments}"
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
14
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
15 wf=warc.paths
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
16
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
17 WARCS=https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
18
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
19 mkdir -p /beegfs/common_crawl/${ARCHIVE}
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
20 cd "/beegfs/common_crawl/${ARCHIVE}" || exit 1  # abort rather than fetch/split in the caller's directory
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
21
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
22 if [ ! -s "$wf" ]  # -s, not -f: re-fetch when an earlier failed download left an empty file
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
23 then
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
24 curl --retry 4 -s ${WARCS} | gzip -dc > $wf
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
25 fi
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
26
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
27 if [ ! -f $SEGS ]
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
28 then
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
29 cut -f 4 -d / $wf |uniq > all_segments
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
30 SEGS=all_segments
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
31 fi
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
32
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
33 if [ ! -s "$(ls segment_* | head -1)" ]
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
34 then
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
35 n=$(cat $SEGS | wc -l)
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
36 m=$(( n < 8 ? 1 : n / 8 ))  # chunk count for split; clamp to 1 because split rejects l/0 when $SEGS has fewer than 8 lines
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
37 split -n l/$m $SEGS segment_
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
38 fi
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
39
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
40 #export AWS_ACCESS_KEY_ID="<your-access-key-id>"
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
41 #export AWS_SECRET_ACCESS_KEY="<your-secret-access-key>"
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
42 #export PASSPHRASE="<your-passphrase>"
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
43 #export AWS_RETRY_MODE=adaptive
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
44 #export AWS_MAX_ATTEMPTS=100
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
45 # Set these in ~/.aws/credentials as follows
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
46 # [hst]
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
47 # aws_access_key_id = <your-access-key-id>
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
48 # aws_secret_access_key = <your-secret-access-key>
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
49 # And these in ~/.aws/config as follows
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
50 # [profile hst]
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
51 # retry_mode = adaptive
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
52 # max_attempts = 100
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
53 # s3 =
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
54 # multipart_threshold = 4GB
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
55 # max_concurrent_requests = 1
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
56 # multipart_chunksize = 32MB
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
57
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
58 echo args $ARCHIVE $nthreads 1>&2
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
59 for sf in segment_*
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
60 do
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
61 echo sf $sf
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
62 for s in $(cat $sf)
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
63 do
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
64 echo s $s
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
65 mkdir -p $s
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
66 grep -F -w -- "$s" "$wf" | tee >( echo nf $(wc -l) 1>&2 ) | # grep -F replaces obsolescent fgrep; the pipeline continues on the next line
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
67 parallel --will-cite -j $nthreads \
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
68 "echo '{#}' 1>&2
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
69 f='{}'
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
70 g=$s/orig/warc/\${f##*/}
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
71 echo \|\$f\|\$g\| 1>&2
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
72 if [ ! -f \$g ]
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
73 then
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
74 aws s3 cp s3://commoncrawl/\$f \$g --only-show-errors 2> >( { echo \$(date +%D:%T) \$g ; cat ; } >>m_errlog )
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
75 fi
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
76 "
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
77 done
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
78 done