annotate bin/getcc_multi.aws @ 161:d0dbfefd6fc0

forget parallel, just do (default 2) parallel single threads
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 25 Oct 2023 23:01:59 +0100
parents 5d1c3359e210
children 348f4a31228f
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
161
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
1 # courtesy wwaites
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
2 # Usage: getcc_multi.aws <archive, e.g. CC-MAIN-2019-35> <segment ID, e.g. 68> [nthreads, default 2]
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
3 # Single segment, fetched by nthreads parallel background loops (each loop downloads its share of files one at a time)
158
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 ARCHIVE="$1"
161
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
5 SEG=$2
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
6 nthreads=${3:-2}
158
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 wf=warc.paths
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10 WARCS=https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12 mkdir -p /beegfs/common_crawl/${ARCHIVE}
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
13 cd /beegfs/common_crawl/${ARCHIVE}
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
14
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
15 if [ ! -f $wf ]
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
16 then
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
17 curl --retry 4 -s ${WARCS} | gzip -dc > $wf
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
18 fi
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
19
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
20 #export AWS_ACCESS_KEY_ID="<your-access-key-id>"
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
21 #export AWS_SECRET_ACCESS_KEY="<your-secret-access-key>"
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
22 #export PASSPHRASE="<your-passphrase>"
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
23 #export AWS_RETRY_MODE=adaptive
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
24 #export AWS_MAX_ATTEMPTS=100
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
25 # Set these in ~/.aws/credentials as follows
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
26 # [hst]
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
27 # aws_access_key_id = <your-access-key-id>
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
28 # aws_secret_access_key = <your-secret-access-key>
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
29 # And these in ~/.aws/config as follows
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
30 # [profile hst]
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
31 # retry_mode = adaptive
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
32 # max_attempts = 100
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
33 # s3 =
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
34 # multipart_threshold = 4GB
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
35 # max_concurrent_requests = 1
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
36 # multipart_chunksize = 32MB
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
37
161
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
38 s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1)
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
39 mkdir -p $s/orig/warc
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
40 fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz) <(fgrep -w $s warc.paths) > /tmp/hst/$s
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
41
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
42 split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
43
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
44 seq 1 $nthreads | while read i
158
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
45 do
161
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
46 cat /tmp/hst/${s}_$i | while read f
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
47 do
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
48 g=$s/orig/warc/${f##*/}
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
49 if [ ! -f "$g" ]
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
50 then
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
51 aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_${SEG}_$i )
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
52 fi
d0dbfefd6fc0 forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 158
diff changeset
53 done &
158
5d1c3359e210 resurrect parallel fetch
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
54 done