Mercurial > hg > cc > cirrus_work
annotate bin/getcc_multi.aws @ 161:d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 25 Oct 2023 23:01:59 +0100 |
parents | 5d1c3359e210 |
children | 348f4a31228f |
rev | line source |
---|---|
161
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
1 # courtesy wwaites |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
2 # Usage: getcc_multi.aws <archive, e.g. CC-MAIN-2019-35> <segment ID, e.g. 68> [nthreads, default 2] |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
3 # Fetches the WARC files of a single segment using nthreads parallel single-threaded download loops |
158 | 4 ARCHIVE="$1" |
161
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
5 SEG=$2 |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
6 nthreads=${3:-2} |
158 | 7 |
8 wf=warc.paths | |
9 | |
10 WARCS=https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz | |
11 | |
12 mkdir -p /beegfs/common_crawl/${ARCHIVE} | |
13 cd /beegfs/common_crawl/${ARCHIVE} | |
14 | |
15 if [ ! -f $wf ] | |
16 then | |
17 curl --retry 4 -s ${WARCS} | gzip -dc > $wf | |
18 fi | |
19 | |
20 #export AWS_ACCESS_KEY_ID="<your-access-key-id>" | |
21 #export AWS_SECRET_ACCESS_KEY="<your-secret-access-key>" | |
22 #export PASSPHRASE="<your-passphrase>" | |
23 #export AWS_RETRY_MODE=adaptive | |
24 #export AWS_MAX_ATTEMPTS=100 | |
25 # Set these in ~/.aws/credentials as follows | |
26 # [hst] | |
27 # aws_access_key_id = <your-access-key-id> | |
28 # aws_secret_access_key = <your-secret-access-key> | |
29 # And these in ~/.aws/config as follows | |
30 # [profile hst] | |
31 # retry_mode = adaptive | |
32 # max_attempts = 100 | |
33 # s3 = | |
34 # multipart_threshold = 4GB | |
35 # max_concurrent_requests = 1 | |
36 # multipart_chunksize = 32MB | |
37 | |
161
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
38 s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1) |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
39 mkdir -p $s/orig/warc |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
40 fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz) <(fgrep -w $s warc.paths) > /tmp/hst/$s |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
41 |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
42 split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_ |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
43 |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
44 seq 1 $nthreads | while read i |
158 | 45 do |
161
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
46 cat /tmp/hst/${s}_$i | while read f |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
47 do |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
48 g=$s/orig/warc/${f##*/} |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
49 if [ ! -f "$g" ] |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
50 then |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
51 aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_${SEG}_$i ) |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
52 fi |
d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
158
diff
changeset
|
53 done & |
158 | 54 done |