Mercurial > hg > cc > cirrus_work
comparison bin/getcc_multi.aws @ 158:5d1c3359e210
resurrect parallel fetch
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 24 Oct 2023 16:58:44 +0100 |
parents | |
children | d0dbfefd6fc0 |
comparison
equal
deleted
inserted
replaced
157:463fc7b09119 | 158:5d1c3359e210 |
---|---|
# courtesy wwaites, yossi
# Usage: getcc_multi.aws [archive, e.g. CC-MAIN-2019-35] [-p nthreads] [file listing segment numbers]
#
# Fetch the WARC files of one Common Crawl archive into
# /beegfs/common_crawl/<archive>/<segment>/orig/warc/, running up to
# nthreads (default 2) 'aws s3 cp' jobs at a time via GNU parallel.
# Files that already exist locally are not fetched again.
# Needs bash: process substitution is used below.

usage="Usage: getcc_multi.aws [archive, e.g. CC-MAIN-2019-35] [-p nthreads] [file listing segment numbers]"

if [ $# -lt 1 ]
then
  echo "$usage" 1>&2
  exit 2
fi
ARCHIVE="$1"
shift
if [ "$1" = -p ]
then
  # -p must be followed by a value
  if [ $# -lt 2 ]
  then
    echo "$usage" 1>&2
    exit 2
  fi
  shift
  nthreads=$1
  shift
else
  nthreads=2
fi
SEGS="${1-all_segments}"

wf=warc.paths

WARCS="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz"

archive_dir="/beegfs/common_crawl/${ARCHIVE}"
# Everything below uses relative paths, so give up if we cannot get there
if ! mkdir -p "$archive_dir" || ! cd "$archive_dir"
then
  echo "getcc_multi.aws: cannot create or enter $archive_dir" 1>&2
  exit 1
fi

if [ ! -f "$wf" ]
then
  # Download under a temporary name: a failed or partial download must not
  # leave behind a $wf that a later run would take for the complete listing.
  if curl --retry 4 -s -f "$WARCS" | gzip -dc > "$wf.tmp" && [ -s "$wf.tmp" ]
  then
    mv "$wf.tmp" "$wf"
  else
    rm -f "$wf.tmp"
    echo "getcc_multi.aws: failed to fetch $WARCS" 1>&2
    exit 1
  fi
fi

if [ ! -f "$SEGS" ]
then
  # No usable segment list supplied: take every segment named in $wf
  # (4th /-separated field of each path)
  cut -f 4 -d / "$wf" | uniq > all_segments
  SEGS=all_segments
fi

# (Re)build the segment_* chunk files unless the first one already exists
# and is non-empty.
first_chunk=
for sf in segment_*
do
  if [ -e "$sf" ]
  then
    first_chunk=$sf
  fi
  break
done
if [ ! -s "$first_chunk" ]
then
  n=$(wc -l < "$SEGS")
  m=$((n / 8))
  # split rejects a chunk count of 0, which n / 8 gives for lists shorter than 8
  if [ "$m" -lt 1 ]
  then
    m=1
  fi
  split -n "l/$m" "$SEGS" segment_
fi

# AWS credentials must never be written into this script.
#export AWS_RETRY_MODE=adaptive
#export AWS_MAX_ATTEMPTS=100
# Set the credentials in ~/.aws/credentials as follows
#  [hst]
#  aws_access_key_id = <your access key id>
#  aws_secret_access_key = <your secret access key>
# And these in ~/.aws/config as follows
#  [profile hst]
#  retry_mode = adaptive
#  max_attempts = 100
#  s3 =
#    multipart_threshold = 4GB
#    max_concurrent_requests = 1
#    multipart_chunksize = 32MB
# NOTE(review): the aws call below passes no --profile; presumably
# AWS_PROFILE=hst is set in the environment -- confirm.

echo args "$ARCHIVE" "$nthreads" 1>&2
for sf in segment_*
do
  echo sf "$sf"
  # Read segment ids on fd 3 so nothing in the loop body can eat them via stdin
  while IFS= read -r s <&3 || [ -n "$s" ]
  do
    # Blank lines carry no segment id
    if [ -z "$s" ]
    then
      continue
    fi
    echo s "$s"
    mkdir -p "$s"
    # Select this segment's WARC paths, report how many there are (nf) on
    # stderr, and hand them to parallel.  In the job string, unescaped $s is
    # expanded here; \$-escaped expansions are evaluated by each job's shell.
    # Every job appends a timestamp line, followed by any aws error output,
    # to m_errlog.
    grep -F -w -- "$s" "$wf" | tee >( echo nf $(wc -l) 1>&2 ) |\
      parallel --will-cite -j "$nthreads" \
        "echo '{#}' 1>&2
         f='{}'
         g=$s/orig/warc/\${f##*/}
         echo \|\$f\|\$g\| 1>&2
         if [ ! -f \$g ]
         then
           aws s3 cp s3://commoncrawl/\$f \$g --only-show-errors 2> >( { echo \$(date +%D:%T) \$g ; cat ; } >>m_errlog )
         fi
        "
  done 3< "$sf"
done