Mercurial > hg > cc > cirrus_work
comparison bin/getcc_multi.aws @ 161:d0dbfefd6fc0
forget parallel, just do (default 2) parallel single threads
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 25 Oct 2023 23:01:59 +0100 |
parents | 5d1c3359e210 |
children | 348f4a31228f |
comparison legend: equal | deleted | inserted | replaced
160:afd2ece1fb22 | 161:d0dbfefd6fc0 |
---|---|
1 # courtesy wwaites, yossi | 1 # courtesy wwaites |
2 # Usage: getcc_multi.aws [archive, e.g. CC-MAIN-2019-35] [-p nthreads] [file listing segment numbers] | 2 # Usage: getcc.aws <archive, e.g. CC-MAIN-2019-35> <segment ID, e.g. 68> [nthreads] |
3 # Single segment, multiple threads | |
3 ARCHIVE="$1" | 4 ARCHIVE="$1" |
4 shift | 5 SEG=$2 |
5 if [ "$1" = -p ] | 6 nthreads=${3:-2} |
6 then | |
7 shift | |
8 nthreads=$1 | |
9 shift | |
10 else | |
11 nthreads=2 | |
12 fi | |
13 SEGS="${1-all_segments}" | |
14 | 7 |
15 wf=warc.paths | 8 wf=warc.paths |
16 | 9 |
17 WARCS=https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz | 10 WARCS=https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz |
18 | 11 |
20 cd /beegfs/common_crawl/${ARCHIVE} | 13 cd /beegfs/common_crawl/${ARCHIVE} |
21 | 14 |
22 if [ ! -f $wf ] | 15 if [ ! -f $wf ] |
23 then | 16 then |
24 curl --retry 4 -s ${WARCS} | gzip -dc > $wf | 17 curl --retry 4 -s ${WARCS} | gzip -dc > $wf |
25 fi | |
26 | |
27 if [ ! -f $SEGS ] | |
28 then | |
29 cut -f 4 -d / $wf |uniq > all_segments | |
30 SEGS=all_segments | |
31 fi | |
32 | |
33 if [ ! -s "$(ls segment_* | head -1)" ] | |
34 then | |
35 n=$(cat $SEGS | wc -l) | |
36 m=$((n / 8)) | |
37 split -n l/$m $SEGS segment_ | |
38 fi | 18 fi |
39 | 19 |
40 #export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA" | 20 #export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA" |
41 #export AWS_SECRET_ACCESS_KEY="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" | 21 #export AWS_SECRET_ACCESS_KEY="WaH4SAhsrqqJ/GLo/jkw+u9ER1ny05e1W45sSgYu" |
42 #export PASSPHRASE="annebooththompson" | 22 #export PASSPHRASE="annebooththompson" |
53 # s3 = | 33 # s3 = |
54 # multipart_threshold = 4GB | 34 # multipart_threshold = 4GB |
55 # max_concurrent_requests = 1 | 35 # max_concurrent_requests = 1 |
56 # multipart_chunksize = 32MB | 36 # multipart_chunksize = 32MB |
57 | 37 |
58 echo args $ARCHIVE $nthreads 1>&2 | 38 s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1) |
59 for sf in segment_* | 39 mkdir -p $s/orig/warc |
40 fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz) <(fgrep -w $s warc.paths) > /tmp/hst/$s | |
41 | |
42 split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_ | |
43 | |
44 seq 1 $nthreads | while read i | |
60 do | 45 do |
61 echo sf $sf | 46 cat /tmp/hst/${s}_$i | while read f |
62 for s in $(cat $sf) | 47 do |
63 do | 48 g=$s/orig/warc/${f##*/} |
64 echo s $s | 49 if [ ! -f "$g" ] |
65 mkdir -p $s | 50 then |
66 fgrep -w $s $wf | tee >( echo nf $(wc -l) 1>&2 ) |\ | 51 aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_${SEG}_$i ) |
67 parallel --will-cite -j $nthreads \ | 52 fi |
68 "echo '{#}' 1>&2 | 53 done & |
69 f='{}' | |
70 g=$s/orig/warc/\${f##*/} | |
71 echo \|\$f\|\$g\| 1>&2 | |
72 if [ ! -f \$g ] | |
73 then | |
74 aws s3 cp s3://commoncrawl/\$f \$g --only-show-errors 2> >( { echo \$(date +%D:%T) \$g ; cat ; } >>m_errlog ) | |
75 fi | |
76 " | |
77 done | |
78 done | 54 done |