comparison bin/getcc_multi.aws @ 161:d0dbfefd6fc0

forget parallel, just do (default 2) parallel single threads
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 25 Oct 2023 23:01:59 +0100
parents 5d1c3359e210
children 348f4a31228f
comparison
equal deleted inserted replaced
160:afd2ece1fb22 161:d0dbfefd6fc0
1 # courtesy wwaites, yossi 1 # courtesy wwaites
2 # Usage: getcc_multi.aws [archive, e.g. CC-MAIN-2019-35] [-p nthreads] [file listing segment numbers] 2 # Usage: getcc.aws <archive, e.g. CC-MAIN-2019-35> <segment ID, e.g. 68> [nthreads]
3 # Single segment, multiple threads
3 ARCHIVE="$1" 4 ARCHIVE="$1"
4 shift 5 SEG=$2
5 if [ "$1" = -p ] 6 nthreads=${3:-2}
6 then
7 shift
8 nthreads=$1
9 shift
10 else
11 nthreads=2
12 fi
13 SEGS="${1-all_segments}"
14 7
15 wf=warc.paths 8 wf=warc.paths
16 9
17 WARCS=https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz 10 WARCS=https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz
18 11
20 cd /beegfs/common_crawl/${ARCHIVE} 13 cd /beegfs/common_crawl/${ARCHIVE}
21 14
22 if [ ! -f $wf ] 15 if [ ! -f $wf ]
23 then 16 then
24 curl --retry 4 -s ${WARCS} | gzip -dc > $wf 17 curl --retry 4 -s ${WARCS} | gzip -dc > $wf
25 fi
26
27 if [ ! -f $SEGS ]
28 then
29 cut -f 4 -d / $wf |uniq > all_segments
30 SEGS=all_segments
31 fi
32
33 if [ ! -s "$(ls segment_* | head -1)" ]
34 then
35 n=$(cat $SEGS | wc -l)
36 m=$((n / 8))
37 split -n l/$m $SEGS segment_
38 fi 18 fi
39 19
40 #export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA" 20 #export AWS_ACCESS_KEY_ID="AKIAIKBLNO2XNVMWM5JA"
41 #export AWS_SECRET_ACCESS_KEY="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" 21 #export AWS_SECRET_ACCESS_KEY="WaH4SAhsrqqJ/GLo/jkw+u9ER1ny05e1W45sSgYu"
42 #export PASSPHRASE="annebooththompson" 22 #export PASSPHRASE="annebooththompson"
53 # s3 = 33 # s3 =
54 # multipart_threshold = 4GB 34 # multipart_threshold = 4GB
55 # max_concurrent_requests = 1 35 # max_concurrent_requests = 1
56 # multipart_chunksize = 32MB 36 # multipart_chunksize = 32MB
57 37
58 echo args $ARCHIVE $nthreads 1>&2 38 s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1)
59 for sf in segment_* 39 mkdir -p $s/orig/warc
40 fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz) <(fgrep -w $s warc.paths) > /tmp/hst/$s
41
42 split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_
43
44 seq 1 $nthreads | while read i
60 do 45 do
61 echo sf $sf 46 cat /tmp/hst/${s}_$i | while read f
62 for s in $(cat $sf) 47 do
63 do 48 g=$s/orig/warc/${f##*/}
64 echo s $s 49 if [ ! -f "$g" ]
65 mkdir -p $s 50 then
66 fgrep -w $s $wf | tee >( echo nf $(wc -l) 1>&2 ) |\ 51 aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_${SEG}_$i )
67 parallel --will-cite -j $nthreads \ 52 fi
68 "echo '{#}' 1>&2 53 done &
69 f='{}'
70 g=$s/orig/warc/\${f##*/}
71 echo \|\$f\|\$g\| 1>&2
72 if [ ! -f \$g ]
73 then
74 aws s3 cp s3://commoncrawl/\$f \$g --only-show-errors 2> >( { echo \$(date +%D:%T) \$g ; cat ; } >>m_errlog )
75 fi
76 "
77 done
78 done 54 done