#!/usr/bin/env bash
# courtesy wwaites, yossi
#
# Download the WARC files of one Common Crawl archive, segment by segment.
#
# Usage: getcc_multi.aws [archive, e.g. CC-MAIN-2019-35] [-p nthreads] [file listing segment numbers]
#
#   archive   crawl identifier; used in the warc.paths URL and as the name of
#             the working directory under /beegfs/common_crawl
#   -p n      value passed to `parallel -j` (default: 2)
#   file      list of segment ids; it is looked up AFTER changing into the
#             working directory.  When it is not found there, the complete
#             list is derived from warc.paths and stored as all_segments.
#
# Files created in the working directory:
#   warc.paths     uncompressed path listing for the archive
#   all_segments   every segment id found in warc.paths (only when needed)
#   segment_*      the segment list split into chunks
#   <segment>/orig/warc/<file>   the downloaded WARC files
#   m_errlog       a timestamp + target line per attempted download, followed
#                  by whatever `aws s3 cp` wrote to stderr
#
# `set -e` is intentionally NOT enabled: GNU parallel exits non-zero when any
# job fails, and one failed download must not stop the remaining segments.
# pipefail is enabled so that a failed curl is noticed in the download pipeline.
set -o pipefail

# Print a message on stderr and terminate with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Download every WARC file of one segment that is not yet present locally.
# Globals:   wf (read), nthreads (read)
# Arguments: $1 - segment id as it appears in the paths listed in $wf
# Outputs:   "s <id>" on stdout; job numbers, the number of matching paths
#            ("nf <count>") and |source|target| pairs on stderr
fetch_segment() {
  local s=$1
  echo s "$s"
  mkdir -p -- "$s"
  # The command text below is double-quoted on purpose: $s is expanded here,
  # while everything written as \$ is expanded later by the shell that
  # parallel starts for each job.  {} is the input line (a path from $wf) and
  # {#} is parallel's job sequence number.
  # $(wc -l) is left unquoted so that any padding in wc's output is dropped.
  grep -F -w -- "$s" "$wf" | tee >( echo nf $(wc -l) 1>&2 ) |
    parallel --will-cite -j "$nthreads" \
      "echo '{#}' 1>&2
      f='{}'
      g=$s/orig/warc/\${f##*/}
      echo \|\$f\|\$g\| 1>&2
      if [ ! -f \"\$g\" ]
      then
        aws s3 cp \"s3://commoncrawl/\$f\" \"\$g\" --only-show-errors 2> >( { echo \$(date +%D:%T) \$g ; cat ; } >>m_errlog )
      fi
      "
}

ARCHIVE="${1:-}"
[[ -n $ARCHIVE ]] ||
  die "Usage: ${0##*/} archive [-p nthreads] [file listing segment numbers]"
shift
if [[ "${1:-}" = -p ]]; then
  shift
  nthreads="${1:-}"
  [[ -n $nthreads ]] || die "-p requires a value"
  shift
else
  nthreads=2
fi
SEGS="${1-all_segments}"

wf=warc.paths

WARCS=https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz

workdir=/beegfs/common_crawl/${ARCHIVE}
mkdir -p -- "$workdir" || die "cannot create $workdir"
cd -- "$workdir" || die "cannot change into $workdir"

# Fetch the path listing once.  It is written under a temporary name and only
# renamed on success, so an interrupted or failed download is retried on the
# next run instead of being mistaken for a complete listing.
if [[ ! -s $wf ]]; then
  tmp="${wf}.tmp.$$"
  if curl --retry 4 -f -sS "$WARCS" | gzip -dc > "$tmp"; then
    mv -- "$tmp" "$wf" || die "cannot install $wf in $workdir"
  else
    rm -f -- "$tmp"
    die "failed to download $WARCS"
  fi
fi

# Without a usable segment list, take every segment named in the listing:
# the 4th '/'-separated field of each path.
if [[ ! -f $SEGS ]]; then
  cut -f 4 -d / "$wf" | uniq > all_segments
  SEGS=all_segments
fi

# Split the segment list into chunk files unless a previous run already left a
# non-empty first chunk behind.
shopt -s nullglob
segment_files=(segment_*)
shopt -u nullglob
if (( ${#segment_files[@]} == 0 )) || [[ ! -s ${segment_files[0]} ]]; then
  [[ -s $SEGS ]] || die "segment list $SEGS is missing or empty"
  n=$(wc -l < "$SEGS")
  m=$(( n / 8 ))
  # Fewer than 8 segments would ask split for 0 chunks, which it rejects.
  (( m > 0 )) || m=1
  split -n "l/$m" "$SEGS" segment_ || die "cannot split $SEGS"
fi

# NOTE(review): real key material used to be embedded in the comments below.
# It has been replaced by placeholders; the old keys should be rotated.
#export AWS_ACCESS_KEY_ID="<your-access-key-id>"
#export AWS_SECRET_ACCESS_KEY="<your-secret-access-key>"
#export PASSPHRASE="<your-passphrase>"
#export AWS_RETRY_MODE=adaptive
#export AWS_MAX_ATTEMPTS=100
# Set these in ~/.aws/credentials as follows
# [hst]
# aws_access_key_id = <your-access-key-id>
# aws_secret_access_key = <your-secret-access-key>
# And these in ~/.aws/config as follows
# [profile hst]
# retry_mode = adaptive
# max_attempts = 100
# s3 =
#   multipart_threshold = 4GB
#   max_concurrent_requests = 1
#   multipart_chunksize = 32MB
# NOTE(review): nothing in this script selects the hst profile; presumably the
# caller exports AWS_PROFILE=hst -- confirm.

echo args "$ARCHIVE" "$nthreads" 1>&2
for sf in segment_*; do
  [[ -f $sf ]] || continue
  echo sf "$sf"
  # Ids are whitespace-separated tokens.  They are read on descriptor 3 so the
  # commands inside the loop cannot consume the list through stdin.
  while IFS= read -r s <&3 || [[ -n $s ]]; do
    [[ -n $s ]] || continue
    fetch_segment "$s"
  done 3< <(tr -s '[:space:]' '\n' < "$sf")
done