Mercurial > hg > cc > cirrus_work
comparison bin/getidx.aws @ 155:56825fc8459d
moved from /beegfs/common-crawl to get under .hg
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 11 Oct 2023 12:51:06 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
154:5d30cd8c6254 | 155:56825fc8459d |
---|---|
#!/usr/bin/env bash
# courtesy wwaites
#
# Copy every file listed in a Common Crawl archive's cc-index.paths from
# s3://commoncrawl into /beegfs/common_crawl/ARCHIVE/cdx/warc.
#
# Usage: getidx.aws ARCHIVE [PAUSE]
#   ARCHIVE  crawl identifier; substituted into the cc-index.paths.gz URL and
#            into the name of the target directory
#   PAUSE    optional; handed to sleep(1) after each individual copy
#
# Credentials: no keys are stored in this file.  aws is left to locate
# credentials through its own configuration (for example AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY already exported by the caller).
# NOTE(review): the key pair and passphrase that used to be hard-coded here
# were committed to version control; treat them as compromised and rotate
# them.  The PASSPHRASE export was dropped as well: nothing in this script
# reads it.

# -u: catch misspelled variables; pipefail: notice a failing curl even though
# gzip is the last stage of the download pipeline.
set -uo pipefail

if (( $# < 1 )) || [[ -z "$1" ]]; then
  printf 'Usage: %s ARCHIVE [PAUSE]\n' "${0##*/}" >&2
  exit 2
fi

ARCHIVE=$1

# Optional suffix appended to each parallel job so it sleeps after its copy.
pause=""
if [[ -n "${2:-}" ]]; then
  pause="; sleep $2"
fi

paths_file=cc-index.paths
index_url="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/cc-index.paths.gz"
archive_dir="/beegfs/common_crawl/${ARCHIVE}"

# Abort rather than continue in the wrong directory: everything below uses
# relative paths.
mkdir -p -- "$archive_dir" || exit
cd -- "$archive_dir" || exit

# Fetch the path list once.  It is written under a temporary name and only
# renamed on success, so an interrupted or failed download cannot leave an
# empty or truncated list that later runs would mistake for a complete one.
if [[ ! -f "$paths_file" ]]; then
  partial="${paths_file}.part.$$"
  if curl --retry 4 -sf "$index_url" | gzip -dc > "$partial"; then
    mv -- "$partial" "$paths_file" || exit
  else
    rm -f -- "$partial"
    printf '%s: could not fetch %s\n' "${0##*/}" "$index_url" >&2
    exit 1
  fi
fi

# Disabled experiment kept from the original (SEGS is not set anywhere here):
# n=$(cat $SEGS | wc -l)
# m=$((n / 8))
# split -n l/$m $SEGS segment_

mkdir -p cdx/warc || exit
cd cdx/warc || exit

# Five concurrent copies, one per line of the path list.  Each job stores the
# object under its basename in the current directory.  The job's stderr is
# appended to ierrlog behind a "date path" header line; that header is written
# for every job, whether or not aws printed anything.  The job text relies on
# process substitution, so parallel has to run it with bash.
parallel --will-cite -j 5 \
  "f='{}' && aws s3 cp s3://commoncrawl/\$f \${f##*/} --only-show-errors 2> >( { echo \$(date +%D:%T) '{}' ; cat ; } >>ierrlog) $pause" \
  < "../../$paths_file"