Mercurial > hg > cc > cirrus_work
comparison bin/getcc.aws @ 155:56825fc8459d
moved from /beegfs/common-crawl to get under .hg
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 11 Oct 2023 12:51:06 +0100 |
parents | |
children | 463fc7b09119 |
comparison
equal
deleted
inserted
replaced
154:5d30cd8c6254 | 155:56825fc8459d |
---|---|
# courtesy wwaites
# Usage: getcc.aws ARCHIVE [SLEEP_SECONDS [SEGMENT_LIST]]
#   ARCHIVE        crawl id, e.g. CC-MAIN-2019-35
#   SLEEP_SECONDS  optional pause run after each download job (throttle)
#   SEGMENT_LIST   optional file listing segment numbers, one per line
#                  (default: all_segments, derived from warc.paths); a
#                  relative name is resolved inside the archive directory
#
# Downloads the WARC files of the selected segments of a Common Crawl
# archive into /beegfs/common_crawl/ARCHIVE/cdx/warc/SEGMENT/.
#
# Requires bash (arrays), GNU parallel and the aws CLI.
#
# Credentials are deliberately NOT stored in this script.  Supply
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY through the environment or the
# usual aws configuration files.  If some downstream tool relied on the
# PASSPHRASE variable this script used to export, export it in the caller's
# environment as well.  NOTE(review): any keys that were ever committed in
# an earlier revision should be treated as leaked and rotated.

ARCHIVE="$1"
if [ -z "$ARCHIVE" ]; then
  echo "Usage: ${0##*/} ARCHIVE [SLEEP_SECONDS [SEGMENT_LIST]]" >&2
  exit 2
fi
shift

# Optional second argument: command suffix appended to every download job.
throttle=
if [ "$1" ]; then
  throttle="; sleep $1"
  shift
fi
SEGS="${1-all_segments}"

wf=warc.paths

WARCS="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz"

base="/beegfs/common_crawl/${ARCHIVE}"
mkdir -p "$base" && cd "$base" || exit 1

# Fetch the list of WARC paths once.  Download to a temporary name so that a
# failed or empty transfer is never mistaken for a valid cached copy.
if [ ! -s "$wf" ]; then
  if curl --retry 4 -sf "$WARCS" | gzip -dc > "$wf.part" && [ -s "$wf.part" ]; then
    mv "$wf.part" "$wf"
  else
    rm -f "$wf.part"
    echo "${0##*/}: could not fetch $WARCS" >&2
    exit 1
  fi
fi

# The 4th '/'-separated field of each path is the segment number.
if [ ! -f all_segments ]; then
  cut -f 4 -d / "$wf" | uniq > all_segments
fi

# Split the segment list into chunk files segment_aa, segment_ab, ... unless
# a non-empty chunk already exists.  The chunk count is (lines / 8), clamped
# to at least 1 because split rejects a count of 0.
seg_files=(segment_*)
if [ ! -s "${seg_files[0]}" ]; then
  n=$(wc -l < "$SEGS") || exit 1
  m=$((n / 8))
  [ "$m" -ge 1 ] || m=1
  split -n "l/$m" "$SEGS" segment_ || exit 1
fi

mkdir -p cdx/warc && cd cdx/warc || exit 1

# The chunk files and warc.paths live in $base, not in cdx/warc, so they are
# addressed through $base from here on.
for sf in "$base"/segment_*; do
  while IFS= read -r s; do
    [ -n "$s" ] || continue
    mkdir -p "$s"
    # Emit one TAB-separated record per WARC file of this segment:
    #   {1} segment   {2} file name   {3} full path inside the bucket
    # Each job's stderr is appended to errlog behind a timestamp + path
    # header; the header is written for every job, not only failing ones.
    # The process substitution needs parallel to run its jobs under bash.
    grep -Fw -- "$s" "$base/$wf" |
      while IFS= read -r c; do
        printf '%s\t%s\t%s\n' "$s" "${c##*/}" "$c"
      done |
      parallel --colsep '\t' --will-cite -j 8 \
        "aws s3 cp --only-show-errors s3://commoncrawl/'{3}' '{1}/{2}' 2> >( { echo \$(date +%D:%T) '{3}' ; cat ; } >>errlog)${throttle}"
  done < "$sf"
done