comparison bin/getcc.aws @ 155:56825fc8459d

moved from /beegfs/common-crawl to get under .hg
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 11 Oct 2023 12:51:06 +0100
parents
children 463fc7b09119
comparison
equal deleted inserted replaced
154:5d30cd8c6254 155:56825fc8459d
#!/usr/bin/env bash
# courtesy wwaites
#
# Download the WARC files of one Common Crawl archive, segment by segment,
# into /beegfs/common_crawl/ARCHIVE/cdx/warc/SEGMENT/FILE.
#
# Usage: getcc.aws ARCHIVE [WAIT [SEGMENT_LIST]]
#   ARCHIVE       crawl id, e.g. CC-MAIN-2019-35
#   WAIT          optional delay handed to sleep(1) after each file transfer
#   SEGMENT_LIST  optional file listing segment numbers, one per line,
#                 resolved relative to the archive directory
#                 (default: all_segments, derived from warc.paths)
#
# Environment (required): AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
#   Credentials must be supplied by the caller; they are deliberately not
#   stored in this file. Any key pair that was ever committed here must be
#   treated as compromised and rotated.
# Environment (optional): PASSPHRASE
#   Passed through to child processes when set; nothing in this script reads it.

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

ARCHIVE="${1-}"
[ -n "$ARCHIVE" ] || die "Usage: ${0##*/} ARCHIVE [WAIT [SEGMENT_LIST]]"
shift

# Optional per-transfer delay. The value is spliced into the command string
# run by parallel, so only accept something sleep(1) understands.
pause_cmd=""
if [ -n "${1-}" ]; then
  [[ "$1" =~ ^[0-9]+(\.[0-9]+)?[smhd]?$ ]] || die "invalid WAIT value: $1"
  pause_cmd="; sleep $1"
  shift
fi
SEGS="${1-all_segments}"

wf=warc.paths
WARCS="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz"
archive_dir="/beegfs/common_crawl/${ARCHIVE}"

mkdir -p "$archive_dir" || die "cannot create $archive_dir"
cd "$archive_dir" || die "cannot cd to $archive_dir"

# Fetch the list of WARC paths once. Download under a temporary name so an
# interrupted or failed transfer never leaves a bogus $wf that later runs
# would silently accept.
if [ ! -s "$wf" ]; then
  if curl --retry 4 -sf "$WARCS" | gzip -dc > "$wf.tmp" && [ -s "$wf.tmp" ]; then
    mv "$wf.tmp" "$wf" || die "cannot install $wf"
  else
    rm -f "$wf.tmp"
    die "failed to download $WARCS"
  fi
fi

# The segment number is the 4th '/'-separated field of each WARC path.
if [ ! -s all_segments ]; then
  cut -f 4 -d / "$wf" | uniq > all_segments || die "cannot build all_segments"
fi

# Split the segment list into chunk files segment_aa, segment_ab, ... unless
# that was already done by an earlier run.
shopt -s nullglob
segment_files=(segment_*)
if [ "${#segment_files[@]}" -eq 0 ] || [ ! -s "${segment_files[0]}" ]; then
  n=$(wc -l < "$SEGS") || die "cannot read $SEGS"
  m=$((n / 8))
  # split rejects a chunk count of 0, which n / 8 yields for fewer than 8 segments
  if [ "$m" -lt 1 ]; then
    m=1
  fi
  split -n "l/$m" "$SEGS" segment_ || die "cannot split $SEGS"
  segment_files=(segment_*)
fi

: "${AWS_ACCESS_KEY_ID:?must be set in the environment}"
: "${AWS_SECRET_ACCESS_KEY:?must be set in the environment}"
export AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY PASSPHRASE

mkdir -p cdx/warc || die "cannot create $archive_dir/cdx/warc"
cd cdx/warc || die "cannot cd to $archive_dir/cdx/warc"

# The chunk files and the path list live in the archive directory, not in the
# download directory we are now in, so address them by absolute path.
for sf in "${segment_files[@]}"; do
  mapfile -t segments < "$archive_dir/$sf"
  for s in "${segments[@]}"; do
    [ -n "$s" ] || continue
    mkdir -p "$s" || die "cannot create directory $s"
    # Emit one "segment<TAB>file name<TAB>full path" row per WARC of this
    # segment; parallel exposes the three columns as {1}, {2} and {3}.
    grep -F -w -- "$s" "$archive_dir/$wf" |
      while IFS= read -r c; do
        printf '%s\t%s\t%s\n' "$s" "${c##*/}" "$c"
      done |
      parallel --colsep '\t' --will-cite -j 8 \
        "aws s3 cp --only-show-errors s3://commoncrawl/'{3}' '{1}/{2}' 2> >( { echo \$(date +%D:%T) '{3}' ; cat ; } >>errlog)${pause_cmd}"
  done
done