annotate bin/getcc.aws @ 155:56825fc8459d

moved from /beegfs/common-crawl to get under .hg
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 11 Oct 2023 12:51:06 +0100
parents
children 463fc7b09119
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
155
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# courtesy wwaites
# Usage: getcc.aws ARCHIVE [DELAY [SEGMENT_FILE]]
#   ARCHIVE       crawl name, e.g. CC-MAIN-2019-35
#   DELAY         optional number of seconds to sleep after each download
#                 (pass "" to skip the delay but still give SEGMENT_FILE)
#   SEGMENT_FILE  optional file listing segment numbers, one per line,
#                 resolved relative to the archive directory
#                 (default: all_segments, i.e. every segment of the crawl)
#
# Fetches the WARC files of the selected segments from s3://commoncrawl into
# /beegfs/common_crawl/ARCHIVE/<segment>/, eight transfers at a time, using
# GNU parallel and the AWS CLI.  A timestamped line per file, followed by any
# error output of the transfer, is appended to ./errlog in the archive
# directory.
#
# Credentials: none are stored in this file, and none must ever be added to
# it.  The AWS CLI picks them up from its usual sources (AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY in the calling environment, ~/.aws/credentials, or an
# instance role).

usage="Usage: getcc.aws ARCHIVE [DELAY [SEGMENT_FILE]]"

if [ "$#" -lt 1 ] || [ -z "$1" ]
then
  echo "$usage" >&2
  exit 2
fi
ARCHIVE="$1"
shift

# Optional per-download delay.  It is spliced into the command line handed to
# parallel, so only plain numbers are accepted.
# NOTE(review): the original parsed this argument but never used it; appending
# it to each job is the evident intent of its "; sleep N" form -- confirm.
delay_cmd=""
if [ "$#" -gt 0 ]
then
  if [ -n "$1" ]
  then
    case "$1" in
      *[!0-9.]*)
        echo "getcc.aws: DELAY must be a number of seconds, got '$1'" >&2
        echo "$usage" >&2
        exit 2
        ;;
    esac
    delay_cmd="; sleep $1"
  fi
  shift
fi
SEGS="${1-all_segments}"

for tool in curl gzip split parallel aws
do
  if ! command -v "$tool" >/dev/null 2>&1
  then
    echo "getcc.aws: required tool '$tool' not found in PATH" >&2
    exit 1
  fi
done

wf=warc.paths

WARCS="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz"
DEST="/beegfs/common_crawl/${ARCHIVE}"

mkdir -p "$DEST" || exit 1
cd "$DEST" || exit 1

# Path listing for the whole crawl.  Download under a temporary name so that a
# failed or partial transfer cannot leave behind a truncated listing that
# later runs would silently trust.
if [ ! -s "$wf" ]
then
  if curl --retry 4 -sf "$WARCS" | gzip -dc > "$wf.part" && [ -s "$wf.part" ]
  then
    mv "$wf.part" "$wf" || exit 1
  else
    rm -f "$wf.part"
    echo "getcc.aws: could not fetch $WARCS" >&2
    exit 1
  fi
fi

# Field 4 of each path is the segment number; the listing is grouped by
# segment, so uniq is enough to obtain one line per segment.
if [ ! -s all_segments ]
then
  cut -f 4 -d / "$wf" | uniq > all_segments
fi

if [ ! -s "$SEGS" ]
then
  echo "getcc.aws: segment list '$SEGS' is missing or empty in $DEST" >&2
  exit 1
fi

# Split the segment list into chunks of roughly eight segments each, unless a
# previous run already did so.  The positional parameters are no longer needed
# at this point, so they are reused to look at the first segment_* file (an
# unmatched glob stays literal and fails the -s test).
set -- segment_*
if [ ! -s "$1" ]
then
  n=$(wc -l < "$SEGS")
  m=$((n / 8))
  # split rejects a chunk count of 0, which n / 8 yields for short lists.
  if [ "$m" -lt 1 ]
  then
    m=1
  fi
  split -n "l/$m" "$SEGS" segment_ || exit 1
fi

for sf in segment_*
do
  # The segment list is read on descriptor 3 so that nothing inside the loop
  # can consume it through stdin.
  while read -r s <&3
  do
    if [ -z "$s" ]
    then
      continue
    fi
    mkdir -p "$s" || exit 1
    # One tab-separated record per WARC file of this segment:
    #   {1} segment   {2} file name   {3} full key inside the bucket
    grep -Fw -- "$s" "$wf" |
      while read -r c
      do
        printf '%s\t%s\t%s\n' "$s" "${c##*/}" "$c"
      done |
      parallel --colsep '\t' --will-cite -j 8 \
        "( echo \$(date +%D:%T) '{3}' ; aws s3 cp --only-show-errors s3://commoncrawl/'{3}' '{1}/{2}' 2>&1 ) >>errlog${delay_cmd}"
  done 3< "$sf"
done