annotate bin/getidx.aws @ 249:87a35540104b

time the unpickling
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 02 Jan 2025 18:35:08 +0000
parents 56825fc8459d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
155
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# courtesy wwaites
#
# Download the CDX index files listed in a Common Crawl archive's
# cc-index.paths manifest.
#
# Usage: getidx.aws ARCHIVE [SLEEP]
#   ARCHIVE  crawl identifier; used both in the data.commoncrawl.org URL and
#            as the directory name under /beegfs/common_crawl
#   SLEEP    optional; if given, each download job is followed by
#            "sleep SLEEP" (the value is spliced into a shell command line,
#            so pass only a plain duration)
#
# Credentials: this script no longer embeds any AWS keys.  Supply them
# through the environment (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY) or the
# AWS CLI's own configuration before running.  The keys and passphrase that
# used to be hard-coded here are in the repository history and must be
# treated as compromised: revoke/rotate them.
#
# Exit status: 2 on usage error, 1 if the working directories or the
# manifest cannot be set up, otherwise the exit status of parallel.

# Print a message to stderr and abort.
die() {
  printf 'getidx.aws: %s\n' "$*" >&2
  exit 1
}

ARCHIVE="${1:-}"
if [ -z "$ARCHIVE" ]; then
  printf 'Usage: %s ARCHIVE [SLEEP]\n' "${0##*/}" >&2
  exit 2
fi
shift

# Optional per-job delay, appended verbatim to the command run by parallel.
pause=""
if [ "${1:-}" ]; then
  pause="; sleep $1"
fi

cf=cc-index.paths

CCs="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/cc-index.paths.gz"
dest="/beegfs/common_crawl/${ARCHIVE}"

# Stop if the target directory is unavailable; otherwise everything below
# would be downloaded into whatever directory the script was started from.
mkdir -p -- "$dest" || die "cannot create $dest"
cd -- "$dest" || die "cannot cd to $dest"

if [ ! -f "$cf" ]; then
  # Fetch into a scratch file and rename only on success, so a failed or
  # truncated download never leaves behind a $cf that later runs would
  # mistake for a complete manifest.
  tmp="$cf.part.$$"
  if curl --retry 4 -s -f "$CCs" | gzip -dc > "$tmp" && [ -s "$tmp" ]; then
    mv -- "$tmp" "$cf" || die "cannot rename $tmp to $cf"
  else
    rm -f -- "$tmp"
    die "failed to download or decompress $CCs"
  fi
fi

# n=$(cat $SEGS | wc -l)
# m=$((n / 8))
# split -n l/$m $SEGS segment_

mkdir -p cdx/warc || die "cannot create $dest/cdx/warc"
cd cdx/warc || die "cannot cd to $dest/cdx/warc"

# Run up to 5 copies at a time, one per manifest line.  Each job copies
# s3://commoncrawl/<path> to a local file named after the path's last
# component.  Its stderr is appended to ierrlog, preceded by a
# "<date> <path>" line; that header line is written for every job, not only
# failing ones.
# NOTE(review): the ">( ... )" process substitution inside the job string
# needs parallel to run its jobs under a shell that supports it (e.g. bash)
# -- confirm on the target host.
parallel --will-cite -j 5 \
  "f='{}' && aws s3 cp s3://commoncrawl/\$f \${f##*/} --only-show-errors 2> >( { echo \$(date +%D:%T) '{}' ; cat ; } >>ierrlog) $pause" \
  < "../../$cf"