Mercurial > hg > cc > cirrus_work
annotate bin/getidx.aws @ 249:87a35540104b
time the unpickling
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 02 Jan 2025 18:35:08 +0000 |
parents | 56825fc8459d |
children |
rev | line source |
---|---|
155
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 # courtesy wwaites |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 ARCHIVE="${1:?usage: getidx.aws ARCHIVE [SLEEP_SECONDS]}" # crawl-data subdirectory name; abort with a usage message when it is missing |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 shift # drop ARCHIVE so that an optional second argument becomes $1 |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 if [ "$1" ] |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 then |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 wait="; sleep $1" # shell fragment appended to each parallel job's command string below, so every job runs `sleep $1` after its copy |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 fi |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 cf=cc-index.paths # name of the local, uncompressed copy of the index path list |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 CCs=https://data.commoncrawl.org/crawl-data/${ARCHIVE}/cc-index.paths.gz # URL of the gzipped path list for the requested crawl |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 mkdir -p "/beegfs/common_crawl/${ARCHIVE}" || exit 1 # create the per-crawl working directory |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 cd "/beegfs/common_crawl/${ARCHIVE}" || exit 1 # stop here rather than download into whatever directory we were started in |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
15 |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
16 if [ ! -f "$cf" ] # fetch the path list only when there is no local copy yet |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
17 then |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
18 curl --retry 4 -sf "${CCs}" | gzip -dc > "$cf.part" && [ -s "$cf.part" ] && mv "$cf.part" "$cf" || { rm -f "$cf.part"; echo "failed to fetch ${CCs}" >&2; exit 1; } # go via a temp file so a failed download never leaves an empty/partial $cf that the -f test above would trust on the next run |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
19 fi |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
20 |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
21 # n=$(cat $SEGS | wc -l) |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
22 # m=$((n / 8)) |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
23 # split -n l/$m $SEGS segment_ |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
24 |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
25 # Credentials come from the caller's environment and must never be committed. The key pair and passphrase previously hard-coded here are public and should be revoked/rotated. |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
26 export AWS_ACCESS_KEY_ID # passed through to the aws CLI when set; when unset the CLI uses its own credential lookup |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
27 export AWS_SECRET_ACCESS_KEY |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
28 export PASSPHRASE # not referenced elsewhere in this script; still exported for any child process that expects it |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
29 |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
30 mkdir -p cdx/warc |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
31 cd cdx/warc || exit 1 # the relative ../../$cf below and the download destination both depend on this cd succeeding |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
32 cat "../../$cf" |\ |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
33 parallel --will-cite -j 5 \ |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
34 "f='{}' && aws s3 cp s3://commoncrawl/\$f \${f##*/} --only-show-errors 2> >( { echo \$(date +%D:%T) '{}' ; cat ; } >>ierrlog) $wait" # each input line is an S3 key copied to its basename here, 5 jobs at a time; every job appends a timestamp+key line and then any aws stderr to ierrlog |
56825fc8459d
moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
35 |