Mercurial > hg > cc > cirrus_work
annotate bin/getcc.aws @ 155:56825fc8459d
moved from /beegfs/common-crawl to get under .hg
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 11 Oct 2023 12:51:06 +0100 |
parents | |
children | 463fc7b09119 |
rev | line source |
---|---|
# courtesy wwaites
#
# Usage: getcc.aws ARCHIVE [DELAY [SEGMENT_LIST]]
#   ARCHIVE       crawl id, e.g. CC-MAIN-2019-35
#   DELAY         seconds to sleep after each download; pass '' for none
#   SEGMENT_LIST  file listing segment numbers, one per line; a relative
#                 path is resolved inside the archive directory
#                 (default: all_segments, derived from warc.paths)
#
# Fetches the WARC files of a Common Crawl archive into
# $CC_ROOT/ARCHIVE/<segment>/<file>, eight downloads at a time, one segment
# after another.  For every download a timestamp, the object key and
# anything the transfer wrote to stderr are appended to ./errlog.
#
# Environment:
#   CC_ROOT   root of the local mirror (default: /beegfs/common_crawl)
#   AWS credentials are taken from the standard AWS CLI sources
#   (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY, ~/.aws/credentials, ...).
#   They are deliberately NOT stored in this file: the key pair and
#   passphrase that earlier revisions embedded here must be treated as
#   leaked and rotated.  If some downstream tool needs PASSPHRASE, export
#   it in the calling environment.
#
# Must be run by bash: the per-job command uses process substitution and
# the script itself uses [[ ]] and arrays.

[ -n "${BASH_VERSION-}" ] || { echo "getcc.aws: must be run with bash" >&2; exit 2; }

set -uo pipefail

die() {
  printf 'getcc.aws: %s\n' "$*" >&2
  exit 1
}

usage() {
  printf 'Usage: getcc.aws ARCHIVE [DELAY [SEGMENT_LIST]]\n' >&2
  exit 2
}

(( $# >= 1 )) && [[ -n $1 ]] || usage
ARCHIVE="$1"
shift

# Optional per-download delay.  It is appended to the command that parallel
# runs for each file, so it throttles every one of the 8 job slots.
throttle=""
if (( $# >= 1 )); then
  if [[ -n $1 ]]; then
    [[ $1 =~ ^[0-9]+([.][0-9]+)?$ ]] ||
      die "DELAY must be a number of seconds, got '$1' (usage: ARCHIVE [DELAY [SEGMENT_LIST]])"
    throttle="; sleep $1"
  fi
  shift
fi
SEGS="${1:-all_segments}"

for tool in curl gzip split parallel aws; do
  command -v "$tool" >/dev/null || die "required tool not found: $tool"
done

wf=warc.paths
WARCS="https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz"
root="${CC_ROOT:-/beegfs/common_crawl}"

mkdir -p -- "${root}/${ARCHIVE}" || die "cannot create ${root}/${ARCHIVE}"
cd -- "${root}/${ARCHIVE}" || die "cannot cd to ${root}/${ARCHIVE}"

# Fetch the list of WARC paths once.  Download to a scratch name and rename
# only on success, so a failed transfer is not mistaken for a valid list on
# the next run.
if [[ ! -s $wf ]]; then
  if curl --retry 4 -fsS "$WARCS" | gzip -dc > "${wf}.part"; then
    mv -- "${wf}.part" "$wf" || die "cannot install $wf"
  else
    rm -f -- "${wf}.part"
    die "failed to fetch $WARCS"
  fi
fi

# Field 4 of crawl-data/<archive>/segments/<segment>/warc/<file> is the
# segment number; entries of one segment are adjacent, so uniq suffices.
if [[ ! -s all_segments ]]; then
  cut -f 4 -d / "$wf" | uniq > all_segments || die "cannot build all_segments"
fi

# Split the segment list into chunks of roughly 8 segments each, unless a
# previous run already did so.  NOTE: existing segment_* files win over a
# newly supplied SEGMENT_LIST; remove them to re-split.
shopt -s nullglob
chunks=(segment_*)
if (( ${#chunks[@]} == 0 )) || [[ ! -s ${chunks[0]} ]]; then
  [[ -r $SEGS ]] || die "segment list not readable: $SEGS"
  n=$(wc -l < "$SEGS") || die "cannot count lines of $SEGS"
  m=$(( n / 8 ))
  # Fewer than 8 segments would give 0 chunks, which split rejects.
  (( m >= 1 )) || m=1
  split -n "l/$m" -- "$SEGS" segment_ || die "cannot split $SEGS"
  chunks=(segment_*)
fi

rc=0
for chunk in "${chunks[@]}"; do
  # fd 3 keeps the segment list away from anything in the body that might
  # read stdin.
  while IFS= read -r seg <&3 || [[ -n $seg ]]; do
    [[ -n $seg ]] || continue
    mkdir -p -- "$seg" || die "cannot create directory $seg"
    # One tab-separated job per WARC file: segment, file name, object key.
    grep -Fw -- "$seg" "$wf" |
      while IFS= read -r key; do
        printf '%s\t%s\t%s\n' "$seg" "${key##*/}" "$key"
      done |
      parallel --colsep '\t' --will-cite -j 8 \
        "aws s3 cp --only-show-errors s3://commoncrawl/'{3}' '{1}/{2}' 2> >( { echo \$(date +%D:%T) '{3}' ; cat ; } >>errlog)${throttle}" ||
      rc=1
  done 3< "$chunk"
done

exit "$rc"