annotate bin/getcc.aws @ 223:36610ddfbc7a

replaced mean_lens by w or wo bogon
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 28 Feb 2024 14:49:45 +0000
parents afd2ece1fb22
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
155
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 # courtesy wwaites
159
ebff60e85c59 now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 157
diff changeset
2 # Usage: getcc.aws ARCHIVE SEGMENT_ID   (both required; e.g. getcc.aws CC-MAIN-2019-35 68)
ebff60e85c59 now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 157
diff changeset
3 # Single segment, single thread
155
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 ARCHIVE="$1"
159
ebff60e85c59 now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 157
diff changeset
5 SEG="$2"
155
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 wf=warc.paths
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9 WARCS=https://data.commoncrawl.org/crawl-data/${ARCHIVE}/${wf}.gz
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 mkdir -p /beegfs/common_crawl/${ARCHIVE}
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12 cd /beegfs/common_crawl/${ARCHIVE}
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
13
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
14 if [ ! -f $wf ]
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
15 then
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
16 curl --retry 4 -s ${WARCS} | gzip -dc > $wf
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
17 fi
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
18
157
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
19 #export AWS_ACCESS_KEY_ID="<your-access-key-id>"
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
20 #export AWS_SECRET_ACCESS_KEY="<your-secret-access-key>"
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
21 #export PASSPHRASE="<your-passphrase>"
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
22 #export AWS_RETRY_MODE=adaptive
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
23 #export AWS_MAX_ATTEMPTS=100
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
24 # Set these in ~/.aws/credentials as follows
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
25 # [hst]
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
26 # aws_access_key_id = <your-access-key-id>
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
27 # aws_secret_access_key = <your-secret-access-key>
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
28 # And these in ~/.aws/config as follows
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
29 # [profile hst]
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
30 # retry_mode = adaptive
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
31 # max_attempts = 100
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
32 # s3 =
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
33 # multipart_threshold = 4GB
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
34 # max_concurrent_requests = 1
463fc7b09119 convert to single thread,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 155
diff changeset
35 # multipart_chunksize = 32MB
155
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
36
159
ebff60e85c59 now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 157
diff changeset
37 s=$(grep -Eow "[0-9]*\.$SEG" $wf | head -1)
160
afd2ece1fb22 add missing makedir
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 159
diff changeset
38 mkdir -p $s/orig/warc
159
ebff60e85c59 now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 157
diff changeset
39 fgrep -w $s $wf |\
ebff60e85c59 now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 157
diff changeset
40 while read f
155
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
41 do
159
ebff60e85c59 now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 157
diff changeset
42 g=$s/orig/warc/${f##*/}
ebff60e85c59 now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 157
diff changeset
43 if [ ! -f "$g" ]
ebff60e85c59 now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 157
diff changeset
44 then
ebff60e85c59 now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 157
diff changeset
45 aws s3 cp s3://commoncrawl/$f $g $(cat debug) --only-show-errors 2> >( { echo $(date +%D:%T) $f ; cat ; } >>errlog_$SEG )
ebff60e85c59 now does one named segment only
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 157
diff changeset
46 fi
155
56825fc8459d moved from /beegfs/common-crawl to get under .hg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
47 done