Mercurial > hg > cc > pub
changeset 5:e265fcc42974
s3 use
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 23 May 2024 15:00:40 +0100 |
parents | 268fe5fd117f |
children | cc5cef8ba548 |
files | eidf125_example.sh |
diffstat | 1 files changed, 17 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/eidf125_example.sh Thu May 23 15:00:40 2024 +0100
@@ -0,0 +1,17 @@
+# Illustrates the use of eidf125 data to find a particular archived page from CC-MAIN-2019-35
+# Cut-and-paste a bit at a time so you can see the outputs
+aws s3 ls s3://eidf125-cc-main-2019-35-augmented-index --no-sign-request --endpoint-url https://s3.eidf.ac.uk/
+aws s3 cp s3://eidf125-cc-main-2019-35-augmented-index/cluster.idx --no-sign-request --endpoint-url https://s3.eidf.ac.uk/ .
+grep -E -n '^uk,ac,ed' cluster.idx |wc -l
+grep -E -n '^uk,ac,ed,inf' cluster.idx |wc -l
+grep -E -n '^uk,ac,ed,inf' cluster.idx
+aws s3api get-object --bucket eidf125-cc-main-2019-35-augmented-index --key idx/cdx-00289.gz --range bytes=88951112-$((88951112 + 181085 - 1)) --no-sign-request --endpoint-url https://s3.eidf.ac.uk/ block.gz
+ls -l block.gz
+zcat block.gz|less
+zgrep ')/people/staff/.*lastmod' block.gz | sed 's/ .*lastmod/ /' | sort -k2r
+date --date=@1533112224
+zgrep Plotkin block.gz
+curl --range 847252902-$((847252902 + 3179 - 1)) -o plotkin.gz https://data.commoncrawl.org/crawl-data/CC-MAIN-2019-35/segments/1566027313803.9/warc/CC-MAIN-20190818104019-20190818130019-00233.warc.gz
+zcat plotkin.gz |less
+# If you have the relevant credentials, this will also work:
+# aws s3api get-object --bucket commoncrawl --key crawl-data/CC-MAIN-2019-35/segments/1566027313803.9/warc/CC-MAIN-20190818104019-20190818130019-00233.warc.gz --range bytes=847252902-$((847252902 + 3179 - 1)) plotkin.gz