annotate eidf125_example.sh @ 6:cc5cef8ba548 default tip

expanded with example script, updated to point to full paper, included slides
author Henry Thompson <ht@markup.co.uk>
date Thu, 23 May 2024 16:51:36 +0200
parents e265fcc42974
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 # Illustrates the use of eidf125 data to find a particular archived page from CC-MAIN-2019-35
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 # Copy and paste one command at a time so you can see each command's output
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 aws s3 ls s3://eidf125-cc-main-2019-35-augmented-index --no-sign-request --endpoint-url https://s3.eidf.ac.uk/
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 aws s3 cp s3://eidf125-cc-main-2019-35-augmented-index/cluster.idx --no-sign-request --endpoint-url https://s3.eidf.ac.uk/ .
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 grep -E -n '^uk,ac,ed' cluster.idx |wc -l
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 grep -E -n '^uk,ac,ed,inf' cluster.idx |wc -l
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 grep -E -n '^uk,ac,ed,inf' cluster.idx
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 aws s3api get-object --bucket eidf125-cc-main-2019-35-augmented-index --key idx/cdx-00289.gz --range bytes=88951112-$((88951112 + 181085 - 1)) --no-sign-request --endpoint-url https://s3.eidf.ac.uk/ block.gz
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9 ls -l block.gz
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10 zcat block.gz|less
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 zgrep ')/people/staff/.*lastmod' block.gz | sed 's/ .*lastmod/ /' | sort -k2r
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12 date --date=@1533112224
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
13 zgrep Plotkin block.gz
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
14 curl --range 847252902-$((847252902 + 3179 - 1)) -o plotkin.gz https://data.commoncrawl.org/crawl-data/CC-MAIN-2019-35/segments/1566027313803.9/warc/CC-MAIN-20190818104019-20190818130019-00233.warc.gz
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
15 zcat plotkin.gz |less
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
16 # If you have the relevant credentials, this will also work:
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
17 # aws s3api get-object --bucket commoncrawl --key crawl-data/CC-MAIN-2019-35/segments/1566027313803.9/warc/CC-MAIN-20190818104019-20190818130019-00233.warc.gz --range bytes=847252902-$((847252902 + 3179 - 1)) plotkin.gz