Mercurial > hg > cc > cirrus_home
annotate bin/bigpdf.sh @ 58:4f31d3234620
try nutch fetch for big pdfs
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 22 Apr 2020 18:42:23 +0100 |
parents | |
children | ff4e85d8ec31 |
rev | line source |
---|---|
58
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
#!/usr/bin/bash
# Fetch big pdfs per segment from bigpdf_?.txt
# First line thereof gives CC identifier
#
# The list file is bigpdf_<hn>.txt in the directory the script is started
# from, where <hn> is whatever follows the last 'n' in this machine's
# hostname.  Its first line is the crawl identifier (used as CC-MAIN-<cc>),
# every further line names one segment.
#
# For each segment the script:
#   1. scans the *.hdr members of the segment's extract_*.tar archives for
#      'X-HST-Truncated: length' records and collects the matching
#      X-HST-Target-URI values into bu.txt,
#   2. checks that the number of collected header lines is exactly twice
#      the number of URIs,
#   3. runs nutch freegen + nutch fetch on bu.txt under /dev/shm/pcrawl/<seg>,
#   4. copies the results back next to the segment's archives.
#
# The last line of output is a timestamp, the hostname and the exit status
# of the whole pipeline (1 if a segment failed its setup or length check).
echo $(date) $(hostname)
h=$(hostname)
hn=${h##*n}
list=bigpdf_${hn}.txt
if [ ! -r "$list" ]
then
  printf "cannot read segment list: %s\n" "$list" 1>&2
  exit 1
fi
head -1 "$list" |\
{ read -r cc
  if [ -z "$cc" ]
  then
    printf "no CC identifier on first line of %s\n" "$list" 1>&2
    exit 1
  fi
  tail -n +2 "$list" |\
  while read -r s
  do
    echo $(date) $(hostname) starting "$s"
    work=/dev/shm/pcrawl/$s
    store=/beegfs/common_crawl/CC-MAIN-$cc/$s
    # Everything below uses paths relative to $work, so do not carry on
    # if the scratch directory cannot be created or entered.
    mkdir -p "$work/segments" && cd "$work" || exit 1
    # nb.txt is appended to once per archive; start from empty so that a
    # re-run of the same segment does not add to stale counts.
    : > nb.txt
    for f in "$store"/extract_*.tar
    do
      # -A1 emits each matching line plus the line after it, i.e. two
      # lines per truncated record; their count is appended to nb.txt and
      # field 2 of the X-HST-Target-URI lines goes to bu.txt.
      # GNU tar needs --wildcards for member-name patterns on extraction.
      # NOTE(review): the wc inside >( ) runs asynchronously and may not
      # have written its count by the time nb.txt is read below -- confirm.
      tar -xOf "$f" --wildcards '*.hdr' |\
        fgrep -aA1 --no-group-separator 'X-HST-Truncated: length' |\
        tee >(wc -l >>nb.txt) | fgrep X-HST-Target-URI | cut -f 2 -d ' '
    done > bu.txt
    # NOTE(review): tot is not defined in this script; presumably a helper
    # on PATH that sums the numbers on its stdin -- confirm.
    tcount=$(tot < "$work/nb.txt")
    ucount=$(wc -l < "$work/bu.txt")
    if [ "$tcount" -ne $((2 * ucount)) ]
    then
      printf "length mismatch: tcount: %d != ucount: %d\n" \
             "$tcount" "$ucount" 1>&2
      # This loop runs in a pipeline subshell: exit ends the loop and sets
      # the status reported by the final echo, it does not skip that echo.
      exit 1
    fi
    export NUTCH_HEAPSIZE=20000
    ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen bu.txt segments
    ${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\
      -Dmapreduce.job.reduces=34 -Dhadoop.log.dir="$work"\
      segments/* -threads 36 >log 2>&1
    echo $(date) $(hostname) finished "$s"
    # ./ keeps a file name starting with '-' from being read as an option.
    cp -a ./* "$store"
  done
}
echo $(date) $(hostname) $?