Mercurial > hg > cc > cirrus_home
annotate bin/bigpdf.sh @ 61:b24755311af8
x
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 23 Apr 2020 17:26:55 +0100 |
parents | ff4e85d8ec31 |
children | 346298ac3ab9 |
rev | line source |
---|---|
#!/usr/bin/bash
# Fetch big pdfs per segment from bigpdf_?.txt
#  First line thereof gives CC identifier
# Usage: bigpdf.sh N
#  N replaces the ? above, i.e. the input list is bigpdf_N.txt

# Without an argument every later reference would silently become
# bigpdf_.txt, so stop early with a usage message instead.
if [ -z "${1:-}" ]
then
  echo "usage: $0 N   (reads bigpdf_N.txt)" 1>&2
  exit 2
fi
hn=$1
# Timestamped start marker for the log
echo $(date) "$hn"
# tot: add up the first field of every line read from stdin and print the
# total as an unsigned integer followed by a newline.
# Empty input yields 0.
tot ()
{
  awk 'BEGIN {sum = 0} {sum += $1} END {printf "%u\n", sum}'
}
# Main stage: read the CC identifier (first line) and the segment names
# (remaining lines) from bigpdf_${hn}.txt.  For each segment, collect the
# target URIs of truncated records from the segment's extract_*.tar files,
# fetch them with nutch in a scratch directory under /dev/shm/pcrawl, and
# copy the scratch directory contents back next to the tar files.
#
# The stage runs in a subshell so that `exit` inside the loop only ends this
# stage; its status is then available as $? to whatever statement follows.
# The list is read on fd 3 rather than stdin so that no command run inside
# the loop can consume the remaining segment names.
(
  nutch=${HOME}/src/nutch-cc/runtime/local/bin/nutch

  read -r cc <&3
  if [ -z "$cc" ]
  then
    echo "bigpdf_${hn}.txt: no CC identifier on first line" 1>&2
    exit 1
  fi

  # `|| [ -n "$s" ]` keeps a final line that lacks a trailing newline
  while read -r s <&3 || [ -n "$s" ]
  do
    # A blank line would make the paths below point at the parent directory
    [ -n "$s" ] || continue

    src=/beegfs/common_crawl/CC-MAIN-$cc/$s
    work=/dev/shm/pcrawl/$s

    echo $(date) $(hostname) starting $s
    mkdir -p "$work/segments"
    # Everything below uses relative paths (including the final `cp -a *`),
    # so carrying on after a failed cd would act on the wrong directory.
    cd "$work" || exit 1

    if [ ! -f bu.txt ]
    then
      # nb.txt gets one line count per tar file.  Start it afresh so that
      # counts left by an interrupted earlier run cannot accumulate.
      : > nb.txt
      for f in "$src"/extract_*.tar
      do
        # Keep each header line reporting truncation plus the line after it
        # (-A1).  Going through a file instead of `tee >(wc ...)` ensures the
        # count has been written before it is checked below.
        tar -xOf "$f" '*.hdr' |
          grep -F -aA1 --no-group-separator 'X-HST-Truncated: length' > trunc.tmp
        wc -l < trunc.tmp >> nb.txt
        # -a here as well, so non-text bytes in a header cannot turn the
        # output into a "Binary file matches" notice.
        grep -F -a X-HST-Target-URI trunc.tmp | cut -f 2 -d ' '
      done > bu.txt.tmp
      rm -f trunc.tmp
      # Only a completely written list is published under the name that the
      # `-f bu.txt` test above treats as "already done".
      mv bu.txt.tmp bu.txt
    fi

    # Sanity check: every URL in bu.txt should correspond to exactly two
    # lines counted in nb.txt (the matched line and its -A1 context line).
    tcount=$(tot < nb.txt)
    ucount=$(wc -l < bu.txt)
    if [ "${tcount:-0}" -ne "$((2 * ${ucount:-0}))" ]
    then
      printf "length mismatch: tcount: %d != ucount: %d\n" \
             "${tcount:-0}" "${ucount:-0}" 1>&2
      exit 1
    fi

    export NUTCH_HEAPSIZE=20000
    export NUTCH_LOG_DIR=$work
    "$nutch" freegen bu.txt segments
    # Copy back only if the fetch succeeded.  The original follow-up
    # `rm -rf cdx segments warc` was commented out, so the scratch copy is
    # left in place.
    "$nutch" fetch -Dmapreduce.job.reduces=34 \
             segments/* -threads 72 >/dev/null 2>&1 &&
      echo $(date) $(hostname) finished $s &&
      cp -a * "$src"
  done
) 3< "bigpdf_${hn}.txt"
# Report the exit status of the stage that has just finished.  It must be
# saved first: $(date) and $(hostname) are themselves commands, and by the
# time a `$?` later on the same line is expanded it holds their status.
rc=$?
echo $(date) $(hostname) $rc