Mercurial > hg > cc > cirrus_home
annotate bin/bigpdf.sh @ 63:d39fd9c7f1be
misc
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 24 Apr 2020 19:55:11 +0100 |
parents | 346298ac3ab9 |
children | e71aeb3355ff |
rev | line source |
---|---|
58
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
#!/usr/bin/bash
# Fetch big pdfs per segment from bigpdf_?.txt
#  First line thereof gives CC identifier
#
# Usage: bigpdf.sh N
#   N is the only command line argument; it selects the list file
#   bigpdf_N.txt, which is looked up relative to /dev/shm.

# The list file is opened by relative name further down, so a failed cd
# must stop the script instead of letting it run in the wrong directory.
cd /dev/shm || exit 1

# Refuse to run without the selector: an empty value would silently turn
# the list file name into "bigpdf_.txt".
hn=${1:?usage: $0 N (selects bigpdf_N.txt)}

# Start-of-run log line: timestamp followed by the selector.
echo $(date) "$hn"
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
# tot: read lines on stdin, add up the first whitespace-separated field of
# each line, and print the total as an unsigned integer followed by a
# newline.  Prints 0 when stdin is empty.
tot ()
{
  awk '
    BEGIN { total = 0 }
          { total += $1 }
    END   { printf "%u\n", total }
  '
}
58
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Main job.
#
# bigpdf_${hn}.txt: the first line is the crawl identifier (used as
# CC-MAIN-$cc), every following line names one segment to process.
#
# For each segment:
#   1. unless bu.txt already exists, build it from the segment's
#      extract_*.tar archives: one URL per line, taken from the
#      X-HST-Target-URI line that accompanies each
#      'X-HST-Truncated: length' header line; nb.txt records how many
#      header lines were seen per archive;
#   2. check that nb.txt adds up to exactly two header lines per URL;
#   3. run nutch freegen + fetch on the URL list;
#   4. on success copy the results next to the archives and delete the
#      local cdx/segments/warc directories.
#
# Everything runs on the right-hand side of a pipeline, i.e. in a subshell:
# an `exit` in there ends the job but still lets the final status line
# below be printed.
head -n 1 "bigpdf_${hn}.txt" |
  { read -r cc
    if [ -z "$cc" ]
    then
      echo "no CC identifier found in bigpdf_${hn}.txt" 1>&2
      exit 1
    fi
    tail -n +2 "bigpdf_${hn}.txt" |
      while read -r s
      do
        # A blank line would make the paths below point at the parent dirs.
        [ -n "$s" ] || continue
        echo $(date) $(hostname) starting "$s"
        work=/dev/shm/pcrawl/$s
        src=/beegfs/common_crawl/CC-MAIN-$cc/$s
        mkdir -p "$work/segments"
        # cp -a * and rm -rf below act on the current directory, so never
        # carry on if we failed to get into the work directory.
        cd "$work" || exit 1
        if [ ! -f bu.txt ]
        then
          # Start the counts afresh: nb.txt is appended to once per archive
          # and a leftover from an interrupted run would be double counted.
          : > nb.txt
          for f in "$src"/extract_*.tar
          do
            # An unmatched glob is left as the literal pattern; skip it.
            [ -e "$f" ] || continue
            # GNU tar only treats member names as patterns on extraction
            # when --wildcards is given.
            # NOTE(review): confirm the tar in use accepts --wildcards.
            #
            # The awk stage does three jobs in one synchronous pass, so
            # nb.txt is complete as soon as the pipeline returns:
            #   - counts every line it receives and appends that count to
            #     nb.txt;
            #   - selects the lines containing X-HST-Target-URI;
            #   - prints their second single-space-separated field (the
            #     whole line if it contains no space).
            tar -xOf "$f" --wildcards '*.hdr' |
              grep -F -aA1 --no-group-separator 'X-HST-Truncated: length' |
              awk '
                { lines++ }
                index($0, "X-HST-Target-URI") {
                  if (split($0, part, "[ ]") > 1)
                    print part[2]
                  else
                    print $0
                }
                END { print (lines + 0) >> "nb.txt" }
              '
          done > bu.txt.part
          # Publish the list only once it is complete, so that an
          # interrupted run is redone instead of trusted.
          mv bu.txt.part bu.txt
        fi
        # Expect exactly two header lines (Truncated + Target-URI) per URL.
        tcount=$(tot < nb.txt)
        ucount=$(wc -l < bu.txt)
        if [ "$tcount" -ne $((2 * ucount)) ]
        then
          printf "length mismatch: tcount: %d != ucount: %d\n" \
                 "$tcount" "$ucount" 1>&2
          exit 1
        fi
        # Settings picked up by the nutch launcher from the environment.
        export NUTCH_HEAPSIZE=20000
        export NUTCH_LOG_DIR=$work
        nutch=${HOME}/src/nutch-cc/runtime/local/bin/nutch
        "$nutch" freegen -Dmapreduce.job.reduces=1 bu.txt segments
        # Copy back and clean up only if the fetch itself succeeded.
        "$nutch" fetch \
                 -Dmapreduce.job.reduces=34 \
                 segments/* -threads 144 >/dev/null 2>&1 &&
          echo $(date) $(hostname) finished "$s" &&
          cp -a ./* "$src" &&
          rm -rf cdx segments warc
      done
  }
# Save the job's status before anything else runs: the $(date) and
# $(hostname) substitutions on the next line are executed before $? would
# be expanded there and would replace its value.
rc=$?
echo $(date) $(hostname) $rc
exit "$rc"