annotate bin/bigpdf.sh @ 58:4f31d3234620

try nutch fetch for big pdfs
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 22 Apr 2020 18:42:23 +0100
parents
children ff4e85d8ec31
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
58
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
#!/usr/bin/bash
# Fetch big pdfs per segment from bigpdf_?.txt
# First line thereof gives CC identifier; every following line names one
# segment to process.
#
# For each segment the script:
#   1. scans the *.hdr members of every extract_*.tar of that segment for
#      'X-HST-Truncated: length' headers and collects the associated
#      X-HST-Target-URI values into bu.txt (per-archive line counts of the
#      matched header pairs go to nb.txt);
#   2. checks that the total matched line count is exactly twice the number
#      of URIs, i.e. every truncation header was followed by a URI line;
#   3. runs nutch freegen + fetch over bu.txt in a /dev/shm work directory;
#   4. copies the work directory contents back next to the archives.
#
# Exit status: that of the last copy-back, or 1 if the list file is
# unusable, a work directory cannot be entered, or the count check fails
# (which also stops processing of the remaining segments).

h=$(hostname)
# Keep only what follows the last 'n' in the host name; that suffix selects
# this host's list file.
hn=${h##*n}
list=bigpdf_${hn}.txt
nutch=${HOME}/src/nutch-cc/runtime/local/bin/nutch
export NUTCH_HEAPSIZE=20000

# Print a timestamped, host-tagged progress line.
# $(date) is deliberately unquoted so the output keeps the exact
# single-space-separated format of the original log lines.
stamp() {
  echo $(date) "$h" "$@"
}

stamp

if [ ! -r "$list" ]
then
  echo "cannot read $list" 1>&2
  stamp 1
  exit 1
fi

rc=0
# The list is read through fd 3 so that nothing run inside the loop can
# consume it by reading stdin.  It is opened before any cd, so the relative
# name is resolved against the directory the script was started in.
{
  read -r cc <&3
  if [ -z "$cc" ]
  then
    echo "no CC identifier on first line of $list" 1>&2
    rc=1
  else
    while read -r s <&3
    do
      [ -n "$s" ] || continue
      stamp starting "$s"
      work=/dev/shm/pcrawl/$s
      dest=/beegfs/common_crawl/CC-MAIN-$cc/$s
      mkdir -p "$work/segments"
      # Everything below writes to and copies from the current directory,
      # so carrying on after a failed cd would touch the wrong files.
      cd "$work" || { rc=1; break; }
      # Start from an empty count file: counts are appended per archive and
      # leftovers from an earlier run would break the consistency check.
      : > nb.txt
      for f in "$dest"/extract_*.tar
      do
        # An unmatched glob leaves the literal pattern behind; skip it.
        [ -e "$f" ] || continue
        # Each hit is the truncation header plus the line after it.  The
        # hits are staged in a file so the line count is complete before
        # it is read (a >(wc ...) process substitution runs asynchronously).
        tar -xOf "$f" --wildcards '*.hdr' |
          grep -F -aA1 --no-group-separator 'X-HST-Truncated: length' > hits.tmp
        wc -l < hits.tmp >> nb.txt
        grep -F X-HST-Target-URI hits.tmp | cut -f 2 -d ' '
      done > bu.txt
      rm -f hits.tmp
      # Sum of the per-archive counts.  NOTE(review): this stands in for the
      # external 'tot' helper the script used before, which is assumed to
      # sum its input -- confirm.
      tcount=$(awk '{ t += $1 } END { print t + 0 }' nb.txt)
      ucount=$(wc -l < bu.txt)
      if [ "$tcount" -ne $((2 * ucount)) ]
      then
        printf "length mismatch: tcount: %d != ucount: %d\n" \
          "$tcount" "$ucount" 1>&2
        rc=1
        break
      fi
      "$nutch" freegen bu.txt segments
      "$nutch" fetch \
        -Dmapreduce.job.reduces=34 -Dhadoop.log.dir="$work" \
        segments/* -threads 36 >log 2>&1
      stamp finished "$s"
      cp -a -- * "$dest"
      rc=$?
    done
  fi
} 3< "$list"

stamp "$rc"
exit "$rc"