comparison bin/bigpdf.sh @ 58:4f31d3234620

try nutch fetch for big pdfs
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 22 Apr 2020 18:42:23 +0100
parents
children ff4e85d8ec31
comparison
equal deleted inserted replaced
57:849ccd30258d 58:4f31d3234620
#!/usr/bin/bash
# Fetch big PDFs per segment with Nutch, driven by bigpdf_<N>.txt in the
# current directory, where <N> is whatever follows the last 'n' in this
# host's name.
#
# Input file layout:
#   line 1      Common Crawl identifier (used as CC-MAIN-<id>)
#   lines 2..   one segment name per line
#
# For each segment the script
#   1. scans the '*.hdr' members of every extract_*.tar of that segment:
#      each 'X-HST-Truncated: length' line plus the line after it is
#      selected, the number of selected lines per tar file is appended to
#      nb.txt, and the second space-separated field of the selected
#      X-HST-Target-URI lines is collected in bu.txt;
#   2. checks that the summed nb.txt counts equal twice the number of
#      lines in bu.txt;
#   3. runs nutch freegen and nutch fetch on bu.txt;
#   4. copies everything in the working directory back beside the segment.
#
# The script stops at the first segment that fails, logs the resulting
# status and exits with it.

readonly NUTCH="${HOME}/src/nutch-cc/runtime/local/bin/nutch"
readonly CC_ROOT=/beegfs/common_crawl
readonly WORK_ROOT=/dev/shm/pcrawl

export NUTCH_HEAPSIZE=20000

# Print a timestamped, host-tagged log line made of the arguments (if any).
log() {
  echo "$(date) $(hostname)${*:+ $*}"
}

# Build nb.txt and bu.txt in the current directory from the tar files
# under directory $1.
scan_headers() {
  local src=$1 f hits

  # The selected header lines are staged in a temporary file so that the
  # per-file count is written synchronously; a >(wc -l) process
  # substitution is not waited for and could still be running when nb.txt
  # is read.
  hits=$(mktemp) || return 1

  # Start from an empty tally: nb.txt is appended to once per tar file and
  # must not carry counts over from an earlier run in the same directory.
  : > nb.txt

  for f in "$src"/extract_*.tar; do
    [[ -e "$f" ]] || continue   # glob matched nothing
    # GNU tar only treats member arguments as patterns with --wildcards.
    tar --wildcards -xOf "$f" '*.hdr' |
      grep -F -aA1 --no-group-separator 'X-HST-Truncated: length' > "$hits"
    wc -l < "$hits" >> nb.txt
    grep -F X-HST-Target-URI "$hits" | cut -f 2 -d ' '
  done > bu.txt

  rm -f -- "$hits"
}

# Process one segment.
# Arguments: $1 - Common Crawl identifier, $2 - segment name
# Returns:   non-zero if the working directory cannot be set up or the
#            header/URI tallies disagree
fetch_segment() {
  local cc=$1 s=$2
  local b="${WORK_ROOT}/${s}"
  local src="${CC_ROOT}/CC-MAIN-${cc}/${s}"
  local tcount ucount

  log starting "$s"
  mkdir -p "$b/segments" || return 1
  cd "$b" || return 1

  scan_headers "$src" || return 1

  # 'tot' is an external helper that is not defined in this file;
  # presumably it sums the numbers on its standard input -- TODO confirm.
  if ! tcount=$(tot < "$b/nb.txt"); then
    echo "tot failed on $b/nb.txt" 1>&2
    return 1
  fi
  ucount=$(wc -l < "$b/bu.txt")
  if [[ "$tcount" -ne $((2 * ucount)) ]]; then
    printf "length mismatch: tcount: %d != ucount: %d\n" \
      "$tcount" "$ucount" 1>&2
    return 1
  fi

  "$NUTCH" freegen bu.txt segments
  # segments/* is deliberately left unquoted: it expands to whatever
  # is present under segments/ at this point.
  "$NUTCH" fetch \
    -Dmapreduce.job.reduces=34 "-Dhadoop.log.dir=$b" \
    segments/* -threads 36 > log 2>&1
  log finished "$s"
  cp -a -- ./* "$src"
}

main() {
  local h hn list cc s rc=0

  log
  h=$(hostname)
  hn=${h##*n}
  list="bigpdf_${hn}.txt"

  if [[ ! -r "$list" ]]; then
    echo "cannot read $list" 1>&2
    rc=1
  else
    # The list is read on fd 3 so that commands run inside the loop cannot
    # consume it through their standard input.  It is opened once, before
    # any cd, so the relative name stays valid.
    {
      if ! read -r cc <&3 || [[ -z "$cc" ]]; then
        echo "no CC identifier on first line of $list" 1>&2
        rc=1
      else
        while read -r s <&3 || [[ -n "$s" ]]; do
          [[ -n "$s" ]] || continue
          if ! fetch_segment "$cc" "$s"; then
            rc=1
            break
          fi
        done
      fi
    } 3< "$list"
  fi

  # Log the real outcome: a bare $? here would be evaluated after the
  # command substitutions in the log line and would always be theirs.
  log "$rc"
  return "$rc"
}

main "$@"
exit $?