Mercurial > hg > cc > cirrus_home
comparison bin/bigpdf.sh @ 58:4f31d3234620
try nutch fetch for big pdfs
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 22 Apr 2020 18:42:23 +0100 |
parents | |
children | ff4e85d8ec31 |
comparison
equal
deleted
inserted
replaced
57:849ccd30258d | 58:4f31d3234620 |
---|---|
#!/usr/bin/bash
# Fetch big pdfs per segment, using nutch.
#
# Input: bigpdf_<N>.txt in the current directory, where <N> is whatever
# follows the last 'n' in this machine's hostname.
#   line 1     : CC identifier (used as /beegfs/common_crawl/CC-MAIN-<id>)
#   lines 2..  : one segment name per line
#
# For each segment the script
#   1. scans the '*.hdr' members of the segment's extract_*.tar files for
#      'X-HST-Truncated: length' entries and collects the X-HST-Target-URI
#      value from the line that follows each one into bu.txt,
#   2. checks that the number of header lines found is exactly twice the
#      number of URIs collected,
#   3. runs nutch freegen + fetch on bu.txt under /dev/shm/pcrawl/<segment>,
#   4. copies everything produced back to the segment's beegfs directory.
#
# Processing stops at the first segment that fails.  The final log line
# carries the resulting status; the script's own exit status is that of
# the final log line, as before.
#
# External requirements: GNU tar/grep, nutch under $HOME/src/nutch-cc, and
# a command named 'tot' on PATH.
# NOTE(review): 'tot' is not defined in this file; from the way its output
# is compared it presumably prints the sum of the numbers on stdin - confirm.

nutch=${HOME}/src/nutch-cc/runtime/local/bin/nutch
export NUTCH_HEAPSIZE=20000

h=$(hostname)
hn=${h##*n}                 # strip everything up to and including the last 'n'
list=bigpdf_${hn}.txt

# Timestamped, host-tagged progress line on stdout.
log() {
  echo "$(date) $h $*"
}

# Process one segment.  $1 = CC identifier, $2 = segment name.
# Returns non-zero if the work directory cannot be set up, if the
# header/URI counts disagree, or if the final copy fails.
process_segment() {
  local cc=$1 s=$2
  local work=/dev/shm/pcrawl/$s
  local store=/beegfs/common_crawl/CC-MAIN-$cc/$s
  local f hdrs tcount ucount

  mkdir -p "$work/segments" || return
  cd "$work" || return

  # nb.txt gets one line count per tar file; start empty so that a re-run
  # over an existing work directory does not inflate the total.
  : > nb.txt
  # Dot-file, so the final 'cp -a *' would not pick it up even if left over.
  hdrs=$(mktemp "$work/.hdrs.XXXXXX") || return

  for f in "$store"/extract_*.tar; do
    # Each hit contributes two lines: the Truncated header and the line
    # after it.  Going via a file (rather than tee to a process
    # substitution) guarantees the count is in nb.txt before it is read.
    tar -xOf "$f" '*.hdr' |
      grep -F -a -A1 --no-group-separator 'X-HST-Truncated: length' > "$hdrs"
    wc -l < "$hdrs" >> nb.txt
    grep -F X-HST-Target-URI "$hdrs" | cut -f 2 -d ' '
  done > bu.txt
  rm -f -- "$hdrs"

  tcount=$(tot < nb.txt)
  ucount=$(wc -l < bu.txt)
  if [ "$tcount" -ne $((2 * ucount)) ]; then
    printf "length mismatch: tcount: %d != ucount: %d\n" \
      "$tcount" "$ucount" 1>&2
    return 1
  fi

  "$nutch" freegen bu.txt segments
  "$nutch" fetch \
    -Dmapreduce.job.reduces=34 -Dhadoop.log.dir="$work" \
    segments/* -threads 36 > log 2>&1
  log finished "$s"
  cp -a -- * "$store"
}

log
status=0
if [ ! -r "$list" ]; then
  printf '%s: cannot read %s\n' "${0##*/}" "$list" 1>&2
  status=1
else
  # The list is read on fd 3 so that commands run inside the loop cannot
  # consume it through their inherited stdin.
  {
    read -r cc <&3
    if [ -z "$cc" ]; then
      printf '%s: no CC identifier on first line of %s\n' \
        "${0##*/}" "$list" 1>&2
      status=1
    else
      while read -r s <&3; do
        [ -n "$s" ] || continue
        log starting "$s"
        process_segment "$cc" "$s"
        status=$?
        [ "$status" -eq 0 ] || break
      done
    fi
  } 3< "$list"
fi
log "$status"