annotate bin/bigpdf.sh @ 61:b24755311af8

x
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 23 Apr 2020 17:26:55 +0100
parents ff4e85d8ec31
children 346298ac3ab9
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
#!/usr/bin/bash
# Fetch big pdfs per segment from bigpdf_?.txt
# First line thereof gives CC identifier
#
# Usage: bigpdf.sh HN
#   HN is the only command-line argument; it selects the input list
#   bigpdf_HN.txt, which is looked up relative to the current directory.
hn=$1

# Fail fast on a missing argument or unreadable list: otherwise the script
# would go on to read "bigpdf_.txt" and run with an empty CC identifier.
if [ -z "$hn" ]
then
  echo "usage: $0 HN   (reads bigpdf_HN.txt)" 1>&2
  exit 2
fi
if [ ! -r "bigpdf_${hn}.txt" ]
then
  echo "$0: cannot read bigpdf_${hn}.txt" 1>&2
  exit 2
fi

# Log the start time and which list is being processed.  $(date) is left
# unquoted on purpose so the log line keeps its original single-space form.
echo $(date) "$hn"
# tot: read lines from stdin, add up the first whitespace-separated field
# of each line, and print the total as an unsigned integer on one line.
# With empty input the accumulator is never set and 0 is printed.
tot ()
{
  awk '{ total += $1 } END { printf "%u\n", total }'
}
# Main driver: the first line of bigpdf_${hn}.txt supplies the CC
# identifier ($cc); every remaining line names one segment ($s) to process.
# Relies on $hn and the tot function defined earlier in this script.
head -1 "bigpdf_${hn}.txt" |\
{ read -r cc
  tail -n +2 "bigpdf_${hn}.txt" |\
  while read -r s
  do
    echo $(date) $(hostname) starting $s
    mkdir -p "/dev/shm/pcrawl/$s/segments"
    # Everything below uses relative paths (bu.txt, nb.txt, segments, cp *),
    # so a failed cd must stop the run instead of working in the wrong place.
    # This loop runs in a pipeline subshell: exit ends the loop and its
    # status becomes the pipeline status reported at the bottom.
    cd "/dev/shm/pcrawl/$s" || exit 1

    # bu.txt (one URI per line) is only built when it is not already there.
    if [ ! -f bu.txt ]
    then
      # Start the per-archive line counts afresh so leftovers from an
      # interrupted earlier run cannot skew the consistency check below.
      : > nb.txt
      hdrs=$(mktemp) || exit 1
      for f in /beegfs/common_crawl/CC-MAIN-"$cc"/"$s"/extract_*.tar
      do
        # Keep each 'X-HST-Truncated: length' line plus the line after it.
        tar -xOf "$f" '*.hdr' |\
          fgrep -aA1 --no-group-separator 'X-HST-Truncated: length' > "$hdrs"
        # Count synchronously.  The previous tee >(wc -l >>nb.txt) form ran
        # wc asynchronously, so nb.txt could be read below before the last
        # count had been appended.
        wc -l < "$hdrs" >> nb.txt
        fgrep X-HST-Target-URI < "$hdrs" | cut -f 2 -d ' '
      done > bu.txt
      rm -f "$hdrs"
    fi

    # Consistency check: every kept match is two lines, so the summed line
    # counts in nb.txt must be exactly twice the number of URIs in bu.txt.
    tcount=$(tot < nb.txt)
    ucount=$(wc -l < bu.txt)
    if [ "$tcount" -ne $((2 * ucount)) ]
    then
      printf "length mismatch: tcount: %d != ucount: %d\n" \
        "$tcount" "$ucount" 1>&2
      exit 1
    fi

    export NUTCH_HEAPSIZE=20000
    export NUTCH_LOG_DIR="/dev/shm/pcrawl/$s"
    "${HOME}/src/nutch-cc/runtime/local/bin/nutch" freegen bu.txt segments
    # Fetch quietly; only on success log completion and copy the results
    # back next to the source archives.
    "${HOME}/src/nutch-cc/runtime/local/bin/nutch" fetch \
      -Dmapreduce.job.reduces=34 \
      segments/* -threads 72 >/dev/null 2>&1 && \
      echo $(date) $(hostname) finished $s && \
      cp -a * "/beegfs/common_crawl/CC-MAIN-$cc/$s" #&& \
      #rm -rf cdx segments warc
  done
}
# Save the pipeline status before running any command substitution: in bash
# $(date) and $(hostname) overwrite $? before it would be expanded, so the
# old one-liner always reported hostname's status instead.
rc=$?
echo $(date) $(hostname) $rc