annotate bin/bigpdf.sh @ 93:4d870a7ec871

support a command to receive each result, remove use of X-Crawler-Content-Length
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 15 Apr 2021 10:59:25 +0000
parents f1bf3effa893
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
58
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/bash
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 # Fetch big pdfs per segment from bigpdf_?.txt
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 # First line thereof gives CC identifier
62
346298ac3ab9 several efficiency (hofentlich) tweaks
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 60
diff changeset
4 cd /dev/shm # the relative bigpdf_${hn}.txt paths read later resolve against this directory
60
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
5 hn=$1 # first command-line argument; selects which bigpdf_<hn>.txt list is processed
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
6 echo $(date) $hn # log the start time and the selected list suffix
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
7 tot () # Sum the first field of every line on stdin and print the total on one line
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
8 {
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
9 awk '{sum+=$1} END {printf "%u\n",sum}' # %u formats the total as an unsigned integer; no input lines gives 0
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
10 }
58
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 head -1 "bigpdf_${hn}.txt" |\
60
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
12 { read cc # crawl identifier from the list's first line; used below as CC-MAIN-$cc
58
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
13 tail -n +2 "bigpdf_${hn}.txt" |\
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
14 while read s # every remaining line of the list names one segment to process
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
15 do
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
16 echo $(date) $(hostname) starting $s
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
17 mkdir -p "/dev/shm/pcrawl/$s/segments"
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
18 cd "/dev/shm/pcrawl/$s" || exit 1 # never let the cp -a * / rm -rf steps below run in the wrong directory
60
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
19 if [ ! -f bu.txt ] # build the URI list only if an earlier run has not already left one here
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
20 then
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
21 for f in "/beegfs/common_crawl/CC-MAIN-$cc/$s"/extract_*.tar
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
22 do
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
23 tar -xOf "$f" '*.hdr' |\
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
24 grep -F -aA1 --no-group-separator 'X-HST-Truncated: length' |\
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
25 tee >(wc -l >>nb.txt) | grep -F X-HST-Target-URI | cut -f 2 -d ' ' # NOTE(review): wc runs asynchronously, so nb.txt may lag behind this pipeline - confirm
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
26 done > bu.txt
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
27 fi
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
28 if [ "$(tot < nb.txt)" -ne \
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
29 $((2 * $(wc -l < bu.txt))) ] # grep -A1 emits two lines per match, so nb.txt must total twice the URI count
58
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
30 then
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
31 printf "length mismatch: tcount: %d != ucount: %d\n" \
60
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
32 $(tot < nb.txt) $(wc -l < bu.txt) 1>&2
58
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
33 exit 1
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
34 fi
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
35 export NUTCH_HEAPSIZE=20000
60
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
36 export NUTCH_LOG_DIR="/dev/shm/pcrawl/$s"
72
f1bf3effa893 log trucations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 65
diff changeset
37 if ! compgen -G 'segments/*/crawl_generate/part-r-00000' >/dev/null # true when no segment has been generated yet; safe with several matches and any year prefix
f1bf3effa893 log trucations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 65
diff changeset
38 then
f1bf3effa893 log trucations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 65
diff changeset
39 "${HOME}/src/nutch-cc/runtime/local/bin/nutch" freegen -Dmapreduce.job.maps=1 bu.txt segments
f1bf3effa893 log trucations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 65
diff changeset
40 fi
58
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
41 "${HOME}/src/nutch-cc/runtime/local/bin/nutch" fetch \
60
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
42 -Dmapreduce.job.reduces=34 \
62
346298ac3ab9 several efficiency (hofentlich) tweaks
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 60
diff changeset
43 segments/* -threads 144 >/dev/null 2>&1 && \
60
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
44 echo $(date) $(hostname) finished $s && \
62
346298ac3ab9 several efficiency (hofentlich) tweaks
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 60
diff changeset
45 cp -a * "/beegfs/common_crawl/CC-MAIN-$cc/$s" && \
72
f1bf3effa893 log trucations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 65
diff changeset
46 { for f in warc/warc/*.gz # reached only when fetch, the log line and the copy above all succeeded
f1bf3effa893 log trucations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 65
diff changeset
47 do
f1bf3effa893 log trucations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 65
diff changeset
48 unpigz -dp 1 -c "$f" | grep -E -aiB15 '^WARC-Truncated: ' # keep each WARC-Truncated header plus the 15 lines before it
f1bf3effa893 log trucations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 65
diff changeset
49 done > truncated.txt
f1bf3effa893 log trucations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 65
diff changeset
50 cp -a truncated.txt "/beegfs/common_crawl/CC-MAIN-$cc/$s/warc/warc"
f1bf3effa893 log trucations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 65
diff changeset
51 rm -rf cdx segments warc # drop the local working copies after the copy to /beegfs
f1bf3effa893 log trucations
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 65
diff changeset
52 }
58
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
53 done
60
ff4e85d8ec31 switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 58
diff changeset
54 }
58
4f31d3234620 try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
55 rc=$?; echo $(date) $(hostname) $rc # save the pipeline's exit status first: $(date) and $(hostname) would otherwise overwrite $? before it is expanded