Mercurial > hg > cc > cirrus_home
annotate bin/bigpdf.sh @ 175:d123ef7fdb82
working on implementing types and parts:
1, 2, 4 working, 3 not
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 03 Jul 2023 18:16:14 +0100 |
parents | f1bf3effa893 |
children |
rev | line source |
---|---|
58
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/bash |
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 # Fetch big pdfs per segment from bigpdf_?.txt |
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 # First line thereof gives CC identifier |
62
346298ac3ab9
several efficiency (hoffentlich) tweaks
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
60
diff
changeset
|
4 cd /dev/shm |
60
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
5 hn=$1 |
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
6 echo $(date) $hn |
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
7 tot () |
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
8 { |
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
9 awk '{sum+=$1} END {printf "%u\n",sum}' |
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
10 } |
58
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 head -1 bigpdf_${hn}.txt |\ |
60
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
12 { read cc |
58
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 tail -n +2 bigpdf_${hn}.txt |\ |
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
14 while read s |
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
15 do |
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
16 echo $(date) $(hostname) starting $s |
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
17 mkdir -p /dev/shm/pcrawl/$s/segments |
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
18 cd /dev/shm/pcrawl/$s |
60
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
19 if [ ! -f bu.txt ] |
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
20 then |
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
21 for f in /beegfs/common_crawl/CC-MAIN-$cc/$s/extract_*.tar |
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
22 do |
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
23 tar -xOf $f '*.hdr' |\ |
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
24 fgrep -aA1 --no-group-separator 'X-HST-Truncated: length' |\ |
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
25 tee >(wc -l >>nb.txt) | fgrep X-HST-Target-URI | cut -f 2 -d ' ' |
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
26 done > bu.txt |
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
27 fi |
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
28 if [ "$(tot < nb.txt)" -ne\ |
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
29 $((2 * $(wc -l < bu.txt))) ] |
58
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
30 then |
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
31 printf "length mismatch: tcount: %d != ucount: %d\n" \ |
60
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
32 $(tot < nb.txt) $(wc -l < bu.txt) 1>&2 |
58
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
33 exit 1 |
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
34 fi |
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
35 export NUTCH_HEAPSIZE=20000 |
60
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
36 export NUTCH_LOG_DIR=/dev/shm/pcrawl/$s |
72 | 37 if [ ! -f segments/2020*/crawl_generate/part-r-00000 ] |
38 then | |
39 ${HOME}/src/nutch-cc/runtime/local/bin/nutch freegen -Dmapreduce.job.maps=1 bu.txt segments | |
40 fi | |
58
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
41 ${HOME}/src/nutch-cc/runtime/local/bin/nutch fetch\ |
60
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
42 -Dmapreduce.job.reduces=34 \ |
62
346298ac3ab9
several efficiency (hoffentlich) tweaks
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
60
diff
changeset
|
43 segments/* -threads 144 >/dev/null 2>&1 && \ |
60
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
44 echo $(date) $(hostname) finished $s && \ |
62
346298ac3ab9
several efficiency (hoffentlich) tweaks
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
60
diff
changeset
|
45 cp -a * /beegfs/common_crawl/CC-MAIN-$cc/$s && \ |
72 | 46 { for f in warc/warc/*.gz |
47 do | |
48 unpigz -dp 1 -c $f | egrep -aiB15 '^WARC-Truncated: ' | |
49 done > truncated.txt | |
50 cp -a truncated.txt /beegfs/common_crawl/CC-MAIN-$cc/$s/warc/warc | |
51 rm -rf cdx segments warc | |
52 } | |
58
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
53 done |
60
ff4e85d8ec31
switch for use on login server, invoke by hand with 0/1 as only cmd line arg
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
58
diff
changeset
|
54 } |
58
4f31d3234620
try nutch fetch for big pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
55 echo $(date) $(hostname) $? |