Mercurial > hg > cc > cirrus_home
annotate bin/doPlinks.sh @ 9:7a93e190c74d
logging tweaks, preparing for timeout on problem pdfs
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 25 Feb 2020 10:34:41 +0000 |
parents | 3b56c2c9d0ee |
children | b0d9fe66ce8a |
rev | line source |
---|---|
0 | 1 #!/usr/bin/bash |
6
0f494c76a887
refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
1
diff
changeset
|
2 hn=$1 |
0f494c76a887
refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
1
diff
changeset
|
3 jn=$2 |
0f494c76a887
refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
1
diff
changeset
|
4 tfn=$3 |
0f494c76a887
refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
1
diff
changeset
|
5 |
0f494c76a887
refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
1
diff
changeset
|
6 mkdir -p /dev/shm/x$hn/${tfn} |
0f494c76a887
refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
1
diff
changeset
|
7 cd /dev/shm/x$hn/${tfn} || { echo $(date) $hn cannot enter /dev/shm/x$hn/${tfn} for job $jn 1>&2 ; exit 1 ; } # abort here so the later 'rm *' can never run outside the scratch dir |
0f494c76a887
refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
1
diff
changeset
|
8 tar -xf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/${tfn}.tar '*.pdf' |
7 | 9 echo $(date) $hn $(pwd) untarred ${tfn}.tar for job $jn 1>&2 |
6
0f494c76a887
refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
1
diff
changeset
|
10 ls *.pdf | sort --field-separator=_ -k1,1 -k2n,2 | cat -n |\ |
0f494c76a887
refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
1
diff
changeset
|
11 plinks.py $tfn || { echo $(date) $hn aborted job $jn for $tfn \[remember to clean up\] ; rm -f /dev/shm/stopJob ; exit 1 ; } |
7 | 12 echo $(date) $hn tarring $(ls badpdfs_*|wc -l)/$(ls links_*_*|wc -l) results from job $jn for $tfn in $(pwd) 1>&2 |
6
0f494c76a887
refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
1
diff
changeset
|
13 tar -cf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/links/${tfn}.tar badpdfs_${tfn} links_${tfn}_* |
9
7a93e190c74d
logging tweaks, preparing for timeout on problem pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
8
diff
changeset
|
14 echo $(date) $(pwd) rm $(ls -lt badpdfs_*) 1>&2 |
6
0f494c76a887
refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
1
diff
changeset
|
15 rm -- * # '--' ends option parsing, so a file name starting with '-' is removed rather than read as a flag |
0f494c76a887
refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
1
diff
changeset
|
16 cd .. |
8
3b56c2c9d0ee
longer run, terser logging
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
7
diff
changeset
|
17 echo $(date) $(pwd) rmdir ${tfn} 1>&2 |
6
0f494c76a887
refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
1
diff
changeset
|
18 rmdir ${tfn} |
0f494c76a887
refactor to address tarred-up pdfs
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
1
diff
changeset
|
19 echo $(date) $hn finished job ${jn} for ${tfn} |