changeset 14:9a1de2c4ffe3

modelled on plinks
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 26 Feb 2020 15:47:20 +0000
parents 8e3719800479
children a96fb2c26c80
files bin/doHdr.sh bin/hdr.sh bin/hdrMaster.sh hdrJob.sh
diffstat 4 files changed, 55 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/doHdr.sh	Wed Feb 26 15:47:20 2020 +0000
@@ -0,0 +1,52 @@
+#!/usr/bin/bash
+# Build a TSV of selected header values for the files in one tar archive.
+#
+# Usage: doHdr.sh HN JN TFN
+#   HN   host number; the scratch area is /dev/shm/xHN
+#   JN   job number, used only in log messages
+#   TFN  name of the tar file, without its .tar suffix
+#
+# Unpacks the *.hdr members of TFN.tar into /dev/shm/xHN/TFN, appends one row
+# per .hdr file (sequence number, X-HST-Target-URI value, Last-Modified value)
+# to hdrs_TFN.tsv, moves that file to the shared hdrs directory and removes
+# the scratch directory.  Progress goes to stderr, the final "finished" line
+# to stdout.
+hn=$1
+jn=$2
+tfn=$3
+
+pdfs=/beegfs/common_crawl/CC-MAIN-2019-35/pdfs
+work=/dev/shm/x$hn/${tfn}
+
+# Everything below, 'rm *' included, assumes the current directory is the
+# scratch directory, so give up if it cannot be created and entered.
+if ! mkdir -p "$work" || ! cd "$work"; then
+  echo $(date) $hn cannot use $work for job $jn 1>&2
+  exit 1
+fi
+# GNU tar only treats member arguments as patterns on extraction when
+# --wildcards is given.
+tar -xf "$pdfs/${tfn}.tar" --wildcards '*.hdr'
+echo $(date) $hn $(pwd) untarred ${tfn}.tar for job $jn 1>&2
+# Order by the text before the first _, then numerically by the second
+# _-separated field; cat -n supplies the sequence number read into n.
+ls *.hdr | sort --field-separator=_ -k1,1 -k2n,2 | cat -n |\
+while read -r n f; do
+  # paste joins multiple matching lines, so a row never spans several lines
+  uri=$(grep -a '^X-HST-Target-URI: ' "$f" | cut -f 2 -d ' ' | paste -s -d ' ' -)
+  # -f 2- keeps the whole value: a Last-Modified value such as
+  # "Wed, 26 Feb 2020 15:47:20 GMT" contains spaces, so -f 2 alone would
+  # keep only its first word.
+  lm=$(grep -ia '^Last-Modified: ' "$f" | cut -f 2- -d ' ' | paste -s -d ' ' -)
+  # printf rather than echo: bash's builtin echo prints \t as backslash-t,
+  # not as a tab, unless given -e.
+  printf '%s\t%s\t%s\n' "$n" "$uri" "$lm"
+done >> hdrs_$tfn.tsv
+echo $(date) $hn moving $(ls hdrs_*.tsv|wc -l) results from job $jn for $tfn in $(pwd) 1>&2
+mv hdrs_$tfn.tsv "$pdfs/hdrs"
+# Logs any hdrs_*.tsv still present (i.e. not moved) before it is deleted
+echo $(date) $(pwd) rm $(ls -lt hdrs_*.tsv) 1>&2
+rm *
+cd ..
+echo $(date) $(pwd) rmdir ${tfn} 1>&2
+rmdir ${tfn}
+echo $(date) $hn finished job ${jn} for ${tfn}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/hdr.sh	Wed Feb 26 15:47:20 2020 +0000
@@ -0,0 +1,20 @@
+#!/usr/bin/bash
+# Run bin/doHdr.sh, up to 48 at a time, over this host's share of the tar files.
+#
+# Usage: hdr.sh S1 E1 S2 E2
+#   The host numbered 0 handles tar files S1..E1, any other host S2..E2.
+#   Each number is passed to doHdr.sh zero-padded to three digits, together
+#   with the host number and parallel's job sequence number.
+echo $(date) $(hostname)
+h=$(hostname)
+# Host number = whatever follows the last 'n' in the host name (r1i5n0 -> 0)
+hn=${h##*n}
+if [ "$hn" -eq 0 ]
+then seq --format="%03.0f" $1 $2
+else seq --format="%03.0f" $3 $4
+fi |\
+parallel --will-cite -j 48 -N 1 bin/doHdr.sh ${hn} '{#}' '{}'
+# Save parallel's exit status first: expanding $? inside the echo would come
+# after its $(date) and $(hostname) substitutions have already replaced it.
+rc=$?
+echo $(date) $(hostname) $rc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/hdrMaster.sh	Wed Feb 26 15:47:20 2020 +0000
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Launcher: run this on one machine and it starts bin/hdr.sh on each of the
+# two worker hosts, passing all of its own arguments through to them.
+
+# Print the current date followed by the given words
+stamp() { echo $(date) "$@"; }
+
+# One -S option per worker host
+logins=()
+for host in r1i5n0 r1i5n1; do
+  logins+=(-S "$host")
+done
+
+stamp Launching hdr workers for "$@"
+parallel --will-cite --nonall "${logins[@]}" bin/hdr.sh "$@"
+stamp workers done
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hdrJob.sh	Wed Feb 26 15:47:20 2020 +0000
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Usage: qsub -v t1=1stTar,tn=numTars hdrJob.sh
+#PBS -l select=2:ncpus=36
+#PBS -l place=exclhost
+#PBS -l walltime=08:00:00
+#PBS -V
+#PBS -A dc007
+#PBS -N plinks
+
+#module load mpt
+
+# NOTE(review): the job name set by -N above is still "plinks", presumably
+# carried over from the plinks job this script was modelled on; confirm
+# whether it should be changed.
+
+# t1 (number of the first tar file) and tn (how many tar files) come from
+# qsub -v.  Unset, the arithmetic below would silently treat them as 0.
+: "${t1:?t1 must be set: qsub -v t1=1stTar,tn=numTars hdrJob.sh}"
+: "${tn:?tn must be set: qsub -v t1=1stTar,tn=numTars hdrJob.sh}"
+
+# bin/hdrMaster.sh is a relative path, so stop if the cd fails
+cd "${PBS_O_WORKDIR}" || exit 1
+# 10# forces decimal, so zero-padded values such as 008 or 010 are accepted
+# instead of being rejected or read as octal.
+first=$((10#$t1))
+count=$((10#$tn))
+# Two consecutive ranges, one per worker host: s1..e1 holds count/2 (rounded
+# down) tar files, s2..e2 the rest.
+((n1=count/2))
+((s1=first))
+((s2=first+n1))
+((e1=s2-1))
+((e2=s1+count-1))
+echo $(seq --format="%03.0f" $s1 $e1),$(seq --format="%03.0f" $s2 $e2)
+
+bin/hdrMaster.sh $s1 $e1 $s2 $e2
+