# HG changeset patch
# User Henry S. Thompson
# Date 1582732040 0
# Node ID 9a1de2c4ffe3c54a80faf32c63a26d3aac7f90b3
# Parent 8e3719800479bed9b3cc332b8a2ba025989b1e3f
modelled on plinks

diff -r 8e3719800479 -r 9a1de2c4ffe3 bin/doHdr.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/doHdr.sh Wed Feb 26 15:47:20 2020 +0000
@@ -0,0 +1,41 @@
+#!/usr/bin/bash
+# Usage: doHdr.sh hn jn tfn
+#   hn   host number: names the scratch directory /dev/shm/x<hn> and tags log lines
+#   jn   job number, used only in log lines
+#   tfn  name of the tar file to process, without its .tar suffix
+# Unpacks the *.hdr members of <tfn>.tar into a scratch directory, appends one
+# row per header file to hdrs_<tfn>.tsv (sequence number, X-HST-Target-URI
+# value, Last-Modified value, tab-separated), moves that table to the shared
+# hdrs directory and removes the scratch directory.
+hn=$1
+jn=$2
+tfn=$3
+
+mkdir -p /dev/shm/x$hn/${tfn}
+# Give up if the scratch directory cannot be entered: the 'rm *' below must
+# never run in whatever directory this script was started from.
+cd /dev/shm/x$hn/${tfn} || exit 1
+tar -xf /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/${tfn}.tar '*.hdr'
+echo $(date) $hn $(pwd) untarred ${tfn}.tar for job $jn 1>&2
+# Sort on the text before the first '_', then numerically on the next
+# '_'-separated field; cat -n prefixes each name with its sequence number.
+ls *.hdr | sort --field-separator=_ -k1,1 -k2n,2 | cat -n |\
+while read -r n f; do
+  # paste joins multiple matching lines with single spaces, as the unquoted
+  # command substitutions did before, but without glob-expanding the values.
+  uri=$(grep -a '^X-HST-Target-URI: ' "$f" | cut -f 2 -d ' ' | paste -s -d ' ' -)
+  # NOTE(review): field 2 of an HTTP date such as "Wed, 26 Feb 2020 ..." is only
+  # the weekday; 'cut -f 2-' may have been intended -- confirm with consumers.
+  lm=$(grep -ia '^Last-Modified: ' "$f" | cut -f 2 -d ' ' | paste -s -d ' ' -)
+  # printf, not echo: by default bash's builtin echo prints "\t" literally,
+  # which left backslash-t pairs instead of tabs in the .tsv.
+  printf '%s\t%s\t%s\n' "$n" "$uri" "$lm"
+done >> hdrs_$tfn.tsv
+echo $(date) $hn moving $(ls hdrs_*.tsv|wc -l) results from job $jn for $tfn in $(pwd) 1>&2
+mv hdrs_$tfn.tsv /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/hdrs
+echo $(date) $(pwd) rm $(ls -lt hdrs_*.tsv) 1>&2
+rm *
+cd ..
+echo $(date) $(pwd) rmdir ${tfn} 1>&2
+rmdir ${tfn}
+echo $(date) $hn finished job ${jn} for ${tfn}
diff -r 8e3719800479 -r 9a1de2c4ffe3 bin/hdr.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/hdr.sh Wed Feb 26 15:47:20 2020 +0000
@@ -0,0 +1,17 @@
+#!/usr/bin/bash
+# Usage: hdr.sh first0 last0 first1 last1
+# Runs bin/doHdr.sh, up to 48 at a time, over a range of three-digit tar file
+# numbers: first0..last0 when the host number is 0, first1..last1 otherwise.
+echo $(date) $(hostname)
+h=$(hostname)
+# Host number: whatever follows the last 'n' in the host name.
+hn=${h##*n}
+if [ "$hn" -eq 0 ]
+then seq --format="%03.0f" $1 $2
+else seq --format="%03.0f" $3 $4
+fi |\
+parallel --will-cite -j 48 -N 1 bin/doHdr.sh ${hn} '{#}' '{}'
+# Save the pipeline's status at once: the command substitutions in the echo
+# below would otherwise overwrite $? before it is expanded.
+status=$?
+echo $(date) $(hostname) $status
diff -r 8e3719800479 -r 9a1de2c4ffe3 bin/hdrMaster.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/hdrMaster.sh Wed Feb 26 15:47:20 2020 +0000
@@ -0,0 +1,9 @@
+#!/bin/bash
+# This runs on 1 machine to launch the real job on two machines
+# Usage: hdrMaster.sh s1 e1 s2 e2
+# All arguments are passed unchanged to bin/hdr.sh on each worker host.
+# NOTE(review): the worker hosts r1i5n0 and r1i5n1 are hard-coded -- confirm
+# they match the nodes the batch system actually allocates.
+echo $(date) Launching hdr workers for "$@"
+parallel --will-cite --nonall -S r1i5n0 -S r1i5n1 bin/hdr.sh "$@"
+echo $(date) workers done
diff -r 8e3719800479 -r 9a1de2c4ffe3 hdrJob.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hdrJob.sh Wed Feb 26 15:47:20 2020 +0000
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Usage: qsub -v t1=1stTar,tn=numTars hdrJob.sh
+#   t1  number of the first tar file to process
+#   tn  how many consecutive tar files to process
+# Splits the tn tar numbers starting at t1 into two ranges, s1..e1 (the first
+# tn/2, rounded down) and s2..e2 (the remainder), and passes both ranges to
+# bin/hdrMaster.sh.
+#PBS -l select=2:ncpus=36
+#PBS -l place=exclhost
+#PBS -l walltime=08:00:00
+#PBS -V
+#PBS -A dc007
+#PBS -N plinks
+
+#module load mpt
+
+# Fail early when t1 or tn is missing: unset, they would silently evaluate to
+# 0 in the arithmetic below.
+: "${t1:?t1 must be set, see Usage}" "${tn:?tn must be set, see Usage}"
+# bin/hdrMaster.sh is a relative path, so a failed cd must stop the job.
+cd "${PBS_O_WORKDIR:?PBS_O_WORKDIR must be set}" || exit 1
+((n1=tn/2))
+((s1=t1))
+((s2=t1+n1))
+((e1=s2-1))
+((e2=s1+tn-1))
+echo $(seq --format="%03.0f" $s1 $e1),$(seq --format="%03.0f" $s2 $e2)
+
+bin/hdrMaster.sh $s1 $e1 $s2 $e2
+