cirrus_work: changeset 2:b4801f5696b2
compute node workers, see cirrus_home/bin repo for login node masters
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 18 Jul 2022 19:22:42 +0100 |
parents | d5b6748f29a9 |
children | 668579197bec |
files | bin/_ex1.sh bin/_nl1.sh bin/_runme.sh bin/_s2t.sh bin/_test.sh |
diffstat | 5 files changed, 204 insertions(+), 0 deletions(-) |
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/_ex1.sh   Mon Jul 18 19:22:42 2022 +0100
@@ -0,0 +1,45 @@
+#!/bin/bash
+# This runs on the compute nodes...
+# count top 21 solitary languages in parallel, taking input directly from /work/dc007/dc007/hst/results/$1/cdx_counts/xxx.tsv
+
+N=$SLURM_JOB_NUM_NODES
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+nodename=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+node=$SLURM_NODEID
+task=$SLURM_PROCID
+
+threadsPerTask=2
+pjobs=$((c / $threadsPerTask))
+
+cc=$1
+resdir=$W/$USER/results/$cc/$2
+srcdir=$W/hst/results/$cc/cdx_counts
+langs=$3
+s1=$4
+sn=$5
+
+echo $(date) task $n.$task on $nodename:$N.$node start
+
+mkdir -p $resdir
+
+doit () {
+    echo $(date) start $1 $task $PARALLEL_SEQ
+    fgrep ' w ' $srcdir/$1.tsv | awk 'BEGIN { | uniq -c | \
+      $W/shared/bin/uniq_merge.py > $resdir/${langs}_$1.tsv
+    echo $(date) end $1 $task $PARALLEL_SEQ
+}
+
+export -f doit
+export srcdir resdir task
+
+seq $s1 $sn | while read i
+  do if [ $((i % $n)) -eq $task ]
+     then printf '%03g\n' $i
+     fi
+  done | \
+  parallel -j $pjobs doit '{}'
+
+echo $(date) task $n.$task on $nodename:$N.$node end
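
All of these worker scripts share the same two-level fan-out: each SLURM task keeps only the shard indices congruent to its own rank modulo the total task count, zero-pads them to match the cdx shard numbering, and hands them to GNU parallel, which runs `$pjobs` of them concurrently within the task. A minimal standalone sketch of that idiom follows; the defaults for `n` and `task` are only there so it can be tried outside SLURM, and the `echo` stands in for the real per-shard work:

```bash
#!/bin/bash
# Sketch of the shard-distribution idiom used by the workers above.
n=${SLURM_NTASKS:-4}      # total tasks across all nodes
task=${SLURM_PROCID:-0}   # this task's global rank
seq 0 299 | while read i
  do if [ $((i % n)) -eq $task ]
     then printf '%03g\n' $i    # 7 -> 007, matching cdx-00007.gz style names
     fi
  done | \
  parallel -j 4 echo would process shard '{}'
```
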
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/_nl1.sh   Mon Jul 18 19:22:42 2022 +0100
@@ -0,0 +1,49 @@
+#!/bin/bash
+# This runs on the compute nodes...
+# Args: CC-MAIN-2019-35 nl1_counts langs 0 299
+# count languages (from file named by langs, found in resdir) in parallel, taking input directly from $cc/cdx_counts/xxx.tsv
+
+N=$SLURM_JOB_NUM_NODES
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+nodename=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+node=$SLURM_NODEID
+task=$SLURM_PROCID
+
+threadsPerTask=2
+pjobs=$((c / $threadsPerTask))
+
+cc=$1
+resdir=$W/$USER/results/$cc/$2
+srcdir=$W/hst/results/$cc/cdx_counts
+langs=$3
+langfile=$resdir/$langs
+s1=$4
+sn=$5
+
+echo $(date) task $n.$task on $nodename:$N.$node start $(pwd) 1>&2
+
+mkdir -p $resdir
+
+doit () {
+    echo $(date) start $1 $task $PARALLEL_SEQ 1>&2
+    fgrep ' w ' $srcdir/$1.tsv | \
+      awk 'BEGIN {while (getline < "'$langfile'") {l[$0]=1}}
+           {if (l[$4]) {print $1,$4}}' | uniq -c | \
+      $W/shared/bin/uniq_merge.py > $resdir/${langs}_$1.tsv
+    echo $(date) end $1 $task $PARALLEL_SEQ 1>&2
+}
+
+export -f doit
+export srcdir resdir task langs langfile
+
+seq $s1 $sn | while read i
+  do if [ $((i % $n)) -eq $task ]
+     then printf '%03g\n' $i
+     fi
+  done | \
+  parallel -j $pjobs doit '{}'
+
+echo $(date) task $n.$task on $nodename:$N.$node end 1>&2
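
The awk stage in _nl1.sh's doit loads the language list named by $langfile into an array in its BEGIN block, then keeps only rows whose fourth field is in that set, emitting (field 1, language) pairs for uniq -c to count. An illustrative run of just that filter on made-up input; the column layout here is an assumption for demonstration, not the real cdx_counts format:

```bash
#!/bin/bash
# Illustrative only: the awk set-membership filter from _nl1.sh on made-up data.
langfile=$(mktemp)
printf 'eng\nfra\n' > "$langfile"            # hypothetical language list
printf '000 w 3 eng\n000 w 1 zho\n001 w 2 fra\n' | \
  awk 'BEGIN {while (getline < "'$langfile'") {l[$0]=1}}
       {if (l[$4]) {print $1,$4}}' | uniq -c
# => counts for "000 eng" and "001 fra"; the zho row is dropped
rm -f "$langfile"
```
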
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/_runme.sh   Mon Jul 18 19:22:42 2022 +0100
@@ -0,0 +1,54 @@
+#!/bin/bash
+# This runs on the compute nodes...
+# Args: wd [-b CMDS] [-i input] CMDS
+
+N=$SLURM_JOB_NUM_NODES
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+nodename=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+node=$SLURM_NODEID
+task=$SLURM_PROCID
+
+cd "$1"
+shift
+
+threadsPerTask=2
+pjobs=$((c / $threadsPerTask))
+
+echo $(date) task $n.$task on $nodename:$N.$node start $(pwd) 1>&2
+
+PATH=$W/$USER/bin:$W/shared/bin:$PATH
+export task PATH n
+
+if [ "$1" = "-b" ]
+then
+  shift
+  eval "$1"
+  shift
+fi
+
+if [ "$1" = "-i" ]
+then
+  shift
+  input="$1"
+  shift
+fi
+
+export cmd="$1"
+shift
+
+doit () {
+    arg="$1"
+    echo $(date) start $task $PARALLEL_SEQ $arg
+    eval "$cmd"
+    echo $(date) end $task $PARALLEL_SEQ
+}
+
+export -f doit
+
+eval "$input" | \
+  parallel -j $pjobs doit '{}'
+
+echo $(date) task $n.$task on $nodename:$N.$node end 1>&2
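
_runme.sh is the generic wrapper: it cds to the working directory, evals the -b string once as a bootstrap, evals the -i string to generate one work item per line, and runs the final CMDS string once per item with the item available as $arg; because it exports n and task, the -i generator can do the per-task sharding itself. A hypothetical dispatch from a login-node master is sketched below; the script path, directory, shard range, and command are all made up for illustration:

```bash
# Hypothetical example, not from the repo: every value here is illustrative.
srun $W/$USER/bin/_runme.sh $W/$USER/results \
     -b 'mkdir -p out' \
     -i 'seq $task $n 299' \
     'echo shard $arg > out/$arg.txt'
# -i is single-quoted so $task and $n are expanded by _runme.sh on the compute
# node: seq FIRST INCREMENT LAST yields exactly the shards with i % n == task.
```
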
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/_s2t.sh   Mon Jul 18 19:22:42 2022 +0100
@@ -0,0 +1,38 @@
+#!/bin/bash
+# run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
+
+N=$SLURM_JOB_NUM_NODES
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+nodename=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+node=$SLURM_NODEID
+
+tPerN=$((n / N))
+
+task=$((local + (node * tPerN)))
+
+threadsPerTask=2
+pjobs=$((c / $threadsPerTask))
+
+
+cc=$1
+resdir=$W/hst/results/$1/$2
+s1=$3
+sn=$4
+
+echo $(date) task $n.$task on $nodename:$N.$node start
+
+mkdir -p $resdir
+mkdir -p /dev/shm/hst
+
+doit () { echo $(date) start $1 $task $PARALLEL_SEQ; uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$1.gz | python3 $WSHARED/bin/cdx2tsv.py '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' '(filename,f.split("/",maxsplit=5)[4][0])' '(url,("1" if (f.split(":",maxsplit=1)[0]=="https") else "0"))' languages > /dev/shm/hst/$1.tsv; echo $(date) moving $1 $task $PARALLEL_SEQ ; mv /dev/shm/hst/$1.tsv $resdir; echo $(date) end $1 $task $PARALLEL_SEQ ;}
+
+export -f doit
+export cc resdir n task
+
+seq $s1 $sn | while read i; do if [ $((i % $n)) -eq $task ]; then printf '%03g\n' $i; fi; done | \
+  parallel -j $pjobs doit '{}'
+
+echo $(date) task $n.$task on $nodename:$N.$node end
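
Unlike the other workers, _s2t.sh derives the global task rank from SLURM_LOCALID and SLURM_NODEID rather than reading SLURM_PROCID directly; with SLURM's default block distribution of tasks over nodes the two coincide. A small sanity check one could run inside a job step, under that same block-distribution assumption:

```bash
#!/bin/bash
# Compare the rank derived the way _s2t.sh does it with SLURM_PROCID;
# they match when tasks are laid out on nodes in the default block fashion.
tPerN=$((SLURM_NTASKS / SLURM_JOB_NUM_NODES))
derived=$((SLURM_LOCALID + (SLURM_NODEID * tPerN)))
echo "derived rank $derived, SLURM_PROCID $SLURM_PROCID"
```
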
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/_test.sh   Mon Jul 18 19:22:42 2022 +0100
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Usage: dotest
+
+N=$SLURM_JOB_NUM_NODES
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+nodename=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+node=$SLURM_NODEID
+
+tPerN=$((n / N))
+
+task=$((local + (node * tPerN)))
+
+
+echo $(date) executing test on node $N:$nodename:$node task $n:$task:$SLURM_PROCID local:$local
+