changeset 2:b4801f5696b2

compute-node workers; see the cirrus_home/bin repo for the login-node masters
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 18 Jul 2022 19:22:42 +0100
parents d5b6748f29a9
children 668579197bec
files bin/_ex1.sh bin/_nl1.sh bin/_runme.sh bin/_s2t.sh bin/_test.sh
diffstat 5 files changed, 204 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/_ex1.sh	Mon Jul 18 19:22:42 2022 +0100
@@ -0,0 +1,45 @@
+#!/bin/bash
+# This runs on the compute nodes...
+# count top 21 solitary languages in parallel, taking input directly from /work/dc007/dc007/hst/results/$1/cdx_counts/xxx.tsv
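+# Args (hypothetical example, by analogy with _nl1.sh): CC-MAIN-2019-35 ex1_counts top21 0 299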
+
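+# Slurm geometry: N nodes, n tasks in all, c CPUs per task; task is this
+# process's global rank (0..n-1)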
+N=$SLURM_JOB_NUM_NODES
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+nodename=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+node=$SLURM_NODEID
+task=$SLURM_PROCID
+
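+# budget the task's c CPUs at threadsPerTask CPUs per pipeline, so GNU
+# parallel runs pjobs pipelines at a time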
+threadsPerTask=2
+pjobs=$((c / $threadsPerTask))
+
+cc=$1
+resdir=$W/$USER/results/$cc/$2
+srcdir=$W/hst/results/$cc/cdx_counts
+langs=$3
+s1=$4
+sn=$5
+
+echo $(date) task $n.$task on $nodename:$N.$node start
+
+mkdir -p $resdir
+
+doit () {
+ echo $(date) start $1 $task $PARALLEL_SEQ
+ # (reconstructed on the model of _nl1.sh; assumes $langs names a
+ #  language-list file in $resdir)
+ fgrep '	w	' $srcdir/$1.tsv | \
+   awk 'BEGIN {while (getline < "'$resdir/$langs'") {l[$0]=1}}
+        {if (l[$4]) {print $1,$4}}' | uniq -c | \
+    $W/shared/bin/uniq_merge.py > $resdir/${langs}_$1.tsv
+ echo $(date) end $1 $task $PARALLEL_SEQ 
+}
+
+export -f doit
+export srcdir resdir task langs
+
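+# round-robin sharding: this task takes every index i in [s1,sn] with
+# i mod n == task, zero-padded to match the cdx shard file names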
+seq $s1 $sn | while read i
+ do if [ $((i % $n)) -eq $task ]
+ then printf '%03g\n' $i
+ fi
+ done | \
+   parallel -j $pjobs doit '{}'
+
+echo $(date) task $n.$task on $nodename:$N.$node end
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/_nl1.sh	Mon Jul 18 19:22:42 2022 +0100
@@ -0,0 +1,49 @@
+#!/bin/bash
+# This runs on the compute nodes...
+# Args: CC-MAIN-2019-35 nl1_counts langs 0 299
+# count languages (from file named by langs, found in resdir) in parallel, taking input directly from $cc/cdx_counts/xxx.tsv
+
+N=$SLURM_JOB_NUM_NODES
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+nodename=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+node=$SLURM_NODEID
+task=$SLURM_PROCID
+
+threadsPerTask=2
+pjobs=$((c / $threadsPerTask))
+
+cc=$1
+resdir=$W/$USER/results/$cc/$2
+srcdir=$W/hst/results/$cc/cdx_counts
+langs=$3
+langfile=$resdir/$langs
+s1=$4
+sn=$5
+
+echo $(date) task $n.$task on $nodename:$N.$node start $(pwd) 1>&2
+
+mkdir -p $resdir
+
+doit () {
+ echo $(date) start $1 $task $PARALLEL_SEQ 1>&2
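+ # load the wanted languages into l[], then emit field 1 and the language
+ # (field 4) for each matching row; uniq -c plus uniq_merge.py do the counting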
+ fgrep '	w	' $srcdir/$1.tsv | \
+   awk 'BEGIN {while (getline < "'$langfile'") {l[$0]=1}}
+        {if (l[$4]) {print $1,$4}}' | uniq -c | \
+    $W/shared/bin/uniq_merge.py > $resdir/${langs}_$1.tsv
+ echo $(date) end $1 $task $PARALLEL_SEQ 1>&2
+}
+
+export -f doit
+export srcdir resdir task langs langfile
+
+seq $s1 $sn | while read i
+ do if [ $((i % $n)) -eq $task ]
+ then printf '%03g\n' $i
+ fi
+ done | \
+   parallel -j $pjobs doit '{}'
+
+echo $(date) task $n.$task on $nodename:$N.$node end 1>&2
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/_runme.sh	Mon Jul 18 19:22:42 2022 +0100
@@ -0,0 +1,54 @@
+#!/bin/bash
+# This runs on the compute nodes...
+# Args: wd [-b CMDS] [-i input] CMDS
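+# e.g. (hypothetical): _runme.sh $W/hst/wd -i 'seq 0 299' 'myprog $arg'
+# -b CMDS is eval'ed once for setup; -i input generates one argument per
+# line; CMDS runs once per argument with it bound to $arg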
+
+N=$SLURM_JOB_NUM_NODES
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+nodename=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+node=$SLURM_NODEID
+task=$SLURM_PROCID
+
+cd "$1"
+shift
+
+threadsPerTask=2
+pjobs=$((c / $threadsPerTask))
+
+echo $(date) task $n.$task on $nodename:$N.$node start $(pwd) 1>&2
+
+PATH=$W/$USER/bin:$W/shared/bin:$PATH
+export task PATH n
+
+if [ "$1" = "-b" ]
+then
+  shift
+  eval "$1"
+  shift
+fi
+
+if [ "$1" = "-i" ]
+then
+ shift
+ input="$1"
+ shift
+fi
+
+export cmd="$1"
+shift
+
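+# each line from the input generator arrives as $1; bind it to $arg so the
+# eval'ed $cmd can refer to it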
+doit () {
+ arg="$1"
+ echo $(date) start $task $PARALLEL_SEQ $arg
+ eval "$cmd"
+ echo $(date) end $task $PARALLEL_SEQ
+}
+
+export -f doit
+
+eval "$input" | \
+   parallel -j $pjobs doit '{}'
+
+echo $(date) task $n.$task on $nodename:$N.$node end 1>&2
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/_s2t.sh	Mon Jul 18 19:22:42 2022 +0100
@@ -0,0 +1,38 @@
+#!/bin/bash
+# run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
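+# Args (hypothetical example, by analogy with _nl1.sh): CC-MAIN-2019-35 cdx_counts 0 299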
+
+N=$SLURM_JOB_NUM_NODES
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+nodename=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+node=$SLURM_NODEID
+
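+# global task rank from local rank and node number (normally equivalent to
+# the SLURM_PROCID that the other scripts read directly)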
+tPerN=$((n / N))
+
+task=$((local + (node * tPerN)))
+
+threadsPerTask=2
+pjobs=$((c / $threadsPerTask))
+
+
+cc=$1
+resdir=$W/hst/results/$1/$2
+s1=$3
+sn=$4
+
+echo $(date) task $n.$task on $nodename:$N.$node start
+
+mkdir -p $resdir
+mkdir -p /dev/shm/hst
+
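+# stage each shard's output in /dev/shm (node-local RAM) and mv the finished
+# file to $resdir, presumably to keep partial output off the shared filesystem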
+doit () {
+ echo $(date) start $1 $task $PARALLEL_SEQ
+ uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$1.gz | \
+   python3 $WSHARED/bin/cdx2tsv.py \
+     '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' \
+     '(filename,f.split("/",maxsplit=5)[4][0])' \
+     '(url,("1" if (f.split(":",maxsplit=1)[0]=="https") else "0"))' \
+     languages > /dev/shm/hst/$1.tsv
+ echo $(date) moving $1 $task $PARALLEL_SEQ
+ mv /dev/shm/hst/$1.tsv $resdir
+ echo $(date) end $1 $task $PARALLEL_SEQ
+}
+
+export -f doit
+export cc resdir n task
+
+seq $s1 $sn | while read i
+ do if [ $((i % $n)) -eq $task ]
+ then printf '%03g\n' $i
+ fi
+ done | \
+   parallel -j $pjobs doit '{}'
+
+echo $(date) task $n.$task on $nodename:$N.$node end
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/_test.sh	Mon Jul 18 19:22:42 2022 +0100
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Usage: dotest
+
+N=$SLURM_JOB_NUM_NODES
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+nodename=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+node=$SLURM_NODEID
+
+tPerN=$((n / N))
+
+task=$((local + (node * tPerN)))
+
+
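+# sanity check: the computed task rank should match the SLURM_PROCID
+# printed alongside it below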
+echo $(date) executing test on node $N:$nodename:$node task $n:$task:$SLURM_PROCID local:$local
+
+