diff bin/_s2t.sh @ 2:b4801f5696b2

compute node workers, see cirrus_home/bin repo for login node masters
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 18 Jul 2022 19:22:42 +0100
parents
children f035d36cec45
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/_s2t.sh	Mon Jul 18 19:22:42 2022 +0100
@@ -0,0 +1,100 @@
+#!/bin/bash
+# Compute-node worker: run cdx2tsv.py in parallel, taking input directly
+# from cdx-00{000..299}.gz
+#
+# Usage: _s2t.sh CC SUBDIR FIRST LAST
+#   CC      crawl directory under /beegfs/common_crawl
+#   SUBDIR  results are written to $W/hst/results/CC/SUBDIR
+#   FIRST   first segment number considered (inclusive)
+#   LAST    last segment number considered (inclusive)
+#
+# Needs W and WSHARED in the environment plus the Slurm variables read
+# below.  Segment i is processed by the task whose index equals
+# i % SLURM_NTASKS, so the tasks of a job split FIRST..LAST between them.
+#
+# Exit status: 2 on a usage error, 1 if a directory cannot be created,
+# otherwise that of parallel (non-zero when any segment failed).
+
+if (( $# < 4 )); then
+  echo "usage: ${0##*/} CC SUBDIR FIRST LAST" >&2
+  exit 2
+fi
+
+# Fail with a clear message instead of a division by zero (or results
+# landing under /hst) when a required variable is missing.
+N=${SLURM_JOB_NUM_NODES:?must be set}
+n=${SLURM_NTASKS:?must be set}
+localId=${SLURM_LOCALID:?must be set}
+nodeId=${SLURM_NODEID:?must be set}
+: "${W:?must be set}" "${WSHARED:?must be set}"
+cpus=${SLURM_CPUS_PER_TASK:-0}
+nodename=${SLURMD_NODENAME:-}
+
+# Index of this task, computed from its node and its slot on that node
+tasksPerNode=$((n / N))
+task=$((localId + nodeId * tasksPerNode))
+
+# One parallel job per threadsPerTask CPUs.  Never let the count reach 0:
+# parallel reads -j 0 as "as many as possible".
+threadsPerTask=2
+pjobs=$((cpus / threadsPerTask))
+(( pjobs >= 1 )) || pjobs=1
+
+cc=$1
+resdir=$W/hst/results/$1/$2
+s1=$3
+sn=$4
+
+# log WORD...: print a timestamped progress line on stdout.
+# $(date) is deliberately unquoted so that its output is word-split and
+# re-joined with single spaces, keeping the log format unchanged.
+# shellcheck disable=SC2046
+log () { echo $(date) "$@"; }
+
+log task "$n.$task" on "$nodename:$N.$nodeId" start
+
+mkdir -p -- "$resdir" /dev/shm/hst || exit 1
+
+# doit SEG: convert segment SEG (three digits, zero padded) to SEG.tsv.
+# The output is built in /dev/shm/hst and moved to $resdir only after
+# both pipeline stages succeeded, so a failed run cannot leave a truncated
+# file among the results.  Returns non-zero on failure.
+doit () {
+  local seg=$1
+  local tmp=/dev/shm/hst/$seg.tsv
+  local -a st
+
+  log start "$seg" "$task" "$PARALLEL_SEQ"
+  uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz" |
+    python3 "$WSHARED/bin/cdx2tsv.py" \
+      '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' \
+      '(filename,f.split("/",maxsplit=5)[4][0])' \
+      '(url,("1" if (f.split(":",maxsplit=1)[0]=="https") else "0"))' \
+      languages > "$tmp"
+  st=("${PIPESTATUS[@]}")
+  if (( st[0] != 0 || st[1] != 0 )); then
+    echo "$(date) FAILED $seg $task $PARALLEL_SEQ" \
+      "(uz=${st[0]} cdx2tsv=${st[1]})" >&2
+    rm -f -- "$tmp"
+    return 1
+  fi
+
+  log moving "$seg" "$task" "$PARALLEL_SEQ"
+  mv -- "$tmp" "$resdir" || return
+  log end "$seg" "$task" "$PARALLEL_SEQ"
+}
+
+export -f doit log
+export cc resdir n task
+
+# Hand this task its share of the segments, zero padded to three digits
+seq "$s1" "$sn" |
+  while read -r i; do
+    if (( i % n == task )); then printf '%03d\n' "$i"; fi
+  done |
+  parallel -j "$pjobs" doit '{}'
+rc=$?
+
+log task "$n.$task" on "$nodename:$N.$nodeId" end
+exit "$rc"
+