changeset 15:a9763cd18949

in progress...
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sun, 07 Aug 2022 13:56:00 +0100
parents 4e1ecfa46bee
children 04464ee31d66
files bin/_mt1.sh
diffstat 1 files changed, 47 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/_mt1.sh	Sun Aug 07 13:56:00 2022 +0100
@@ -0,0 +1,47 @@
+#!/bin/bash
+# This runs on the compute nodes...
+# Args, in the order read below: cc resdir mcol mts s1 sn
+#  NOTE(review): the earlier example "CC-MAIN-2019-35 mt1_counts mtypes 4 0 299"
+#  lists what look like mts and mcol the other way round -- confirm with the caller
+# count 'mime' or 'mime-detected' values (restricted by entries in file
+#  named by mts, found in resdir) in parallel, taking input from $srcdir/xxx.tsv
+N=$SLURM_JOB_NUM_NODES        # nodes in the job (log output only)
+n=$SLURM_NTASKS               # tasks in the job; also handed to share_by_task.sh
+c=$SLURM_CPUS_PER_TASK        # CPUs per task; basis for pjobs below
+nodename=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+node=$SLURM_NODEID
+task=$SLURM_PROCID            # this task's id; handed to share_by_task.sh and logged by doit
+# Number of concurrent doit jobs per task; never 0, because "parallel -j 0" means "as many as possible"
+threadsPerTask=2
+pjobs=$((c / threadsPerTask))
+(( pjobs > 0 )) || pjobs=1
+cc=$1
+resdir=$W/$USER/results/$cc/$2          # output directory, created below
+srcdir=$W/hst/results/$cc/cdx_counts    # doit reads $srcdir/NNN.tsv
+mcol=$3                                 # given to awk as mc by doit (presumably a column number -- confirm)
+mts=$4                                  # name of the list file inside resdir
+mtfile=$resdir/$mts
+s1=$5                                   # passed to share_by_task.sh as the -s value
+sn=$6                                   # passed to share_by_task.sh as its first operand
+
+echo $(date) task $n.$task on $nodename:$N.$node start $(pwd) 1>&2
+
+mkdir -p "$resdir"
+
+# doit NNN -- called by parallel with one zero-padded number; filters $srcdir/NNN.tsv into $resdir/${langs}_NNN.tsv
+doit () {
+ echo $(date) start $1 $task $PARALLEL_SEQ 1>&2
+ grep -F $'\tw\t' "$srcdir/$1.tsv" |   # keep only lines containing <tab>w<tab>
+   awk -v mc="$mcol" -v mtf="$mtfile" 'BEGIN {while ((getline v < mtf) > 0) {l[v]=1}}  # load the list; > 0 also ends the loop if mtf is unreadable
+        ($mc in l) {print $1,$2,$5}' |   # pass fields 1, 2, 5 of lines whose field mc is in the list
+    "$W/shared/bin/uniq_merge.py" > "$resdir/${langs}_$1.tsv"   # NOTE(review): langs is not set in this script -- confirm the intended prefix
+ echo $(date) end $1 $task $PARALLEL_SEQ 1>&2
+}
+export -f doit
+export srcdir resdir task langs langfile
+
+$W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs doit '{}'
+
+echo $(date) task $n.$task on $nodename:$N.$node end 1>&2
+