Mercurial > hg > cc > cirrus_work
changeset 15:a9763cd18949
in progress...
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sun, 07 Aug 2022 13:56:00 +0100 |
parents | 4e1ecfa46bee |
children | 04464ee31d66 |
files | bin/_mt1.sh |
diffstat | 1 files changed, 47 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/bin/bash
# bin/_mt1.sh -- this runs on the compute nodes...
#
# Count 'mime' or 'mime-detected' values (restricted to the entries in a
# media-type file found in the results directory) in parallel.
#
# Usage: _mt1.sh CC RESNAME MCOL MTYPES FIRST LAST
#   CC          crawl name, e.g. CC-MAIN-2019-35
#   RESNAME     results subdirectory, e.g. mt1_counts; output is written to
#               $W/$USER/results/CC/RESNAME
#   MCOL        1-based input column whose value is looked up in MTYPES, e.g. 4
#   MTYPES      name of a file inside the results directory, one wanted value
#               per line, e.g. mtypes
#   FIRST LAST  passed to share_by_task.sh as "-s FIRST LAST", e.g. 0 299
#               (presumably the first and last input shard numbers -- confirm
#               against share_by_task.sh)
#
# MCOL and MTYPES may be given in either order: the original header example
# read "mtypes 4" while the original code read column-then-file, so whichever
# of the two is a positive integer is taken as the column.
#
# Environment: W (root of the work tree), USER, and the SLURM_* task variables.
#
# NOTE(review): input is read from $W/hst/results/CC/cdx_counts/NNN.tsv; the
# original header comment said cdx_mime -- confirm which directory is intended.

#######################################
# Process one input shard; invoked by GNU parallel in a fresh shell, so every
# variable it reads must be exported by main.
# Globals:   srcdir resdir mcol mts mtfile task W PARALLEL_SEQ (all read)
# Arguments: $1 - shard name; reads $srcdir/$1.tsv
# Outputs:   writes $resdir/${mts}_$1.tsv; progress messages on stderr
# Returns:   exit status of the filter pipeline (i.e. of uniq_merge.py)
#######################################
doit () {
  local shard=$1
  local rc

  echo "$(date) start $shard $task ${PARALLEL_SEQ-}" 1>&2
  # NOTE(review): the ' w ' pattern is reproduced as it appears in the
  # changeset view; if the original used tabs they were lost -- confirm.
  grep -F -- ' w ' "$srcdir/$shard.tsv" |
    awk -v mc="$mcol" -v mtfile="$mtfile" '
      BEGIN {
        # "> 0" matters: getline returns -1 on an unreadable file, which
        # would otherwise make this loop spin forever.
        while ((getline entry < mtfile) > 0) wanted[entry] = 1
        close(mtfile)
      }
      (($mc) in wanted) { print $1, $2, $5 }' |
    "$W/shared/bin/uniq_merge.py" > "$resdir/${mts}_$shard.tsv"
  rc=$?
  echo "$(date) end $shard $task ${PARALLEL_SEQ-}" 1>&2
  return "$rc"
}

#######################################
# Validate the arguments, set up the shared (exported) state and fan this
# task's share of the inputs out over GNU parallel.
# Arguments: CC RESNAME MCOL MTYPES FIRST LAST (see the file header)
# Returns:   2 on a usage error, 1 on a setup error, otherwise the exit
#            status of the share_by_task.sh | parallel pipeline
#######################################
main () {
  local threads_per_task=2
  local num_nodes=${SLURM_JOB_NUM_NODES-}
  local num_tasks=${SLURM_NTASKS-}
  local cpus=${SLURM_CPUS_PER_TASK:-1}
  local nodename=${SLURMD_NODENAME-}
  local node=${SLURM_NODEID-}
  local cc first last pjobs status

  if (( $# < 6 )); then
    echo 'usage: _mt1.sh CC RESNAME MCOL MTYPES FIRST LAST' 1>&2
    return 2
  fi
  if [[ -z ${W-} ]]; then
    echo '_mt1.sh: W must be set to the root of the work tree' 1>&2
    return 2
  fi

  # Deliberately global: these are exported below for the doit jobs.
  task=${SLURM_PROCID-}
  if [[ $3 =~ ^[1-9][0-9]*$ ]]; then
    mcol=$3
    mts=$4
  elif [[ $4 =~ ^[1-9][0-9]*$ ]]; then
    mcol=$4
    mts=$3
  else
    echo "_mt1.sh: neither '$3' nor '$4' is a column number" 1>&2
    return 2
  fi

  cc=$1
  resdir=$W/$USER/results/$cc/$2
  srcdir=$W/hst/results/$cc/cdx_counts
  mtfile=$resdir/$mts
  first=$5
  last=$6

  # At least one job: "parallel -j 0" would mean "as many as possible".
  pjobs=$(( cpus / threads_per_task ))
  (( pjobs >= 1 )) || pjobs=1

  echo "$(date) task $num_tasks.$task on $nodename:$num_nodes.$node start $(pwd)" 1>&2

  mkdir -p -- "$resdir" || return 1
  if [[ ! -r $mtfile ]]; then
    echo "_mt1.sh: cannot read media-type list $mtfile" 1>&2
    return 1
  fi

  export -f doit
  export W srcdir resdir task mcol mts mtfile

  "$W/hst/bin/share_by_task.sh" -f "%03g\n" -s "$first" "$last" "$num_tasks" "$task" |
    parallel -j "$pjobs" doit '{}'
  status=$?

  echo "$(date) task $num_tasks.$task on $nodename:$num_nodes.$node end" 1>&2
  return "$status"
}

main "$@"