view bin/_mt1.sh @ 93:25bd398a8035

improve reordering, still failing on cdx-00004
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 06 Sep 2023 18:51:21 +0100
parents a9763cd18949
children
line wrap: on
line source

#!/bin/bash
# This runs on the compute nodes...
# Args, in the order the assignments below read them, e.g.:
#   CC-MAIN-2019-35 mt1_counts 4 mtypes 0 299
#  $1  crawl name, used as a directory under the results trees
#  $2  results subdirectory: output goes to $W/$USER/results/$1/$2
#  $3  handed to awk as 'mc' by doit -- presumably the number of the
#       column holding the 'mime' or 'mime-detected' value; confirm
#  $4  name of a file in the results directory listing the values to keep
#  $5, $6  passed to share_by_task.sh as '-s $5 $6' -- presumably the
#       first and last input file numbers; confirm against that script
# NOTE(review): an earlier version of this header gave the example as
#  '... mtypes 4 0 299', i.e. $3 and $4 the other way round.  The
#  assignments below are what actually runs; check the callers agree.
# Counts are made in parallel, taking input from
#  $W/hst/results/$cc/cdx_counts/xxx.tsv
# W is never assigned here, so it must already be in the environment.

N=$SLURM_JOB_NUM_NODES
n=$SLURM_NTASKS
c=$SLURM_CPUS_PER_TASK
nodename=$SLURMD_NODENAME
local=$SLURM_LOCALID
node=$SLURM_NODEID
task=$SLURM_PROCID

threadsPerTask=2
# Number of simultaneous jobs handed to parallel.  Integer division gives
#  0 when SLURM_CPUS_PER_TASK is unset or 1, and 'parallel -j 0' means
#  "run as many jobs as possible", so never go below 1.
pjobs=$(( ${c:-1} / threadsPerTask ))
if (( pjobs < 1 )); then
  pjobs=1
fi

cc=$1
resdir=$W/$USER/results/$cc/$2
srcdir=$W/hst/results/$cc/cdx_counts
mcol=$3
mts=$4
mtfile=$resdir/$mts
s1=$5
sn=$6

echo $(date) task $n.$task on $nodename:$N.$node start $(pwd) 1>&2

mkdir -p "$resdir"

# doit NAME
#  Process one input file, $srcdir/NAME.tsv:
#   1. keep only lines containing <tab>w<tab>;
#   2. of those, keep lines whose field number $mcol (awk default
#      whitespace field splitting) is one of the lines of $mtfile,
#      printing fields 1, 2 and 5 separated by single spaces;
#   3. pipe the result through $W/shared/bin/uniq_merge.py into
#      $resdir/${langs}_NAME.tsv
#  Start and end are logged to stderr.
#  Reads from the environment: W srcdir resdir mcol mtfile langs task
#   (and PARALLEL_SEQ, for the log lines only).
#  Returns 1 without writing anything if mcol is empty or mtfile is not
#   readable; otherwise 0.
#  NOTE(review): langs is not assigned anywhere in this file, so unless it
#   is inherited from the environment the output is named _NAME.tsv.
#   Left as is because later steps may rely on that name.
doit () {
 echo $(date) start $1 $task $PARALLEL_SEQ 1>&2
 # Fail fast: with an unreadable list file awk's getline returns -1, and
 #  an empty mcol would silently compare whole lines instead of one field
 if [[ -z "${mcol:-}" || ! -r "${mtfile:-}" ]]; then
  echo "doit: mcol and a readable mtfile are required (are they exported?)" 1>&2
  return 1
 fi
 grep -F $'\tw\t' "$srcdir/$1.tsv" | \
   awk -v mc="$mcol" -v mtf="$mtfile" '
        BEGIN {while ((getline mt < mtf) > 0) {l[mt]=1}
               close(mtf)}
        (($mc) in l) {print $1,$2,$5}' | \
    "$W/shared/bin/uniq_merge.py" > "$resdir/${langs}_$1.tsv"
 echo $(date) end $1 $task $PARALLEL_SEQ 1>&2
}

# parallel runs each job in a fresh shell, so the function and every
#  variable it reads have to be exported.  mcol and mtfile are read by
#  doit and so must be in this list.  (W is used by doit too, but is not
#  assigned in this script, so it is expected to be in the environment
#  already.)
export -f doit
export srcdir resdir task langs langfile mcol mtfile

# share_by_task.sh writes the job arguments for this task, one per line
#  (presumably this task's share of the numbers s1..sn, formatted as
#  three digits by -f -- confirm against that script); each line becomes
#  the argument of one doit call, at most pjobs of them at a time
"$W/hst/bin/share_by_task.sh" -f "%03g\n" -s "$s1" "$sn" "$n" "$task" | parallel -j "$pjobs" doit '{}'

echo $(date) task $n.$task on $nodename:$N.$node end 1>&2