comparison bin/_mt1.sh @ 15:a9763cd18949

in progress...
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sun, 07 Aug 2022 13:56:00 +0100
parents
children
comparison
equal deleted inserted replaced
14:4e1ecfa46bee 15:a9763cd18949
#!/bin/bash
# This runs on the compute nodes...
#
# Usage: _mt1.sh CC RESNAME MCOL MTS S1 SN
#   $1 CC       crawl name, e.g. CC-MAIN-2019-35
#   $2 RESNAME  results subdirectory under $W/$USER/results/CC, e.g. mt1_counts
#   $3 MCOL     column selector handed to awk as variable 'mc'
#               (presumably the field number of the mime value -- TODO confirm)
#   $4 MTS      name of the media-type list file, looked up in the results dir
#   $5 S1       \ passed on to share_by_task.sh as "-s S1 SN"
#   $6 SN       / (presumably first and last segment number -- TODO confirm)
#
# NOTE(review): the previous example invocation read
#   "CC-MAIN-2019-35 mt1_counts mtypes 4 0 299"
# i.e. the list-file name *before* the column number, which is the reverse of
# how $3/$4 are read below.  Confirm against the submitting script.
#
# count 'mime' or 'mime-detected' values (restricted by entries in file
# named by MTS, found in resdir) in parallel.
# NOTE(review): this header used to say input is taken directly from
# $cc/cdx_mime/xxx.tsv, but srcdir below points at .../cdx_counts -- confirm.

if (( $# < 6 )); then
  echo "Usage: $0 cc resname mcol mts s1 sn" 1>&2
  exit 2
fi
# $W is the root of the results and bin trees used throughout; without it
# every path below would silently resolve under /.
: "${W:?W must be set}"

N=$SLURM_JOB_NUM_NODES
n=$SLURM_NTASKS
# Slurm only sets SLURM_CPUS_PER_TASK when --cpus-per-task was requested.
c=${SLURM_CPUS_PER_TASK:-1}
nodename=$SLURMD_NODENAME
local=$SLURM_LOCALID
node=$SLURM_NODEID
task=$SLURM_PROCID

threadsPerTask=2
pjobs=$(( c / threadsPerTask ))
# Never hand 0 to 'parallel -j': GNU parallel reads 0 as "as many jobs as
# possible", which is the opposite of what a 1-CPU allocation wants.
(( pjobs >= 1 )) || pjobs=1

cc=$1
resdir=$W/$USER/results/$cc/$2
srcdir=$W/hst/results/$cc/cdx_counts
mcol=$3
mts=$4
mtfile=$resdir/$mts
s1=$5
sn=$6

echo "$(date) task $n.$task on $nodename:$N.$node start $(pwd)" 1>&2

mkdir -p "$resdir" || { echo "cannot create $resdir" 1>&2; exit 1; }
31
# doit SEG
#   Worker for one input file; run by parallel (which sets PARALLEL_SEQ).
#   Reads  $srcdir/SEG.tsv, keeps the lines containing ' w ', and of those the
#   ones whose field number $mcol is listed (one entry per line) in $mtfile;
#   prints fields 1, 2 and 5 of each and pipes them through uniq_merge.py
#   into $resdir/${langs}_SEG.tsv.
#   Globals read: srcdir resdir mtfile mcol task langs W
#   Returns: 1 (and writes nothing) if $mtfile is not readable.
#   NOTE(review): 'langs' is not set anywhere in this script, so unless the
#   environment supplies it the output is named "_SEG.tsv".  Left as-is to
#   keep output names stable -- confirm the intended prefix.
doit () {
  local seg=$1
  echo "$(date) start $seg $task $PARALLEL_SEQ" 1>&2
  if [[ ! -r $mtfile ]]; then
    echo "doit: cannot read media-type list '$mtfile'" 1>&2
    return 1
  fi
  grep -F ' w ' "$srcdir/$seg.tsv" |
    awk -v mc="$mcol" -v mtf="$mtfile" '
      # Load the allowed values; "> 0" matters because getline returns -1
      # on error, which a bare while() would treat as true forever.
      BEGIN { while ((getline entry < mtf) > 0) l[entry] = 1; close(mtf) }
      ($mc in l) { print $1, $2, $5 }' |
    "$W/shared/bin/uniq_merge.py" > "$resdir/${langs}_$seg.tsv"
  echo "$(date) end $seg $task $PARALLEL_SEQ" 1>&2
}

# doit runs in fresh shells started by parallel, so the function and every
# variable it reads must be exported (mcol and mtfile were missing).
# NOTE(review): langs/langfile are not set in this script; they stay in the
# list only so that values supplied by the environment keep passing through.
export -f doit
export srcdir resdir task langs langfile mcol mtfile

"$W/hst/bin/share_by_task.sh" -f "%03g\n" -s "$s1" "$sn" "$n" "$task" |
  parallel -j "$pjobs" doit '{}'

echo "$(date) task $n.$task on $nodename:$N.$node end" 1>&2
47