annotate bin/_mt1.sh @ 222:ee34498c6762

now using clean 2005 count
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 28 Feb 2024 14:44:59 +0000
parents a9763cd18949
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
15
a9763cd18949 in progress...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
#!/bin/bash
# This runs on the compute nodes: it reads the per-task SLURM_* variables.
#
# Count 'mime' or 'mime-detected' values (restricted by entries in the file
# named by mtypes, found in resdir) in parallel, taking input directly
# from $srcdir/xxx.tsv (i.e. $W/hst/results/$cc/cdx_counts/xxx.tsv).
#
# Example args: CC-MAIN-2019-35 mt1_counts mtypes 4 0 299
#   $1  crawl id; selects both the source and the results directory
#   $2  name of the results subdirectory under $W/$USER/results/$1
#   $3  stored in mcol and handed to awk as the field number to test
#   $4  stored in mts; names the filter file inside the results directory
#   $5  stored in s1, passed to share_by_task.sh after -s
#   $6  stored in sn, passed to share_by_task.sh after s1
# NOTE(review): in the example above the 3rd/4th arguments ("mtypes 4") look
# swapped relative to the assignments below (mcol=$3, mts=$4) -- confirm
# against the submitting script before relying on either.

# Job geometry, as provided by SLURM
N=$SLURM_JOB_NUM_NODES
n=$SLURM_NTASKS
c=$SLURM_CPUS_PER_TASK
nodename=$SLURMD_NODENAME
local=$SLURM_LOCALID
node=$SLURM_NODEID
task=$SLURM_PROCID

# Number of simultaneous jobs handed to parallel: CPUs per task divided by
# the threads each job is assumed to use.
threadsPerTask=2
pjobs=$(( ${c:-0} / threadsPerTask ))
# GNU parallel treats "-j 0" as "run as many jobs as possible", so never let
# a small or unset SLURM_CPUS_PER_TASK round down to zero.
if (( pjobs < 1 )); then
  pjobs=1
fi

cc=$1
resdir=$W/$USER/results/$cc/$2
srcdir=$W/hst/results/$cc/cdx_counts
mcol=$3
mts=$4
mtfile=$resdir/$mts
s1=$5
sn=$6

echo $(date) task $n.$task on $nodename:$N.$node start $(pwd) 1>&2

# Quoted so a results path containing whitespace is created as one directory
mkdir -p "$resdir"
a9763cd18949 in progress...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
31
a9763cd18949 in progress...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# doit ID
#   Filter one input file, $srcdir/ID.tsv, and write the merged result to
#   $resdir/${mts}_ID.tsv.  Start and end are logged to stderr.
#
#   Lines containing ' w ' are kept, then awk passes on fields 1, 2 and 5 of
#   every line whose field number $mcol appears as a whole line in $mtfile;
#   uniq_merge.py consumes that stream.
#
#   Run by GNU parallel in a fresh shell, so it can only see the function
#   and variables exported below.
# NOTE(review): the ' w ' pattern is reproduced exactly as found (spaces);
# if the .tsv input is tab-separated it may need literal tabs -- confirm.
doit () {
  echo $(date) start $1 $task $PARALLEL_SEQ 1>&2
  grep -F ' w ' "$srcdir/$1.tsv" | \
    awk -v mc="$mcol" -v mtf="$mtfile" '
      # Load the allowed values; "> 0" stops on EOF (0) and on error (-1),
      # so an unreadable file cannot cause an endless loop.
      BEGIN { while ((getline entry < mtf) > 0) { l[entry] = 1 } }
      ($mc in l) { print $1, $2, $5 }' | \
    "$W/shared/bin/uniq_merge.py" > "$resdir/${mts}_$1.tsv"
  echo $(date) end $1 $task $PARALLEL_SEQ 1>&2
}

export -f doit
# Everything doit reads must be exported: mcol, mts and mtfile were missing
export srcdir resdir task mcol mts mtfile
a9763cd18949 in progress...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
43
a9763cd18949 in progress...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Fan out: each line printed by share_by_task.sh becomes the single argument
# of one doit call, with at most $pjobs calls running at once.
# (presumably share_by_task.sh emits this task's portion of the range
#  $s1..$sn, formatted with the -f pattern -- verify against that script)
$W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task |
  parallel -j $pjobs doit '{}'

# Closing log line for this task, on stderr like the opening one
echo $(date) task $n.$task on $nodename:$N.$node end >&2
a9763cd18949 in progress...
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
47