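#!/bin/bash
# This runs on the compute nodes...
# Args: CC-MAIN-2019-35 mt1_counts mtypes 4 0 299
# Count 'mime' or 'mime-detected' values (restricted to the entries in the
# file named by mtypes, found in resdir) in parallel,
# taking input directly from $cc/cdx_mime/xxx.tsv
#
# NOTE(review): the code below assigns $3 (here 'mtypes') to mcol and
# $4 (here '4') to mts, the file name looked up in resdir -- the reverse of
# what this example suggests.  It also sets srcdir to
# $W/hst/results/$cc/cdx_counts, not cdx_mime.  Confirm which is intended.
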
# --- Job geometry, copied from the environment SLURM provides -------------
N=$SLURM_JOB_NUM_NODES
n=$SLURM_NTASKS
c=$SLURM_CPUS_PER_TASK
nodename=$SLURMD_NODENAME
local=$SLURM_LOCALID      # not referenced again in the visible script
node=$SLURM_NODEID
task=$SLURM_PROCID

# --- Degree of parallelism within this task --------------------------------
# Each doit job is budgeted threadsPerTask CPUs.  SLURM_CPUS_PER_TASK may be
# unset (or 1), which would make the quotient 0, and 'parallel -j 0' means
# "run as many jobs as possible" -- so never go below one job.
threadsPerTask=2
pjobs=$(( ${c:-0} / threadsPerTask ))
(( pjobs >= 1 )) || pjobs=1

# --- Positional arguments ---------------------------------------------------
#   $1  crawl identifier, used as a path component
#   $2  name of the results subdirectory for this run
#   $3  mcol: handed to awk inside doit as the field selector
#   $4  mts: name of the value-list file, looked up inside resdir
#   $5  s1, $6 sn: passed through to share_by_task.sh
cc=$1
resdir=$W/$USER/results/$cc/$2
srcdir=$W/hst/results/$cc/cdx_counts
mcol=$3
mts=$4
mtfile=$resdir/$mts
s1=$5
sn=$6

printf '%s task %s.%s on %s:%s.%s start %s\n' \
  "$(date)" "$n" "$task" "$nodename" "$N" "$node" "$(pwd)" 1>&2

mkdir -p "$resdir"

# doit SEGMENT
#
# Process one input segment; run once per segment by GNU parallel.
#
# Reads   $srcdir/SEGMENT.tsv
# Writes  $resdir/${langs}_SEGMENT.tsv
#
# Steps:
#   1. keep only input lines containing the fixed string ' w '
#   2. keep only lines whose field number $mcol appears as a line of $mtfile,
#      and print fields 1, 2 and 5 of those lines
#   3. pipe the result through $W/shared/bin/uniq_merge.py (not visible here)
#
# Globals read: srcdir resdir mtfile mcol task langs W PARALLEL_SEQ
#   These must be exported by the caller, because parallel runs this function
#   in a fresh shell.  NOTE(review): langs is not assigned anywhere in the
#   visible script, so the output name starts with '_' unless it is inherited
#   from the environment.
#
# Returns 1 (after logging to stderr) if $mtfile cannot be read.
doit () {
  local seg=$1
  local out=$resdir/${langs}_$seg.tsv

  printf '%s start %s %s %s\n' "$(date)" "$seg" "$task" "$PARALLEL_SEQ" 1>&2

  if [[ ! -r $mtfile ]]; then
    printf 'doit: cannot read value list "%s"\n' "$mtfile" 1>&2
    return 1
  fi

  # The list file name goes in with -v rather than being spliced into the
  # program text.  getline returns -1 on error, so its result is compared
  # with 0 to avoid looping forever on an unreadable file.
  grep -F -- ' w ' "$srcdir/$seg.tsv" |
    awk -v mc="$mcol" -v mtf="$mtfile" '
      BEGIN {
        while ((getline entry < mtf) > 0) l[entry] = 1
        close(mtf)
      }
      ($mc in l) { print $1, $2, $5 }' |
    "$W/shared/bin/uniq_merge.py" > "$out"

  printf '%s end %s %s %s\n' "$(date)" "$seg" "$task" "$PARALLEL_SEQ" 1>&2
}

# GNU parallel starts each job in a fresh shell, so the function and every
# variable it reads must be exported.  doit reads mcol and mtfile (and $W for
# the uniq_merge.py path); they were missing from the export list, which left
# them empty inside the jobs.  langs and langfile are kept in the list for
# backward compatibility although the visible script never assigns them.
export -f doit
export srcdir resdir task langs langfile mcol mtfile W

# share_by_task.sh emits the zero-padded (%03g) segment numbers for this
# task, one per line; parallel runs doit on each, at most pjobs at a time.
# s1/sn/n/task are left unquoted on purpose so that an empty value drops out
# of the argument list exactly as it did before.
# shellcheck disable=SC2086
"$W/hst/bin/share_by_task.sh" -f "%03g\n" -s $s1 $sn $n $task |
  parallel -j "$pjobs" doit '{}'

printf '%s task %s.%s on %s:%s.%s end\n' \
  "$(date)" "$n" "$task" "$nodename" "$N" "$node" 1>&2