Mercurial > hg > cc > cirrus_work
view bin/_s2t.sh @ 4:f27061e8a9da
convert to no longer need uniq -c
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 20 Jul 2022 19:38:30 +0100 |
parents | b4801f5696b2 |
children | f035d36cec45 |
line wrap: on
line source
#!/bin/bash
# Run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
#
# Usage: _s2t.sh CC SUBDIR FIRST LAST
#   CC      crawl directory name under /beegfs/common_crawl
#   SUBDIR  results are written to $W/hst/results/CC/SUBDIR
#   FIRST   first cdx segment number (passed to seq)
#   LAST    last cdx segment number (passed to seq)
#
# Environment read:
#   SLURM_JOB_NUM_NODES, SLURM_NTASKS   (required, positive integers)
#   SLURM_CPUS_PER_TASK, SLURMD_NODENAME, SLURM_LOCALID, SLURM_NODEID
#   W, WSHARED                          (required)
#
# Segment i in FIRST..LAST is processed by the task for which
# i % SLURM_NTASKS equals this task's index.

# Print a message on stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print a timestamped log line: "<date> WORDS...".
log() {
  # $(date) is deliberately left unquoted: word splitting collapses the
  # padding date(1) emits for single-digit days, keeping the log format
  # identical to earlier versions of this script.
  # shellcheck disable=SC2046
  echo $(date) "$@"
}

(( $# >= 4 )) || die "usage: ${0##*/} CC SUBDIR FIRST LAST"

for v in SLURM_JOB_NUM_NODES SLURM_NTASKS W WSHARED; do
  [[ -n "${!v-}" ]] || die "$v must be set"
done

N=$SLURM_JOB_NUM_NODES
n=$SLURM_NTASKS
c=${SLURM_CPUS_PER_TASK:-1}
nodename=${SLURMD_NODENAME-}
localid=${SLURM_LOCALID:-0}
node=${SLURM_NODEID:-0}

# Both are used as divisors below, so reject zero/non-numeric values up front.
(( N > 0 && n > 0 )) \
  || die "SLURM_JOB_NUM_NODES and SLURM_NTASKS must be positive integers"

tPerN=$((n / N))
task=$((localid + node * tPerN))

threadsPerTask=2
pjobs=$((c / threadsPerTask))
# GNU parallel interprets -j 0 as "run as many jobs as possible", which is
# not what a small CPU allocation should get: always run at least one job.
(( pjobs >= 1 )) || pjobs=1

cc=$1
resdir="$W/hst/results/$1/$2"
s1=$3
sn=$4

log task "$n.$task" on "$nodename:$N.$node" start

mkdir -p -- "$resdir" /dev/shm/hst \
  || die "cannot create $resdir and/or /dev/shm/hst"

# doit SEG
#   Convert one cdx segment (SEG is the zero-padded 3-digit segment number)
#   to TSV in /dev/shm/hst, then move the finished file into $resdir.
#   Reads cc, resdir, task and WSHARED from the environment; PARALLEL_SEQ is
#   supplied by GNU parallel.
#   Returns non-zero, without publishing anything, if the conversion
#   pipeline or the final move fails.
doit() {
  local seg=$1
  local tmp="/dev/shm/hst/$seg.tsv"
  local rc

  log start "$seg" "$task" "${PARALLEL_SEQ-}"

  # Subshell so that pipefail does not leak into the caller's shell.
  # NOTE(review): uz is defined outside this script; it is assumed to write
  # the decompressed file to stdout and exit non-zero on failure -- confirm.
  (
    set -o pipefail
    uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz" \
      | python3 "$WSHARED/bin/cdx2tsv.py" \
          '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' \
          '(filename,f.split("/",maxsplit=5)[4][0])' \
          '(url,("1" if (f.split(":",maxsplit=1)[0]=="https") else "0"))' \
          languages \
          > "$tmp"
  )
  rc=$?
  if (( rc != 0 )); then
    # Do not publish a truncated/empty table, and do not leave it
    # occupying /dev/shm.
    log failed "$seg" "$task" "${PARALLEL_SEQ-}" status "$rc" >&2
    rm -f -- "$tmp"
    return "$rc"
  fi

  log moving "$seg" "$task" "${PARALLEL_SEQ-}"
  mv -- "$tmp" "$resdir" || return
  log end "$seg" "$task" "${PARALLEL_SEQ-}"
}

# parallel runs each job in a fresh shell: functions and variables used by
# doit must be exported to be visible there.
export -f doit log
export cc resdir n task

# Select this task's share of the segments and hand them to parallel as
# zero-padded 3-digit numbers.
seq "$s1" "$sn" \
  | while read -r i; do
      if (( i % n == task )); then
        printf '%03g\n' "$i"
      fi
    done \
  | parallel -j "$pjobs" doit '{}'
failed=$?

if (( failed != 0 )); then
  log task "$n.$task" on "$nodename:$N.$node" \
    parallel reported status "$failed" >&2
fi

# The end marker stays the last command, so the script's exit status is
# unchanged from earlier versions.
log task "$n.$task" on "$nodename:$N.$node" end