Mercurial > hg > cc > cirrus_work
view bin/_s2t.sh @ 208:b1190db19d78
sic
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 07 Dec 2023 18:23:11 +0000 |
parents | f035d36cec45 |
children |
line wrap: on
line source
#!/bin/bash
# Run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
#
# Usage: _s2t.sh CC SUBDIR S1 SN
#   CC      crawl name; input is /beegfs/common_crawl/CC/cdx/warc/cdx-00NNN.gz
#   SUBDIR  results are moved to $W/hst/results/CC/SUBDIR
#   S1 SN   handed to share_by_task.sh as "-s S1 SN"
#           (presumably the first and last cdx segment numbers -- confirm
#           against share_by_task.sh)
#
# Environment (must be set): W, WSHARED, TMPDIR
# Environment (read from Slurm): SLURM_JOB_NUM_NODES, SLURM_NTASKS,
#   SLURM_CPUS_PER_TASK, SLURMD_NODENAME, SLURM_NODEID, SLURM_PROCID
#
# Exit status: 0 if every segment was processed and moved; otherwise the
# non-zero status of the share_by_task.sh | parallel pipeline.

# Make a failure in any pipeline stage visible in $?.
set -o pipefail

: "${1:?usage: _s2t.sh CC SUBDIR S1 SN}"
: "${W:?W must be set}"
: "${WSHARED:?WSHARED must be set}"
: "${TMPDIR:?TMPDIR must be set}"

N=$SLURM_JOB_NUM_NODES
n=$SLURM_NTASKS
c=$SLURM_CPUS_PER_TASK
nodename=$SLURMD_NODENAME
node=$SLURM_NODEID
task=$SLURM_PROCID

threadsPerTask=2
# Slurm only sets SLURM_CPUS_PER_TASK when --cpus-per-task was requested, so
# the quotient can be 0 (unset, or a single CPU). GNU parallel reads "-j 0"
# as "run as many jobs as possible", so never go below one job.
pjobs=$(( ${c:-0} / threadsPerTask ))
(( pjobs >= 1 )) || pjobs=1

cc=$1
resdir=$W/hst/results/$cc/$2
s1=$3
sn=$4

# Print a timestamped log line made of the arguments.
logmsg () {
  # $(date) is unquoted on purpose: word splitting collapses date's column
  # padding, so the timestamp stays single-space separated.
  # shellcheck disable=SC2046
  echo $(date) "$*"
}

# Process one cdx segment.
# Arguments: $1 - three-digit segment number NNN (input file cdx-00NNN.gz)
# Globals:   cc, resdir, task (exported below); TMPDIR, WSHARED (environment);
#            PARALLEL_SEQ (set by GNU parallel)
# Outputs:   $resdir/NNN.tsv; progress lines on stdout, failures on stderr
# Returns:   0 on success, 1 if the pipeline or the final move failed
doit () {
  local seg=$1
  local tmp="$TMPDIR/$seg.tsv"

  # doit runs in a fresh shell started by parallel, which does not inherit
  # the caller's shell options, so enable pipefail here as well.
  set -o pipefail

  logmsg start "$seg" "$task" "$PARALLEL_SEQ"
  if ! uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz" |
      python3 "$WSHARED/bin/cdx2tsv.py" \
        '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' \
        '(filename,f.split("/",maxsplit=5)[4][0])' \
        '(key,key.split(",")[0])' \
        languages |
      sort -k2,2 | uniq -c | tr -s ' ' '\t' > "$tmp"; then
    # Do not publish a truncated or empty table as if it were a result;
    # the partial file is left in $TMPDIR for inspection.
    logmsg failed "$seg" "$task" "$PARALLEL_SEQ" >&2
    return 1
  fi

  logmsg moving "$seg" "$task" "$PARALLEL_SEQ"
  # The trailing slash makes mv fail when $resdir is not a directory,
  # instead of renaming the table to the path $resdir itself.
  if ! mv -- "$tmp" "$resdir/"; then
    logmsg move failed "$seg" "$task" "$PARALLEL_SEQ" >&2
    return 1
  fi
  logmsg end "$seg" "$task" "$PARALLEL_SEQ"
}

logmsg task "$n.$task" on "$nodename:$N.$node" start

if ! mkdir -p "$resdir"; then
  logmsg cannot create "$resdir" >&2
  exit 1
fi

# parallel runs doit in child shells: export the functions and the variables
# they read (n is exported as before, although doit itself does not use it).
export -f logmsg doit
export cc resdir n task

# s1/sn/n/task stay unquoted as in the original call, so an empty value drops
# out of the argument list rather than being passed as an empty argument
# (how share_by_task.sh treats empty arguments is not visible from here).
# shellcheck disable=SC2086
"$W/hst/bin/share_by_task.sh" -f "%03g\n" -s $s1 $sn $n $task |
  parallel -j "$pjobs" doit '{}'
status=$?

if (( status != 0 )); then
  logmsg task "$n.$task" on "$nodename:$N.$node" failed with status "$status" >&2
fi
logmsg task "$n.$task" on "$nodename:$N.$node" end
exit "$status"