Mercurial > hg > cc > cirrus_work
view bin/_c2t.sh @ 178:c42a5f7c97c5
renamed to by_interval.py
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 24 Nov 2023 20:40:09 +0000 |
parents | 1ce51aacc468 |
children | 94072b090fdd |
line wrap: on
line source
#!/bin/bash
# shellcheck disable=SC2046
# Run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
#
# Args: CCmonth resSubdir s1 sn kf fieldSpecs...
#   CCmonth     crawl identifier; selects /beegfs/common_crawl/CCmonth/cdx/warc
#               as input and $W/hst/results/CCmonth as the results root
#   resSubdir   subdirectory of the results root that receives the .tsv files
#   s1 sn       handed to share_by_task.sh after -s
#               (presumably first/last segment numbers -- confirm there)
#   kf          key field for sorting
#   fieldSpecs  passed through unchanged to cdx2tsv.py
#
# Environment: W and WSHARED must be set; TMPDIR is used for scratch output
# (falls back to /tmp); the SLURM_* variables below describe this task.
#
# Exit status: 2 on a usage error, otherwise the exit status of parallel
# (non-zero when at least one segment failed).
#
# Note: log lines leave $(date) unquoted on purpose, so the shell collapses
# date's padding and the log format stays as it always was (hence SC2046 off).

N=$SLURM_JOB_NUM_NODES
n=$SLURM_NTASKS
c=$SLURM_CPUS_PER_TASK
nodename=$SLURMD_NODENAME
local=$SLURM_LOCALID
node=$SLURM_NODEID
task=$SLURM_PROCID

threadsPerTask=2
pjobs=$((c / threadsPerTask))
# SLURM_CPUS_PER_TASK is only set when --cpus-per-task was requested, and may
# be 1; either way the division yields 0, and 'parallel -j 0' means "as many
# jobs as possible".  Never go below one job.
if (( pjobs < 1 )); then
  pjobs=1
fi

if (( $# < 5 )); then
  echo "Usage: ${0##*/} CCmonth resSubdir s1 sn kf fieldSpecs..." >&2
  exit 2
fi
: "${W:?W must be set}"
: "${WSHARED:?WSHARED must be set}"

cc=$1
shift
resdir=$W/hst/results/$cc/$1
shift
s1=$1
shift
sn=$1
shift
kf=$1 # key field for sorting
shift
# Whatever is left in "$@" is the fieldSpecs for cdx2tsv.py

echo $(date) task "$n.$task" on "$nodename:$N.$node" start

mkdir -p "$resdir" || exit 1

# Process one cdx segment: decompress, convert to TSV, count duplicate lines,
# then move the finished file from scratch space to the results directory.
# Runs in a fresh shell started by parallel, so it can only rely on exported
# names (cc resdir n task kf) and on the inherited environment.
#   $1    segment number as formatted by share_by_task.sh (%03g)
#   $2... fieldSpecs for cdx2tsv.py
# Returns non-zero, leaving nothing in $resdir, if any stage fails.
doit () {
  local seg=$1
  shift
  local tmp="${TMPDIR:-/tmp}/$seg.tsv"
  # Without pipefail only tr's status would be seen, and a missing input or a
  # failed conversion would still publish an empty or partial result.
  set -o pipefail
  echo $(date) start "$seg" "$task" "$PARALLEL_SEQ"
  # uz is an external command not defined in this file; it is expected to
  # write the uncompressed index to stdout.
  if ! uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz" |
      python3 "$WSHARED/bin/cdx2tsv.py" "$@" |
      sort -k"$kf,$kf" | uniq -c | tr -s ' ' '\t' > "$tmp"
  then
    echo $(date) failed "$seg" "$task" "$PARALLEL_SEQ" >&2
    rm -f -- "$tmp"
    return 1
  fi
  echo $(date) moving "$seg" "$task" "$PARALLEL_SEQ"
  mv -- "$tmp" "$resdir" || return 1
  echo $(date) end "$seg" "$task" "$PARALLEL_SEQ"
}

export -f doit
export cc resdir n task kf

# share_by_task.sh emits this task's share of the segment numbers, one per
# line; parallel runs doit on each of them, at most $pjobs at a time.
"$W/hst/bin/share_by_task.sh" -f "%03g\n" -s "$s1" "$sn" "$n" "$task" |
  parallel -j "$pjobs" -q doit '{}' "$@"
status=$?

echo $(date) task "$n.$task" on "$nodename:$N.$node" end

# Report failed segments to the caller instead of always exiting 0
exit "$status"