Mercurial > hg > cc > cirrus_work
view bin/_c2t.sh @ 247:7737da0ccb8c
try adding lm to existing index from ks_0-9
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 02 Jan 2025 14:52:14 +0000 |
parents | 94072b090fdd |
children |
line wrap: on
line source
#!/bin/bash
# Run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
#
# Usage: _c2t.sh CCmonth resSubdir s0 sn kf fieldSpecs...
#   CCmonth       crawl identifier; selects /beegfs/common_crawl/<CCmonth>/cdx/warc
#                 as input and $W/results/<CCmonth>/<resSubdir> as output
#   resSubdir     results subdirectory under $W/results/<CCmonth>
#   s0 sn         handed to share_by_task.sh as "-s <s0> <sn>"
#                 (presumably first and last segment number -- confirm
#                 against share_by_task.sh)
#   kf            key field for sorting (used as sort -k<kf>,<kf>)
#   fieldSpecs... passed through unchanged to cdx2tsv.py
#
# Environment: W, WSHARED and TMPDIR must be set and exported; the SLURM_*
# variables read below are used for work sharing and for log messages.
#
# Log lines leave $(date) unquoted on purpose: word splitting squeezes the
# padding blanks in date's output, which keeps the log format unchanged.
# shellcheck disable=SC2046

if (( $# < 5 )); then
  printf 'Usage: %s CCmonth resSubdir s0 sn kf fieldSpecs...\n' "${0##*/}" >&2
  exit 2
fi

# Fail early instead of building paths such as "/results/..." or "/<seg>.tsv".
: "${W:?W must be set}" "${WSHARED:?WSHARED must be set}" "${TMPDIR:?TMPDIR must be set}"

N=$SLURM_JOB_NUM_NODES
n=$SLURM_NTASKS
c=$SLURM_CPUS_PER_TASK
nodename=$SLURMD_NODENAME
local=$SLURM_LOCALID   # not referenced below; kept from the original
node=$SLURM_NODEID
task=$SLURM_PROCID

threadsPerTask=2
pjobs=$(( c / threadsPerTask ))
# "parallel -j 0" means "as many jobs as possible", so never let a small or
# missing SLURM_CPUS_PER_TASK round the job count down to zero.
(( pjobs >= 1 )) || pjobs=1

cc=$1
shift
resdir=$W/results/$cc/$1
shift
s1=$1
shift
sn=$1
shift
kf=$1 # key field for sorting
shift

echo $(date) "task $n.$task on $nodename:$N.$node $resdir start"

mkdir -p -- "$resdir"

# doit SEG [fieldSpecs...]
# Convert one cdx segment (cdx-00SEG.gz) into $resdir/SEG.tsv:
# decompress, run cdx2tsv.py with the field specs, sort on field $kf,
# count duplicate lines with uniq -c and turn runs of blanks into tabs.
# Reads the exported globals cc, resdir, task and kf, plus WSHARED, TMPDIR
# and PARALLEL_SEQ from the environment.
# Returns 0 when the segment was produced or already existed, 1 when the
# conversion pipeline failed (nothing is left in $resdir in that case).
doit () {
  local seg=$1
  shift
  local out=$resdir/$seg.tsv
  local tmp=$TMPDIR/$seg.tsv

  echo $(date) "start $seg $task $PARALLEL_SEQ"
  if [[ -f "$out" ]]; then
    echo $(date) "skipping $seg $task $PARALLEL_SEQ" "$(ls -l "$out")"
    return 0
  fi

  # pipefail is confined to a subshell so a failure in any stage is noticed
  # without changing the options of the calling shell.
  if ! (
    set -o pipefail
    uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz" \
      | python3 "$WSHARED/bin/cdx2tsv.py" "$@" \
      | sort "-k$kf,$kf" \
      | uniq -c \
      | tr -s ' ' '\t' > "$tmp"
  ); then
    # Do not publish a partial result: an existing $seg.tsv would make every
    # later run skip this segment.
    echo $(date) "failed $seg $task $PARALLEL_SEQ" >&2
    rm -f -- "$tmp"
    return 1
  fi

  echo $(date) "moving $seg $task $PARALLEL_SEQ to $out"
  mv -- "$tmp" "$resdir"
  echo $(date) "end $seg $task $PARALLEL_SEQ"
}

# parallel starts a fresh shell per job, so the function and the globals it
# reads have to be exported.
export -f doit
export cc resdir n task kf

# ${n:+...} / ${task:+...}: pass the values quoted, but drop them entirely
# when empty, exactly as the unquoted originals did.
"$W/bin/share_by_task.sh" -f "%03g\n" -s "$s1" "$sn" ${n:+"$n"} ${task:+"$task"} \
  | parallel -j "$pjobs" -q doit '{}' "$@"

echo $(date) "task $n.$task on $nodename:$N.$node end"