view bin/_s2t.sh @ 109:52c6a9b0fc8c

loosen must-match criterion in the both-messy case
author Henry Thompson <ht@markup.co.uk>
date Tue, 19 Sep 2023 19:29:41 +0100
parents f035d36cec45
children
line wrap: on
line source

#!/bin/bash
# run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz

# --- Job geometry, taken from the Slurm environment -------------------
N=$SLURM_JOB_NUM_NODES     # number of nodes in the allocation
n=$SLURM_NTASKS            # total number of tasks
c=$SLURM_CPUS_PER_TASK     # only set when --cpus-per-task was requested
nodename=$SLURMD_NODENAME  # name of the node running this task
local=$SLURM_LOCALID       # node-local task id (not referenced below)
node=$SLURM_NODEID         # relative id of this node
task=$SLURM_PROCID         # global task rank

# --- Concurrency ------------------------------------------------------
# pjobs is handed to "parallel -j".  The integer division yields 0 when
# c is unset/empty or smaller than threadsPerTask, and GNU parallel
# treats "-j 0" as "run as many jobs as possible", so never go below 1.
threadsPerTask=2
pjobs=$(( ${c:-0} / threadsPerTask ))
if (( pjobs < 1 )); then
  pjobs=1
fi

# --- Positional arguments ---------------------------------------------
#   $1  crawl id; selects /beegfs/common_crawl/$1/cdx/warc as input
#   $2  sub-directory of $W/hst/results/$1 that receives the .tsv files
#   $3  first operand following "-s" in the share_by_task.sh call
#   $4  second operand following "-s" in the share_by_task.sh call
cc=$1
resdir=$W/hst/results/$cc/$2
s1=$3
sn=$4

# $(date) is deliberately unquoted so the log line format stays unchanged.
echo $(date) task $n.$task on $nodename:$N.$node start

mkdir -p "$resdir"

# doit SEG
#
# Process one cdx segment: stream cdx-00SEG.gz through cdx2tsv.py, count
# identical output lines, and publish the result as SEG.tsv in $resdir.
#
# Arguments: $1 - segment suffix; the input file is cdx-00$1.gz
# Globals:   cc, resdir, task, TMPDIR, WSHARED (read);
#            PARALLEL_SEQ (read, for logging only)
# Outputs:   progress lines on stdout; a warning on stderr if any stage
#            of the pipeline exits non-zero
# Note:      `uz` is a site-local command not defined in this file;
#            presumably it decompresses its argument to stdout.
doit () {
  local seg=$1
  local tmp_tsv="$TMPDIR/$seg.tsv"
  local -a stage_status
  local rc

  # $(date) and the other words are left unquoted so the log format is
  # exactly what it has always been.
  echo $(date) start $seg $task $PARALLEL_SEQ

  # sort on field 2, count adjacent duplicate lines, then squeeze every
  # run of spaces into one tab (uniq -c pads the count with leading
  # spaces, so each output line starts with a tab).
  uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz" |
    python3 "$WSHARED/bin/cdx2tsv.py" \
      '(filename,f.split("/",maxsplit=5)[3].split(".")[1])' \
      '(filename,f.split("/",maxsplit=5)[4][0])' \
      '(key,key.split(",")[0])' \
      languages |
    sort -k2,2 | uniq -c | tr -s ' ' '\t' > "$tmp_tsv"
  stage_status=("${PIPESTATUS[@]}")

  # The pipeline's own status is only that of tr; surface a failure of
  # any earlier stage instead of publishing a short file silently.
  for rc in "${stage_status[@]}"; do
    if (( rc != 0 )); then
      echo "$(date) WARNING $seg: pipeline exit statuses: ${stage_status[*]}" >&2
      break
    fi
  done

  echo $(date) moving $seg $task $PARALLEL_SEQ
  # Written to TMPDIR first, then moved, so a file only appears in the
  # results directory once the pipeline has finished writing it.
  mv -- "$tmp_tsv" "$resdir"
  echo $(date) end $seg $task $PARALLEL_SEQ
}

# Expose the worker and the shell variables listed here to the child
# shells that parallel starts.
# NOTE(review): doit also reads TMPDIR and WSHARED, which are not
# exported here -- presumably they are already in the environment.
export cc resdir n task
export -f doit

# share_by_task.sh is defined elsewhere; presumably it prints, one per
# line in "%03g" format, the segment numbers that fall to this task.
# Each line it emits becomes the single argument of one doit call, with
# at most $pjobs calls running at once.  The argument words are left
# unquoted on purpose: quoting would change what is passed when one of
# them is empty.
$W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task |
  parallel -j $pjobs doit '{}'

echo $(date) task $n.$task on $nodename:$N.$node end