diff bin/_c2t.sh @ 11:dfdb95e5d774

catch-up
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 28 Jul 2022 14:45:35 +0100
parents
children 1ce51aacc468
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/_c2t.sh	Thu Jul 28 14:45:35 2022 +0100
@@ -0,0 +1,48 @@
+#!/bin/bash
+# run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
+# Args: CCmonth resSubdir s0 sn kf fieldSpecs...
+
+N=$SLURM_JOB_NUM_NODES      # number of nodes in the job
+n=$SLURM_NTASKS             # total number of tasks
+c=$SLURM_CPUS_PER_TASK      # CPUs per task; may be unset
+nodename=$SLURMD_NODENAME   # name of the node this task runs on
+local=$SLURM_LOCALID        # node-local task id (not referenced in this script)
+node=$SLURM_NODEID          # id of this node within the job
+task=$SLURM_PROCID          # id of this task within the job
+# Jobs run at once by this task; at least 1, since parallel reads -j 0 as "no limit"
+threadsPerTask=2
+pjobs=$(( c / threadsPerTask > 0 ? c / threadsPerTask : 1 ))  # unset c counts as 0
+
+cc=$1
+shift
+resdir=$W/hst/results/$cc/$2
+shift
+s1=$3
+shift
+sn=$4
+shift
+kf=$5 # key field for sorting
+shift
+
+echo $(date) task $n.$task on $nodename:$N.$node start
+
+mkdir -p $resdir
+
+# doit SEG [fieldSpecs...] -- tabulate cdx-00SEG.gz into $resdir/SEG.tsv
+doit () {
+  local seg=$1; shift  # the rest of "$@" goes to cdx2tsv.py untouched
+  echo $(date) start $seg $task $PARALLEL_SEQ
+  uz "/beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz" |
+    python3 "$WSHARED/bin/cdx2tsv.py" "$@" |
+    sort -k"$kf,$kf" | uniq -c | tr -s ' ' '\t' > "$TMPDIR/$seg.tsv"
+  echo $(date) moving $seg $task $PARALLEL_SEQ
+  mv -- "$TMPDIR/$seg.tsv" "$resdir"  # written under $TMPDIR first, then moved
+  echo $(date) end $seg $task $PARALLEL_SEQ ;}
+
+export -f doit
+export cc resdir n task
+
+$W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs doit '{}' "$@"
+
+echo $(date) task $n.$task on $nodename:$N.$node end
+