changeset 11:dfdb95e5d774

catch-up
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 28 Jul 2022 14:45:35 +0100
parents cf982df00cbd
children 1ce51aacc468
files .hgignore bin/_c2t.sh bin/lang_by_seg.sh bin/share_by_task.sh
diffstat 4 files changed, 106 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/.hgignore	Sat Jul 23 11:50:46 2022 +0100
+++ b/.hgignore	Thu Jul 28 14:45:35 2022 +0100
@@ -81,3 +81,4 @@
 .singularity
 .saves-*
 .lesshst
+results
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/_c2t.sh	Thu Jul 28 14:45:35 2022 +0100
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
+# Args: CCmonth resSubdir s0 sn kf fieldSpecs...
+# Reads $W, $WSHARED, $TMPDIR and the SLURM_* task variables from the environment
+N=$SLURM_JOB_NUM_NODES
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+nodename=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+node=$SLURM_NODEID
+task=$SLURM_PROCID
+
+threadsPerTask=2
+pjobs=$((c / threadsPerTask)) # number of doit jobs parallel runs at once
+
+# Each shift renumbers the positionals, so the next argument is always $1
+cc=$1 # CCmonth
+shift
+resdir=$W/hst/results/$cc/$1 # resSubdir, under this month's results
+shift
+s1=$1 # s0: start of the segment range given to share_by_task.sh -s
+shift
+sn=$1 # sn: end of that range
+shift
+kf=$1 # key field for sorting
+shift
+
+echo $(date) task $n.$task on $nodename:$N.$node start
+mkdir -p "$resdir"
+
+doit () { # $1: segment id (the NNN of cdx-00NNN.gz); remaining args go to cdx2tsv.py
+ local part=$1 out=$TMPDIR/$1.tsv; shift
+ echo $(date) start $part $task $PARALLEL_SEQ
+ uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$part.gz |
+   python3 $WSHARED/bin/cdx2tsv.py "$@" |
+   sort -k$kf,$kf | uniq -c | tr -s ' ' '\t' > $out   # count repeated lines, blanks -> tabs
+ echo $(date) moving $part $task $PARALLEL_SEQ
+ mv $out $resdir   # written under $TMPDIR first, moved only once complete
+ echo $(date) end $part $task $PARALLEL_SEQ
+}
+
+# parallel starts doit in a new shell, so every variable doit reads must be exported
+export -f doit
+export cc resdir n task kf
+# This task's share of segments s1..sn, formatted as 3 digits, one doit call per segment
+$W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs doit '{}' "$@"
+echo $(date) task $n.$task on $nodename:$N.$node end
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/lang_by_seg.sh	Thu Jul 28 14:45:35 2022 +0100
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Computes the by-language totals for this segment
+# This runs on the compute nodes in nl1_counts inside a _runme doit
+# Args: seg
+
+seg=$1
+# 10 parallel invocations of this are running; this one's number is in {1..10}
+task=$PARALLEL_SEQ
+pjobs=4 # jobs the parallel call in this script runs at once
+
+echo $(date) $seg as 3.$task on start
+
+doit () { # $1: index i of top21s_$i.tsv; sums column 1 per column-3 value where column 2 == $seg
+ local idx=$1
+ echo $(date) start $task.$seg $idx $PARALLEL_SEQ >&2
+ awk -v seg=$seg '$2==seg {sum[$3]+=$1} END {for (k in sum) print sum[k],k}' top21s_$idx.tsv
+ echo $(date) end $task.$seg $idx $PARALLEL_SEQ >&2
+}
+
+# doit runs in shells started by parallel, which see only exported names
+export task seg; export -f doit
+# One doit per index 000..299, $pjobs at a time; output goes through uniq_merge.py into $seg.tsv
+printf '%03d\n' {0..299} | parallel -j $pjobs doit '{}' | uniq_merge.py > $seg.tsv
+# NOTE(review): n, nodename, N and node are never assigned in this script -- confirm the caller exports them
+echo $(date) task $n.$task on $nodename:$N.$node end >&2
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/share_by_task.sh	Thu Jul 28 14:45:35 2022 +0100
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Usage: share_by_task.sh [-f format] [-s from to] N task
+# Filter a sequence by mod N == task
+# Sequence is stdin, or if -s then seq $from $to
+# Options are positional: -f is recognised only as the first argument,
+# -s only as the first argument remaining after -f has been handled.
+
+# printf format for each selected item; the default prints it plus a newline
+f='%s\n'
+if [[ $1 == '-f' ]]
+then
+ f=$2
+ shift; shift
+fi
+
+# Producer of the sequence; stored as a string and expanded unquoted by the loop below
+source="cat"
+if [[ $1 == '-s' ]]
+then
+ source="seq $2 $3"
+ shift; shift; shift
+fi
+
+pos=0 # items are numbered from 1; one whose number mod N ($1) equals task ($2) is printed
+${source} | while read -r v   # -r: backslashes in an item are kept literally
+ do
+  if [ $((++pos % $1)) -eq "$2" ]
+  then
+    printf "$f" "$v"   # quoted: blanks or glob characters cannot split or expand the item
+  fi
+ done