# HG changeset patch
# User Henry S. Thompson
# Date 1659015935 -3600
# Node ID dfdb95e5d77441543d6812a5e6760dcdae7ccdf8
# Parent  cf982df00cbd1923010584d140421e1dd88aa547
catch-up

diff -r cf982df00cbd -r dfdb95e5d774 .hgignore
--- a/.hgignore	Sat Jul 23 11:50:46 2022 +0100
+++ b/.hgignore	Thu Jul 28 14:45:35 2022 +0100
@@ -81,3 +81,4 @@
 .singularity
 .saves-*
 .lesshst
+results
diff -r cf982df00cbd -r dfdb95e5d774 bin/_c2t.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/_c2t.sh	Thu Jul 28 14:45:35 2022 +0100
@@ -0,0 +1,52 @@
+#!/bin/bash
+# run cdx2tsv.py in parallel, taking input directly from cdx-00{000..299}.gz
+# Args: CCmonth resSubdir s0 sn kf fieldSpecs...
+
+N=$SLURM_JOB_NUM_NODES
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+nodename=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+node=$SLURM_NODEID
+task=$SLURM_PROCID
+
+threadsPerTask=2
+pjobs=$((c / $threadsPerTask))
+
+# Every shift renumbers the remaining arguments, so the next one is always $1
+cc=$1
+shift
+resdir=$W/hst/results/$cc/$1
+shift
+s1=$1
+shift
+sn=$1
+shift
+kf=$1 # key field for sorting
+shift
+
+echo $(date) task $n.$task on $nodename:$N.$node start
+
+mkdir -p $resdir
+
+# Process one cdx segment: $1 is the 3-digit segment number, the remaining
+# arguments are handed to cdx2tsv.py unchanged
+doit () {
+  seg=$1
+  shift
+  echo $(date) start $seg $task $PARALLEL_SEQ
+  uz /beegfs/common_crawl/$cc/cdx/warc/cdx-00$seg.gz | \
+    python3 $WSHARED/bin/cdx2tsv.py "$@" \
+    | sort -k$kf,$kf | uniq -c | tr -s ' ' '\t' > $TMPDIR/$seg.tsv
+  echo $(date) moving $seg $task $PARALLEL_SEQ
+  mv $TMPDIR/$seg.tsv $resdir
+  echo $(date) end $seg $task $PARALLEL_SEQ ;}
+
+# doit runs in shells started by parallel, so kf must be exported as well
+export -f doit
+export cc resdir n task kf
+
+$W/hst/bin/share_by_task.sh -f "%03g\n" -s $s1 $sn $n $task | parallel -j $pjobs doit '{}' "$@"
+
+echo $(date) task $n.$task on $nodename:$N.$node end
+
diff -r cf982df00cbd -r dfdb95e5d774 bin/lang_by_seg.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/lang_by_seg.sh	Thu Jul 28 14:45:35 2022 +0100
@@ -0,0 +1,28 @@
+#!/bin/bash
+# This runs on the compute nodes in nl1_counts inside a _runme doit
+# Args: seg
+# Computes the by-language totals for this segment
+
+pjobs=4 # 10 parallel invocations of this are running,
+task=$PARALLEL_SEQ # {1..10}
+
+seg=$1
+
+echo $(date) $seg as 3.$task on start
+
+doit () {
+  i=$1
+  echo $(date) start $task.$seg $i $PARALLEL_SEQ 1>&2
+  awk -v seg=$seg '{if ($2==seg) {ll[$3]+=$1}} END {for (l in ll) print ll[l],l}' top21s_$i.tsv
+  echo $(date) end $task.$seg $i $PARALLEL_SEQ 1>&2
+}
+
+export -f doit
+export task seg
+
+seq -f '%03g' 0 299 | parallel -j $pjobs doit '{}' | uniq_merge.py > $seg.tsv
+
+# NOTE(review): n, nodename, N and node are not assigned in this script; confirm
+# they are inherited from the environment, otherwise this prints empty fields
+echo $(date) task $n.$task on $nodename:$N.$node end 1>&2
+
diff -r cf982df00cbd -r dfdb95e5d774 bin/share_by_task.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/share_by_task.sh	Thu Jul 28 14:45:35 2022 +0100
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Usage: share_by_task.sh [-f format] [-s from to] N task
+# Filter a sequence by mod N == task
+# Sequence is stdin, or if -s then seq $from $to
+if [ "$1" = '-f' ]
+then
+  shift
+  f=$1
+  shift
+else
+  f='%s\n'
+fi
+
+if [ "$1" = '-s' ]
+then
+  shift
+  source="seq $1 $2"
+  shift
+  shift
+else
+  source="cat"
+fi
+
+# Items are counted from 1, so task 0 receives items N, 2N, ...
+pos=0
+${source} | while read -r v
+  do
+    if [ $((++pos % $1)) -eq $2 ]
+    then
+      printf "$f" "$v"
+    fi
+  done