changeset 152:0072e4ee6c67

use sqlite3 just to tabulate
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 28 Oct 2021 12:11:08 +0000
parents 66d17f7410f2
children 2b59f3ef2294
files bin/doS2C.sh bin/sql2csv.sh
diffstat 2 files changed, 58 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/doS2C.sh	Thu Oct 28 12:11:08 2021 +0000
@@ -0,0 +1,20 @@
+#!/usr/bin/bash
+# Usage: doS2C.sh node task cc resdir segs workd dbfile
+# Tabulates the props table of dbfile into workd/SUFFIX.csv using sqlite3.
+node=$1      # node and task only label the two log lines
+task=$2
+cc=$3        # cc and resdir are accepted but not used in this script
+resdir=$4
+segs=$5      # echoed in the log lines
+workd=$6     # directory that receives the CSV
+dbfile=$7    # SQLite database file to read
+
+f=${dbfile%.db}   # drop the .db extension
+i=${f#*cdx}       # keep whatever follows the first "cdx" in the path
+echo "> $node.$task: $segs $i"
+sqlite3 "$dbfile" ".mode csv" ".once $workd/$i.csv" "select count(*),* from props group by segment,ftype,https,nlangs" ".quit"
+rc=$?             # keep sqlite3's status; the echo below would otherwise mask it
+echo "< $node.$task: $segs $i"
+exit "$rc"
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/sql2csv.sh	Thu Oct 28 12:11:08 2021 +0000
@@ -0,0 +1,38 @@
+#!/bin/bash
+# Invoke this as e.g. sbatch -N 4 --ntasks=3 -c 5 masterJob.sh sql2csv \
+#                     CC-MAIN-2019-35 cdx_db 20-43
+# Args: cc, resdir, segs, e.g. CC-MAIN-2019-35 cdx_db 20-43.
+# Unpacks the .db files from $HOME/results/$cc/$resdir/$segs.$node.$task.tar.gz
+# into /dev/shm/ht/$task, runs doS2C.sh on each in parallel, copies the CSVs back.
+n=$SLURM_NTASKS                # not used below
+c=${SLURM_CPUS_PER_TASK:-1}    # parallel job slots; 1 if Slurm did not set it
+nodename=$SLURMD_NODENAME      # host name, used only in the log lines
+task=$SLURM_LOCALID
+node=$SLURM_NODEID             # numeric node id, part of the tarball name
+
+cc=$1
+resdir=$2
+segs=$3
+
+echo "$(date) $nodename:$node:$task start"
+
+export PYTHONPATH="$PYTHONPATH:$HOME/lib/python"
+
+ld=/dev/shm/ht/$task
+mkdir -p "$ld"
+
+# Stop if the scratch dir is unusable, so tar and the later rm never run elsewhere
+cd "$ld" || exit 1
+tar --wildcards -xf "$HOME/results/$cc/$resdir/$segs.$node.$task.tar.gz" '*.db'
+cd "$HOME" || exit 1
+
+find "$ld" -maxdepth 1 -name '*.db' | \
+   parallel --will-cite -j "$c" doS2C.sh "$node" "$task" "$cc" "$resdir" "$segs" "$ld" '{}'
+
+cd "$ld" || exit 1
+cp -- *.csv "$HOME/results/$cc/$resdir"   # -- guards names that start with a dash
+rm -- *
+
+echo "$(date) $nodename:$node:$task end"
+
+