# HG changeset patch
# User Henry S. Thompson
# Date 1635423068 0
# Node ID 0072e4ee6c678254512ae80684d20c759dfe18dc
# Parent 66d17f7410f2e1c333c99635a6cc4faadaa50555
use sqlite3 just to tabulate

diff -r 66d17f7410f2 -r 0072e4ee6c67 bin/doS2C.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/doS2C.sh	Thu Oct 28 12:11:08 2021 +0000
@@ -0,0 +1,31 @@
+#!/usr/bin/bash
+# Usage: doS2C.sh node task cc resdir segs workd dbfile
+# Tabulate the props table of dbfile with sqlite3, writing the result as
+#  CSV to workd/<i>.csv, where <i> is whatever follows the first "cdx" in
+#  dbfile once its .db suffix has been removed.
+# node, task and segs are used only to label the progress lines;
+#  cc and resdir are accepted but not used.
+node=$1
+task=$2
+cc=$3
+resdir=$4
+segs=$5
+workd=$6
+dbfile=$7
+
+f=${dbfile%.db}
+i=${f#*cdx}
+
+echo "> $node.$task: $segs $i"
+
+sqlite3 "$dbfile" ".mode csv" ".once $workd/$i.csv" "select count(*),* from props group by segment,ftype,https,nlangs" ".quit"
+status=$?
+
+if [ $status -ne 0 ]
+then
+  echo "! $node.$task: sqlite3 failed ($status) on $dbfile" >&2
+fi
+
+echo "< $node.$task: $segs $i"
+
+exit $status
diff -r 66d17f7410f2 -r 0072e4ee6c67 bin/sql2csv.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/sql2csv.sh	Thu Oct 28 12:11:08 2021 +0000
@@ -0,0 +1,68 @@
+#!/bin/bash
+# Invoke this as e.g. sbatch -N 4 --ntasks=3 -c 5 masterJob.sh sql2csv \
+#                       CC-MAIN-2019-35 cdx_db 20-43
+# Usage: sql2csv.sh cc resdir segs
+# Unpack the .db files from $HOME/results/cc/resdir/segs.node.task.tar.gz
+#  into a per-task directory under /dev/shm, run doS2C.sh on each of them
+#  in parallel, copy the resulting .csv files to $HOME/results/cc/resdir
+#  and then empty the per-task directory.
+# Exits non-zero if any of those steps failed.
+n=$SLURM_NTASKS
+# SLURM_CPUS_PER_TASK is only set when -c was given, so fall back to 1
+c=${SLURM_CPUS_PER_TASK:-1}
+# Node name is for the log lines only; node (the node ID) names the tarball
+nodename=$SLURMD_NODENAME
+task=$SLURM_LOCALID
+node=$SLURM_NODEID
+
+cc=$1
+resdir=$2
+segs=$3
+
+if [ -z "$cc" ] || [ -z "$resdir" ] || [ -z "$segs" ]
+then
+  echo "Usage: $0 cc resdir segs" >&2
+  exit 2
+fi
+
+echo $(date) $nodename:$node:$task start
+
+export PYTHONPATH=$PYTHONPATH:$HOME/lib/python
+
+res=$HOME/results/$cc/$resdir
+ld=/dev/shm/ht/$task
+status=0
+
+mkdir -p "$ld" || exit 1
+
+cd "$ld" || exit 1
+tar --wildcards -xf "$res/$segs.$node.$task.tar.gz" '*.db' || status=1
+cd "$HOME" || status=1
+
+# Glob rather than parse ls; nullglob gives an empty array on no match
+shopt -s nullglob
+dbs=("$ld"/*.db)
+shopt -u nullglob
+
+if [ ${#dbs[@]} -gt 0 ]
+then
+  printf '%s\n' "${dbs[@]}" | \
+    parallel --will-cite -j "$c" doS2C.sh "$node" "$task" "$cc" "$resdir" "$segs" "$ld" '{}' || status=1
+else
+  echo "$nodename:$node:$task no .db files in $ld" >&2
+  status=1
+fi
+
+# Only clean up once we really are in $ld: rm * anywhere else would be
+#  destructive
+if cd "$ld"
+then
+  cp -- *.csv "$res" || status=1
+  rm -- *
+else
+  status=1
+fi
+
+echo $(date) $nodename:$node:$task end
+
+exit $status