changeset 163:ef961d91eea5

previous approach to lang/field extraction
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 18 Jul 2022 18:16:27 +0100
parents e82981075b4a
children 00b14a35280e
files bin/cdx_tab.sh bin/clmp.sh bin/doC2T.sh bin/doCLM.sh
diffstat 4 files changed, 56 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/cdx_tab.sh	Mon Jul 18 18:16:27 2022 +0100
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Invoke this as e.g. sbatch -n 30 -c 10 masterJob.sh cdx_tab CC-MAIN-2019-35
+# Run doC2T.sh in parallel: $1 = crawl id, one job per line of cdx_tab/$SLURM_PROCID.txt
+n=$SLURM_NTASKS
+c=${SLURM_CPUS_PER_TASK:-1}   # Slurm only sets this when -c is given; an empty -j would swallow the next word
+node=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+proc=$SLURM_PROCID
+echo $(date) $node:$proc start
+# Print which parallel will be used
+type parallel
+#module load gnu-parallel
+# Exported so that doC2T.sh and the Python it launches inherit the setting
+export PYTHONPATH=$PYTHONPATH:$HOME/lib/python
+parallel --will-cite -j "$c" doC2T.sh "$1" '{}' < "cdx_tab/$proc.txt"
+
+echo $(date) $proc end
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/clmp.sh	Mon Jul 18 18:16:27 2022 +0100
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Invoke this as e.g. sbatch --time=5:00:00 --exclusive -N 5 --ntasks-per-node 2 -c 1 masterJob.sh clmp CC-MAIN-2019-35 cdx_tab1
+# Run doCLM.sh in parallel: $1 = crawl id, $2 = directory holding one <SLURM_NODEID>.txt job list per node
+n=$SLURM_NTASKS
+c=${SLURM_CPUS_PER_TASK:-1}   # Slurm only sets this when -c is given; an empty -j would swallow the next word
+nodename=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+node=$SLURM_NODEID
+echo $(date) $nodename:$node start
+# Print which parallel will be used
+type parallel
+#module load gnu-parallel
+# NOTE(review): the job list is keyed by node id, so several tasks on one node would read the same file -- confirm against masterJob.sh
+export PYTHONPATH=$PYTHONPATH:$HOME/lib/python
+parallel --will-cite -j "$c" doCLM.sh "$1" '{}' < "$2/$node.txt"
+
+echo $(date) $nodename:$node end
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/doC2T.sh	Mon Jul 18 18:16:27 2022 +0100
@@ -0,0 +1,10 @@
+#!/usr/bin/bash
+# Usage: doC2T.sh CRAWL N -- run data/CRAWL/cdx/warc/cdx-NNNNN.gz through cdx2tsv.py, leaving tbl_N.raw.tsv and tbl_N.errs in ./results
+c=${1:?usage: doC2T.sh CRAWL N}
+i=${2:?usage: doC2T.sh CRAWL N}   # an empty N would otherwise silently format as cdx-00000.gz
+f=$(printf 'cdx-%05.0f.gz' "$i")  # %.0f accepts zero-padded input such as 008, which %d would reject as octal
+unpigz -dp 1 -c "data/$c/cdx/warc/$f"  | cdx2tsv.py '(filename,f.split("/",maxsplit=5)[4][0])' '(url,f.split(":",maxsplit=1)[0])' mime mime-detected languages > "/dev/shm/tbl_${i}.raw.tsv" 2> "/dev/shm/tbl_${i}.errs"
+#| tee /dev/shm/data_${i}
+#wc -l /dev/shm/data_${i} /dev/shm/tbl_${i}.raw.tsv
+mv "/dev/shm/tbl_${i}".{errs,raw.tsv} results
+#rm /dev/shm/data_${i}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/doCLM.sh	Mon Jul 18 18:16:27 2022 +0100
@@ -0,0 +1,12 @@
+#!/usr/bin/bash
+# Usage: doCLM.sh CRAWL N -- pipe data/CRAWL/cdx/warc/cdx-NNNNN.gz through ix.py in chunks, then tar the per-chunk outputs into $HOME/results
+mkdir -p /dev/shm/hst
+c=${1:?usage: doCLM.sh CRAWL N}
+i=${2:?usage: doCLM.sh CRAWL N}   # an empty N would otherwise silently format as cdx-00000.gz
+f=$(printf 'cdx-%05.0f.gz' "$i")
+# --pipe -N 50000 hands ix.py 50000 records at a time, 10 jobs at once; {#} is parallel's job sequence number
+unpigz -dp 1 -c "data/$c/cdx/warc/$f"  |  parallel --willcite --pipe -N 50000 -j10 "ix.py -x -h -c '/lustre/home/dc007/hst/bin/clm.sh /dev/shm/hst/'$i'.lmh_{#}.txt' 2>/dev/shm/hst/$i.lmh_{#}.errs"
+# Archive this index file's chunk outputs; stop if cd fails so tar never globs in the wrong directory
+cd /dev/shm/hst || exit
+tar -czf "$HOME/results/${i}.lmh.tar.gz" "${i}".lmh_*.{txt,errs}
+