# HG changeset patch
# User Henry S. Thompson
# Date 1658164587 -3600
# Node ID ef961d91eea584e160deffdcca8cd95f8e5b6976
# Parent  e82981075b4a9591e0a38468b029a81a414c0afd
previous approach to lang/field extraction

diff -r e82981075b4a -r ef961d91eea5 bin/cdx_tab.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/cdx_tab.sh	Mon Jul 18 18:16:27 2022 +0100
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Invoke this as e.g. sbatch -n 30 -c 10 masterJob.sh cdx_segment CC-MAIN-2019-35
+# run cdx_segment.py in parallel
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+node=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+proc=$SLURM_PROCID
+echo $(date) $node:$proc start
+
+type parallel
+#module load gnu-parallel
+
+PYTHONPATH=$PYTHONPATH:$HOME/lib/python
+parallel --will-cite -j $c doC2T.sh "$1" '{}' < cdx_tab/$proc.txt
+
+echo $(date) $proc end
diff -r e82981075b4a -r ef961d91eea5 bin/clmp.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/clmp.sh	Mon Jul 18 18:16:27 2022 +0100
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Invoke this as e.g. sbatch --time=5:00:00 --exclusive -N 5 --ntasks-per-node 2 -c 1 masterJob.sh clmp CC-MAIN-2019-35 cdx_tab1
+# run clm.py in parallel
+n=$SLURM_NTASKS
+c=$SLURM_CPUS_PER_TASK
+nodename=$SLURMD_NODENAME
+local=$SLURM_LOCALID
+node=$SLURM_NODEID
+echo $(date) $nodename:$node start
+
+type parallel
+#module load gnu-parallel
+
+export PYTHONPATH=$PYTHONPATH:$HOME/lib/python
+parallel --will-cite -j $c doCLM.sh "$1" '{}' < $2/$node.txt
+
+echo $(date) $nodename:$node end
diff -r e82981075b4a -r ef961d91eea5 bin/doC2T.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/doC2T.sh	Mon Jul 18 18:16:27 2022 +0100
@@ -0,0 +1,10 @@
+#!/usr/bin/bash
+c=$1
+i=$2
+f=$(printf 'cdx-%05.0f.gz' $i)
+unpigz -dp 1 -c data/$c/cdx/warc/$f | cdx2tsv.py '(filename,f.split("/",maxsplit=5)[4][0])' '(url,f.split(":",maxsplit=1)[0])' mime mime-detected languages > /dev/shm/tbl_${i}.raw.tsv 2> /dev/shm/tbl_${i}.errs
+#| tee /dev/shm/data_${i}
+#wc -l /dev/shm/data_${i} /dev/shm/tbl_${i}.raw.tsv
+mv /dev/shm/tbl_${i}.{errs,raw.tsv} results
+#rm /dev/shm/data_${i}
+
diff -r e82981075b4a -r ef961d91eea5 bin/doCLM.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/doCLM.sh	Mon Jul 18 18:16:27 2022 +0100
@@ -0,0 +1,12 @@
+#!/usr/bin/bash
+mkdir -p /dev/shm/hst
+
+c=$1
+i=$2
+f=$(printf 'cdx-%05.0f.gz' $i)
+
+unpigz -dp 1 -c data/$c/cdx/warc/$f | parallel --willcite --pipe -N 50000 -j10 "ix.py -x -h -c '/lustre/home/dc007/hst/bin/clm.sh /dev/shm/hst/'$i'.lmh_{#}.txt' 2>/dev/shm/hst/$i.lmh_{#}.errs"
+
+cd /dev/shm/hst
+tar -czf $HOME/results/${i}.lmh.tar.gz ${i}.lmh_*.{txt,errs}
+