Mercurial > hg > cc > work
changeset 51:dc24bb6e524f
done cdx_aux for segments 49--55 of 2019-35
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 09 Oct 2024 22:55:27 +0100 |
parents | 5556c04c7597 |
children | 8dffb8aa33da |
files | lurid3/notes.txt |
diffstat | 1 files changed, 32 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/lurid3/notes.txt Wed Oct 09 09:43:07 2024 +0100 +++ b/lurid3/notes.txt Wed Oct 09 22:55:27 2024 +0100 @@ -874,3 +874,35 @@ 372 Every file is being produced twice. + +Took me a while to figure out my own code :-( + + >: sbatch --output=slurm_aug_cdx_49_360-599-out --time=01:00:00 --ntasks=10 -c 36 --exclusive $HOME/bin/runme.sh -m 49 49 $PWD -t 18 -b 'export resdir=CC-MAIN-2019-35/aug_cdx/$xarg + export SEG=$xarg + share_by_task.sh -f "%03g\n" -s 360 599 $n $task > /tmp/hst_$task' -i 'cat /tmp/hst_$task' 'export PYTHONPATH=./lib/python/cc:$PYTHONPATH + ~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/*.$SEG/orig/warc/CC-MAIN-*-*-00${arg}.warc.gz > $resdir/00${arg}.tsv' + +Oops, only 560, not 600 + +Took 3.5 minutes for 200, so call it 10 for 560, so do 6 more in an +hour: + + >: sbatch --output=slurm_aug_cdx_50-55_out --time=01:00:00 --ntasks=10 -c 36 --exclusive $HOME/bin/runme.sh -m 50 55 $PWD -t 18 -b 'export resdir=CC-MAIN-2019-35/aug_cdx/$xarg +mkdir -p $resdir +> export SEG=$xarg +share_by_task.sh -f "%03g\n" -s 360 599 $n $task > /tmp/hst_$task' -i 'cat /tmp/hst_$task' 'export PYTHONPATH=./lib/python/cc:$PYTHONPATH + ~/lib/python/cc/cdx_extras.py /beegfs/common_crawl/CC-MAIN-2019-35/*.$SEG/orig/warc/CC-MAIN-*-*-00${arg}.warc.gz > $resdir/00${arg}.tsv' + + >: tail slurm_aug_cdx_50-55_out + ... + Wed Oct 9 22:25:47 BST 2024 Finished 55 + >: head -1 slurm_aug_cdx_50-55_out + Wed Oct 9 21:29:43 BST + 56:04 + + >: du -s CC-MAIN-2019-35/aug_cdx + 1,902,916 + +Not bad, so order 20GB for the whole thing + +Next step, compare to my existing cdx with timestamp