# Review notes. These lines precede the "# HG changeset patch" marker and are
# not part of the change itself.
#
# What this changeset does to bin/psplitTars.sh, as shown by the diff below:
#   * lowers the GNU parallel job count from `-j 8` to `-j 4`; the commit
#     message gives the reason as "too big for /dev/shm, split in half".
#   * replaces `tar -xf` with `tar -x --skip-old-files -f` when unpacking
#     ${h}/${segid}/extracts.tar into /dev/shm/rex/${segid}.
#   * adds `rm -rf /dev/shm/rex/${segid}/*` after the re-tarring loop and
#     before the final `echo $(date) $segid done`.
#
# NOTE(review): in this copy the entire patch sits on one physical line, so
# the original line breaks and indentation appear to have been lost; restore
# them from the repository before trying to apply it -- confirm.
# NOTE(review): the added `rm -rf` expands ${segid} unguarded, and the earlier
# `cd /dev/shm/rex/${segid}` has no failure check; a follow-up change could use
# `${segid:?}` and `cd ... || exit` so a bad segment id cannot widen the delete
# or make tar extract into the wrong directory.
# HG changeset patch # User Henry S. Thompson # Date 1586883154 -3600 # Node ID c0c030e8b2191c75409f2a00c4371dcfb6e70f24 # Parent 50556ac15e88b35ce85d47d9305438827134fa99 too big for /dev/shm, split in half diff -r 50556ac15e88 -r c0c030e8b219 bin/psplitTars.sh --- a/bin/psplitTars.sh Tue Apr 14 16:10:22 2020 +0100 +++ b/bin/psplitTars.sh Tue Apr 14 17:52:34 2020 +0100 @@ -1,13 +1,13 @@ #!/usr/bin/bash cat /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt | \ - parallel --will-cite -j 8 -n 1 ' + parallel --will-cite -j 4 -n 1 ' segid={} echo $segid starting ccid=2019-35 h=/beegfs/common_crawl/CC-MAIN-${ccid} mkdir -p /dev/shm/rex/${segid} cd /dev/shm/rex/${segid} - tar -xf ${h}/${segid}/extracts.tar + tar -x --skip-old-files -f ${h}/${segid}/extracts.tar pfx=$(ls ${h}/${segid}/CC-MAIN-*-00000.warc.gz |\ cut -f 6 -d / | cut -f 3,4 -d -) cat ../by11s.txt | while read i j @@ -15,5 +15,6 @@ tar -cf /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extract_${n}.tar \ $(seq $i $j | xargs -I ^ bash -c "{ k=^; printf \"\${0}-%05.0f_* logs/*_%03.0f_log\n\" \$k \$k ; }" $pfx) done + rm -rf /dev/shm/rex/${segid}/* echo $(date) $segid done '