changeset 56:c0c030e8b219

Too big for /dev/shm with 8 parallel jobs; cut the job count in half (-j 8 to -j 4)
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 14 Apr 2020 17:52:34 +0100
parents 50556ac15e88
children 849ccd30258d
files bin/psplitTars.sh
diffstat 1 files changed, 3 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/bin/psplitTars.sh	Tue Apr 14 16:10:22 2020 +0100
+++ b/bin/psplitTars.sh	Tue Apr 14 17:52:34 2020 +0100
@@ -1,13 +1,13 @@
 #!/usr/bin/bash
 cat /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt | \
-  parallel --will-cite -j 8 -n 1 '
+  parallel --will-cite -j 4 -n 1 '
     segid={}
     echo $segid starting
     ccid=2019-35
     h=/beegfs/common_crawl/CC-MAIN-${ccid}
     mkdir -p /dev/shm/rex/${segid}
     cd /dev/shm/rex/${segid}
-    tar -xf ${h}/${segid}/extracts.tar
+    tar -x --skip-old-files -f ${h}/${segid}/extracts.tar
     pfx=$(ls ${h}/${segid}/CC-MAIN-*-00000.warc.gz |\
 	  cut -f 6 -d / | cut -f 3,4 -d -)
     cat ../by11s.txt | while read i j
@@ -15,5 +15,6 @@
        tar -cf /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extract_${n}.tar \
 	 $(seq $i $j | xargs -I ^ bash -c "{ k=^; printf \"\${0}-%05.0f_* logs/*_%03.0f_log\n\" \$k \$k ; }" $pfx)
        done
+    rm -rf /dev/shm/rex/${segid}/*
     echo $(date) $segid done
 '