changeset 55:50556ac15e88

one-off to convert big extracts.tar into lots of smaller ones
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 14 Apr 2020 16:10:22 +0100
parents 8154560f1e3d
children c0c030e8b219
files bin/psplitTars.sh
diffstat 1 files changed, 19 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/psplitTars.sh	Tue Apr 14 16:10:22 2020 +0100
@@ -0,0 +1,19 @@
+#!/usr/bin/bash
+cat /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt | \
+  parallel --will-cite -j 8 -n 1 '
+    segid={}
+    echo $segid starting
+    ccid=2019-35
+    h=/beegfs/common_crawl/CC-MAIN-${ccid}
+    mkdir -p /dev/shm/rex/${segid}
+    cd /dev/shm/rex/${segid}
+    tar -xf ${h}/${segid}/extracts.tar
+    pfx=$(ls ${h}/${segid}/CC-MAIN-*-00000.warc.gz |\
+	  cut -f 6 -d / | cut -f 3,4 -d -)
+    cat ../by11s.txt | while read i j
+       do ((n=i/11))
+       tar -cf /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extract_${n}.tar \
+	 $(seq $i $j | xargs -I ^ bash -c "{ k=^; printf \"\${0}-%05.0f_* logs/*_%03.0f_log\n\" \$k \$k ; }" $pfx)
+       done
+    echo $(date) $segid done
+'