changeset 57:849ccd30258d

final most general version
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 15 Apr 2020 18:44:18 +0100
parents c0c030e8b219
children 4f31d3234620
files bin/psplitTars.sh
diffstat 1 files changed, 10 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/bin/psplitTars.sh	Tue Apr 14 17:52:34 2020 +0100
+++ b/bin/psplitTars.sh	Wed Apr 15 18:44:18 2020 +0100
@@ -1,5 +1,5 @@
 #!/usr/bin/bash
-cat /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt | \
+cat /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt |\
   parallel --will-cite -j 4 -n 1 '
     segid={}
     echo $segid starting
@@ -10,10 +10,18 @@
     tar -x --skip-old-files -f ${h}/${segid}/extracts.tar
     pfx=$(ls ${h}/${segid}/CC-MAIN-*-00000.warc.gz |\
 	  cut -f 6 -d / | cut -f 3,4 -d -)
+    echo $segid/$pfx
+    if ls logs/*_?_log > /dev/null
+    then
+      cd logs
+      ls | sed "s/^\([0-9]*\)_\([0-9]*\)_log/\1 \2/" | \
+        while read j i; do if [ -z "$j" ]; then k=1; else k=$j; fi ; mv ${j}_${i}_log $(printf %s_%03.0f_log $k $((i - 1))); done
+      cd ..
+    fi
     cat ../by11s.txt | while read i j
        do ((n=i/11))
        tar -cf /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extract_${n}.tar \
-	 $(seq $i $j | xargs -I ^ bash -c "{ k=^; printf \"\${0}-%05.0f_* logs/*_%03.0f_log\n\" \$k \$k ; }" $pfx)
+	 $(seq $i $j | xargs -I ^ bash -c "{ k=^; printf \"${pfx}-%05.0f_* logs/*_%03.0f_log\n\" \$k \$k ; }")
        done
     rm -rf /dev/shm/rex/${segid}/*
     echo $(date) $segid done