# HG changeset patch # User Henry S. Thompson # Date 1586972658 -3600 # Node ID 849ccd30258daffe558287cf70496d8a8d78133b # Parent c0c030e8b2191c75409f2a00c4371dcfb6e70f24 final most general version diff -r c0c030e8b219 -r 849ccd30258d bin/psplitTars.sh --- a/bin/psplitTars.sh Tue Apr 14 17:52:34 2020 +0100 +++ b/bin/psplitTars.sh Wed Apr 15 18:44:18 2020 +0100 @@ -1,5 +1,5 @@ #!/usr/bin/bash -cat /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt | \ +cat /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt |\ parallel --will-cite -j 4 -n 1 ' segid={} echo $segid starting @@ -10,10 +10,18 @@ tar -x --skip-old-files -f ${h}/${segid}/extracts.tar pfx=$(ls ${h}/${segid}/CC-MAIN-*-00000.warc.gz |\ cut -f 6 -d / | cut -f 3,4 -d -) + echo $segid/$pfx + if ls logs/*_?_log > /dev/null + then + cd logs + ls | sed "s/^\([0-9]*\)_\([0-9]*\)_log/\1 \2/" | \ + while read j i; do if [ -z "$j" ]; then k=1; else k=$j; fi ; mv ${j}_${i}_log $(printf %s_%03.0f_log $k $((i - 1))); done + cd .. + fi cat ../by11s.txt | while read i j do ((n=i/11)) tar -cf /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extract_${n}.tar \ - $(seq $i $j | xargs -I ^ bash -c "{ k=^; printf \"\${0}-%05.0f_* logs/*_%03.0f_log\n\" \$k \$k ; }" $pfx) + $(seq $i $j | xargs -I ^ bash -c "{ k=^; printf \"${pfx}-%05.0f_* logs/*_%03.0f_log\n\" \$k \$k ; }") done rm -rf /dev/shm/rex/${segid}/* echo $(date) $segid done