# HG changeset patch
# User Henry S. Thompson
# Date 1586877022 -3600
# Node ID 50556ac15e88b35ce85d47d9305438827134fa99
# Parent  8154560f1e3d47ef860c3f53fde3fdfdbe59dc06
one-off to convert big extracts.tar into lots of smaller ones

diff -r 8154560f1e3d -r 50556ac15e88 bin/psplitTars.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/psplitTars.sh	Tue Apr 14 16:10:22 2020 +0100
@@ -0,0 +1,58 @@
+#!/usr/bin/bash
+# One-off: split the big extracts.tar of every listed segment into many
+# smaller extract_N.tar files, one per row of by11s.txt.
+#
+# bigtar.txt supplies one segment id per line. GNU parallel runs up to 8
+# segments at once and substitutes the id for the {} in the job body.
+# The job body is a single-quoted string, so it must not contain a
+# single quote character.
+parallel --will-cite -j 8 -n 1 '
+  segid={}
+  echo $segid starting
+  ccid=2019-35
+  h=/beegfs/common_crawl/CC-MAIN-${ccid}
+  work=/dev/shm/rex/${segid}
+
+  # Unpack into a per-segment scratch directory. Give up on this segment
+  # if the directory is unusable or the unpack fails, rather than
+  # unpacking into, or archiving from, whatever directory we are in.
+  if ! mkdir -p "$work" || ! cd "$work"; then
+    echo "$segid: cannot use $work" >&2
+    exit 1
+  fi
+  if ! tar -xf "${h}/${segid}/extracts.tar"; then
+    echo "$segid: cannot unpack ${h}/${segid}/extracts.tar" >&2
+    exit 1
+  fi
+
+  # pfx is dash-separated fields 3 and 4 of the name of the segment
+  # CC-MAIN-*-00000.warc.gz file. Globbing plus basename stripping
+  # replaces parsing ls output and counting path components.
+  warcs=("${h}/${segid}"/CC-MAIN-*-00000.warc.gz)
+  if [[ ! -e ${warcs[0]} ]]; then
+    echo "$segid: no CC-MAIN-*-00000.warc.gz in ${h}/${segid}" >&2
+    exit 1
+  fi
+  pfx=$(cut -f 3,4 -d - <<<"${warcs[0]##*/}")
+
+  # Each row of ../by11s.txt gives the first and last index of one group.
+  # Group n = first/11 goes to extract_<n>.tar.
+  while read -r first last; do
+    n=$((first / 11))
+    members=()
+    for k in $(seq "$first" "$last"); do
+      printf -v k5 "%05.0f" "$k"
+      printf -v k3 "%03.0f" "$k"
+      # The glob parts stay unquoted so they expand against the
+      # unpacked files in the current directory.
+      members+=("${pfx}-${k5}"_* logs/*_"${k3}"_log)
+    done
+    if ! tar -cf "${h}/${segid}/extract_${n}.tar" "${members[@]}"; then
+      echo "$segid: extract_${n}.tar may be incomplete" >&2
+    fi
+  done <../by11s.txt
+
+  # NOTE(review): the unpacked copy under /dev/shm is left in place,
+  # as before. Confirm whether it should be removed here.
+  echo $(date) $segid done
+' </beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt