view bin/splitTars.sh @ 184:53a8ffe06460

use csing, and _runme_c.sh to get it initialised
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 08 Sep 2023 21:44:48 +0100
parents 0520ee00e35b
children
line wrap: on
line source

#!/bin/bash
# Split each segment's extracts.tar into several smaller extract_<n>.tar files.
#
# For every segment id listed in ${h}/bigtar.txt:
#   1. unpack ${h}/<segid>/extracts.tar into ${work}/<segid>;
#   2. derive the member-name prefix from the name of the segment's
#      CC-MAIN-*-00000.warc.gz file (its 3rd and 4th '-'-separated fields);
#   3. for each "first last" pair of numbers in ${work}/by11s.txt, write
#      ${h}/<segid>/extract_<first/11>.tar containing, for every number k in
#      first..last, the unpacked files <prefix>-<k as 5 digits>_* and
#      logs/*_<k as 3 digits>_log.
#
# The unpacked copy under ${work}/<segid> is left in place afterwards.
# Progress goes to stdout, diagnostics to stderr.
# Exit status: 0 if every segment was split without error, 1 otherwise.

ccid=2019-35
h=/beegfs/common_crawl/CC-MAIN-${ccid}
work=/dev/shm/rex

#######################################
# Build the list of tar members for one range of numbers.
# Patterns are expanded against the current directory; a pattern that matches
# nothing is kept literally so that tar reports the missing member instead of
# silently producing a short archive.
# Globals:   members (written) - array of member paths
# Arguments: $1 - file-name prefix, $2 - first number, $3 - last number
#######################################
collect_members() {
  local pfx=$1 first=$2 last=$3
  local k file_no log_no
  members=()
  for ((k = first; k <= last; k++)); do
    printf -v file_no '%05d' "$k"
    printf -v log_no '%03d' "$k"
    # Only the quoted parts are literal; the bare * is globbed here.
    members+=("${pfx}-${file_no}"_* logs/*_"${log_no}"_log)
  done
}

#######################################
# Unpack one segment and re-pack it into the per-range archives.
# Globals:   h, work (read)
# Arguments: $1 - segment id
# Returns:   0 if every archive was written, 1 on any failure
#######################################
split_segment() {
  local segid=$1
  local seg_home=${h}/${segid}
  local seg_work=${work}/${segid}
  local ranges=${work}/by11s.txt
  local -a first_warc members
  local name junk start end pfx first last n
  local status=0

  # Never unpack into whatever directory we happened to be in.
  if ! { mkdir -p "$seg_work" && cd "$seg_work"; }; then
    echo "${segid}: cannot enter ${seg_work}" >&2
    return 1
  fi
  if ! tar -xf "${seg_home}/extracts.tar"; then
    echo "${segid}: cannot unpack ${seg_home}/extracts.tar" >&2
    return 1
  fi

  # Prefix = 3rd and 4th '-'-separated fields of the 00000 warc file's name.
  first_warc=("${seg_home}"/CC-MAIN-*-00000.warc.gz)
  if [[ ! -e ${first_warc[0]} ]]; then
    echo "${segid}: no CC-MAIN-*-00000.warc.gz in ${seg_home}" >&2
    return 1
  fi
  name=${first_warc[0]##*/}
  IFS=- read -r junk junk start end junk <<<"$name"
  pfx=${start}-${end}

  # fd 3 keeps the range list away from the stdin of the commands in the loop.
  while read -r first last junk <&3 || [[ -n $first ]]; do
    [[ -n $first && -n $last ]] || continue
    n=$((first / 11))
    collect_members "$pfx" "$first" "$last"
    if ! tar -cf "${seg_home}/extract_${n}.tar" "${members[@]}"; then
      echo "${segid}: tar failed for extract_${n}.tar" >&2
      status=1
    fi
  done 3<"$ranges"

  return "$status"
}

#######################################
# Process every segment named in the segment list.
# Globals:   h, work (read)
# Returns:   0 if all segments succeeded, 1 otherwise
#######################################
main() {
  local list=${h}/bigtar.txt
  local ranges=${work}/by11s.txt
  local -a segids
  local segid
  local status=0

  if [[ ! -r $list ]]; then
    echo "cannot read segment list ${list}" >&2
    return 1
  fi
  if [[ ! -r $ranges ]]; then
    echo "cannot read range list ${ranges}" >&2
    return 1
  fi

  # -d '' reads the whole file; -a splits it on whitespace without globbing.
  # read reports non-zero on reaching end of file, which is expected here.
  read -r -d '' -a segids <"$list" || true

  for segid in "${segids[@]}"; do
    echo "${segid} starting"
    if split_segment "$segid"; then
      echo "$(date) ${segid} done"
    else
      status=1
    fi
  done
  return "$status"
}

main "$@"