view bin/psplitTars.sh @ 129:b51d65ed6c89

improve error handling
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 09 Jul 2021 13:45:43 +0000
parents 849ccd30258d
children
line wrap: on
line source

#!/usr/bin/bash
# Split the single extracts.tar of each listed segment into several smaller
# extract_<n>.tar archives.
#
# Input:  /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt, one segment id
#         per line.  GNU parallel runs up to 4 jobs at once, one id per job.
#
# Each job:
#   1. unpacks <segment>/extracts.tar into /dev/shm/rex/<segment>;
#   2. takes the file-name prefix from the 3rd and 4th "-"-separated fields
#      of the CC-MAIN-*-00000.warc.gz file name in the segment directory;
#   3. if any log has a one-character index (logs/*_?_log), renames every
#      log <j>_<i>_log to <j>_<i-1 zero-padded to 3 digits>_log;
#   4. for each "first last" line of ../by11s.txt (i.e. the by11s.txt next
#      to the per-segment scratch directories), archives the extracts and
#      logs numbered first..last into <segment>/extract_<first/11>.tar;
#   5. empties the scratch directory.
#
# A job exits non-zero, with a message on stderr, when the scratch
# directory cannot be used, by11s.txt is unreadable, unpacking fails, no
# prefix can be determined, or any extract_<n>.tar could not be written
# cleanly.  Blank segment ids are skipped: with an empty id the final
# cleanup would otherwise remove the scratch data of every other job.
#
# The job text below is one single-quoted string that parallel rewrites
# before running it, so it must contain no single quotes and no {} other
# than the segment id placeholder.  That is why all commentary lives here.
#
# NOTE(review): in the rename step "read j i" drops leading blanks, so a
# log named _<i>_log is read as j=<i> with an empty i and the k=1 fallback
# looks unreachable.  Left unchanged - confirm the intended behaviour.
parallel --will-cite -j 4 -n 1 '
    segid={}
    if [ -z "$segid" ]
    then
      echo "skipping blank segment id" >&2
      exit 0
    fi
    echo $segid starting
    ccid=2019-35
    h=/beegfs/common_crawl/CC-MAIN-${ccid}
    work=/dev/shm/rex/${segid}
    if ! mkdir -p "$work" || ! cd "$work"
    then
      echo "$segid: cannot enter $work, giving up" >&2
      exit 1
    fi
    if [ ! -r ../by11s.txt ]
    then
      echo "$segid: cannot read $work/../by11s.txt, giving up" >&2
      exit 1
    fi
    if ! tar -x --skip-old-files -f "${h}/${segid}/extracts.tar"
    then
      echo "$segid: unpacking ${h}/${segid}/extracts.tar failed, giving up" >&2
      rm -rf "$work"/*
      exit 1
    fi
    pfx=$(ls "${h}/${segid}"/CC-MAIN-*-00000.warc.gz |
          cut -f 6 -d / | cut -f 3,4 -d -)
    echo $segid/$pfx
    if [ -z "$pfx" ]
    then
      echo "$segid: no CC-MAIN-*-00000.warc.gz found in ${h}/${segid}, giving up" >&2
      rm -rf "$work"/*
      exit 1
    fi
    if ls logs/*_?_log > /dev/null
    then
      (
        cd logs &&
          ls | sed "s/^\([0-9]*\)_\([0-9]*\)_log/\1 \2/" |
          while read -r j i
          do
            if [ -z "$j" ]; then k=1; else k=$j; fi
            mv "${j}_${i}_log" "$(printf %s_%03.0f_log "$k" "$((i - 1))")"
          done
      )
    fi
    rc=0
    while read -r i j
    do
      n=$((i / 11))
      members=$(for k in $(seq $i $j)
                do
                  printf "%s-%05.0f_* logs/*_%03.0f_log\n" "$pfx" $k $k
                done)
      if ! tar -cf "${h}/${segid}/extract_${n}.tar" $members
      then
        echo "$segid: tar reported errors writing extract_${n}.tar" >&2
        rc=1
      fi
    done < ../by11s.txt
    rm -rf "$work"/*
    echo $(date) $segid done
    exit $rc
' < /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt