Mercurial > hg > cc > cirrus_home
view bin/splitTars.sh @ 140:0a447db5cf1c
move to ec164.guest
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 19 Oct 2021 12:55:30 +0000 |
parents | 0520ee00e35b |
children |
line wrap: on
line source
#!/bin/bash
#
# splitTars.sh - split each segment's extracts.tar into smaller archives.
#
# For every segment ID listed in ${SEGMENT_LIST}:
#   1. unpack ${CC_ROOT}/<segid>/extracts.tar into ${WORK_ROOT}/<segid>;
#   2. for every "<first> <last>" line of ${RANGES_FILE}, pack the unpacked
#      files numbered first..last (together with their logs/*_NNN_log files)
#      into ${CC_ROOT}/<segid>/extract_<first/11>.tar.
#
# No `set -e` on purpose: a segment that fails must not stop the remaining
# segments.  Every critical step is checked explicitly instead, and the
# script's exit status is non-zero if any segment failed.
#
# NOTE(review): the files unpacked under ${WORK_ROOT} are never removed,
# exactly as before -- confirm whether that is intentional.

readonly CCID=2019-35
readonly CC_ROOT="/beegfs/common_crawl/CC-MAIN-${CCID}"
readonly SEGMENT_LIST="${CC_ROOT}/bigtar.txt"
readonly WORK_ROOT=/dev/shm/rex
# Presumably each line covers a block of 11 file numbers (the archive index
# is computed as first/11) -- verify against whatever generates this file.
readonly RANGES_FILE="${WORK_ROOT}/by11s.txt"

#######################################
# Derive the file-name prefix shared by a segment's unpacked files.
# Arguments: $1 - path of the segment's CC-MAIN-*-00000.warc.gz file
# Outputs:   the 3rd and 4th '-'-separated fields of the base name,
#            joined by '-', on stdout
#######################################
warc_prefix() {
  local base=${1##*/}
  local ts1 ts2
  IFS=- read -r _ _ ts1 ts2 _ <<<"${base}"
  printf '%s-%s\n' "${ts1}" "${ts2}"
}

#######################################
# List the archive members for one range of file numbers.
# Globs are resolved against the current directory.  A pattern that matches
# nothing is printed literally, so that tar reports the missing file rather
# than silently producing a short archive.
# Arguments: $1 - file-name prefix (see warc_prefix)
#            $2 - first file number of the range
#            $3 - last file number of the range (inclusive)
# Outputs:   one member path per line on stdout
#######################################
segment_members() {
  local pfx=$1 first=$2 last=$3
  local k file_no log_no
  for ((k = first; k <= last; k++)); do
    # The previous `xargs ... bash -c "... $k $k"` form let the outer shell
    # expand $k (unset there), so every pattern was built for number 0.
    printf -v file_no '%05d' "${k}"
    printf -v log_no '%03d' "${k}"
    printf '%s\n' "${pfx}-${file_no}"_* logs/*_"${log_no}"_log
  done
}

#######################################
# Unpack one segment and re-pack it range by range.
# Globals:   CC_ROOT, WORK_ROOT, RANGES_FILE (read)
# Arguments: $1 - segment ID
# Returns:   0 if every archive was written, non-zero otherwise
#######################################
process_segment() {
  local segid=$1
  local seg_dir="${CC_ROOT}/${segid}"
  local work_dir="${WORK_ROOT}/${segid}"
  local pfx first last n failed=0
  local -a warcs members

  mkdir -p -- "${work_dir}" || return
  # Extraction and the member globs below are relative to the current
  # directory, so a failed cd must stop this segment.
  cd -- "${work_dir}" || return
  tar -xf "${seg_dir}/extracts.tar" || return

  warcs=("${seg_dir}"/CC-MAIN-*-00000.warc.gz)
  if [[ ! -e ${warcs[0]} ]]; then
    printf '%s: no CC-MAIN-*-00000.warc.gz in %s\n' "${0##*/}" "${seg_dir}" >&2
    return 1
  fi
  pfx=$(warc_prefix "${warcs[0]}")

  if [[ ! -r ${RANGES_FILE} ]]; then
    printf '%s: cannot read %s\n' "${0##*/}" "${RANGES_FILE}" >&2
    return 1
  fi

  while read -r first last _ || [[ -n ${first} ]]; do
    # Skip blank or incomplete lines.
    [[ -n ${first} && -n ${last} ]] || continue
    n=$((first / 11))
    mapfile -t members < <(segment_members "${pfx}" "${first}" "${last}")
    if ((${#members[@]} == 0)); then
      printf '%s: empty range %s..%s for %s\n' \
        "${0##*/}" "${first}" "${last}" "${segid}" >&2
      failed=1
      continue
    fi
    tar -cf "${seg_dir}/extract_${n}.tar" "${members[@]}" || failed=1
  done <"${RANGES_FILE}"

  return "${failed}"
}

#######################################
# Process every segment named in SEGMENT_LIST.
# Globals:   SEGMENT_LIST (read)
# Outputs:   "<segid> starting" / "<date> <segid> done" progress on stdout,
#            diagnostics on stderr
# Returns:   0 if all segments succeeded, 1 otherwise
#######################################
main() {
  local -a segids=()
  local segid status=0

  if [[ ! -r ${SEGMENT_LIST} ]]; then
    printf '%s: cannot read segment list %s\n' "${0##*/}" "${SEGMENT_LIST}" >&2
    return 1
  fi
  # Whitespace-separated IDs, as with the former $(cat ...), but without
  # glob expansion.  read returns non-zero at EOF here; that is expected.
  read -r -d '' -a segids <"${SEGMENT_LIST}"

  for segid in "${segids[@]}"; do
    printf '%s starting\n' "${segid}"
    if process_segment "${segid}"; then
      printf '%s %s done\n' "$(date)" "${segid}"
    else
      printf '%s: segment %s failed\n' "${0##*/}" "${segid}" >&2
      status=1
    fi
  done
  return "${status}"
}

main "$@"