comparison bin/psplitTars.sh @ 56:c0c030e8b219

Too big for /dev/shm, split in half (parallel job count reduced from 8 to 4)
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 14 Apr 2020 17:52:34 +0100
parents 50556ac15e88
children 849ccd30258d
comparison
Legend: equal | deleted | inserted | replaced
55:50556ac15e88 56:c0c030e8b219
1 #!/usr/bin/bash 1 #!/usr/bin/bash
2 cat /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt | \ 2 cat /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt | \
3 parallel --will-cite -j 8 -n 1 ' 3 parallel --will-cite -j 4 -n 1 '
4 segid={} 4 segid={}
5 echo $segid starting 5 echo $segid starting
6 ccid=2019-35 6 ccid=2019-35
7 h=/beegfs/common_crawl/CC-MAIN-${ccid} 7 h=/beegfs/common_crawl/CC-MAIN-${ccid}
8 mkdir -p /dev/shm/rex/${segid} 8 mkdir -p /dev/shm/rex/${segid}
9 cd /dev/shm/rex/${segid} 9 cd /dev/shm/rex/${segid}
10 tar -xf ${h}/${segid}/extracts.tar 10 tar -x --skip-old-files -f ${h}/${segid}/extracts.tar
11 pfx=$(ls ${h}/${segid}/CC-MAIN-*-00000.warc.gz |\ 11 pfx=$(ls ${h}/${segid}/CC-MAIN-*-00000.warc.gz |\
12 cut -f 6 -d / | cut -f 3,4 -d -) 12 cut -f 6 -d / | cut -f 3,4 -d -)
13 cat ../by11s.txt | while read i j 13 cat ../by11s.txt | while read i j
14 do ((n=i/11)) 14 do ((n=i/11))
15 tar -cf /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extract_${n}.tar \ 15 tar -cf /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extract_${n}.tar \
16 $(seq $i $j | xargs -I ^ bash -c "{ k=^; printf \"\${0}-%05.0f_* logs/*_%03.0f_log\n\" \$k \$k ; }" $pfx) 16 $(seq $i $j | xargs -I ^ bash -c "{ k=^; printf \"\${0}-%05.0f_* logs/*_%03.0f_log\n\" \$k \$k ; }" $pfx)
17 done 17 done
18 rm -rf /dev/shm/rex/${segid}/*
18 echo $(date) $segid done 19 echo $(date) $segid done
19 ' 20 '