Mercurial > hg > cc > cirrus_home
comparison bin/psplitTars.sh @ 56:c0c030e8b219
too big for /dev/shm, split in half
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 14 Apr 2020 17:52:34 +0100 |
parents | 50556ac15e88 |
children | 849ccd30258d |
comparison
Legend: equal, deleted, inserted, replaced
55:50556ac15e88 | 56:c0c030e8b219 |
---|---|
1 #!/usr/bin/bash | 1 #!/usr/bin/bash |
2 cat /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt | \ | 2 cat /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt | \ |
3 parallel --will-cite -j 8 -n 1 ' | 3 parallel --will-cite -j 4 -n 1 ' |
4 segid={} | 4 segid={} |
5 echo $segid starting | 5 echo $segid starting |
6 ccid=2019-35 | 6 ccid=2019-35 |
7 h=/beegfs/common_crawl/CC-MAIN-${ccid} | 7 h=/beegfs/common_crawl/CC-MAIN-${ccid} |
8 mkdir -p /dev/shm/rex/${segid} | 8 mkdir -p /dev/shm/rex/${segid} |
9 cd /dev/shm/rex/${segid} | 9 cd /dev/shm/rex/${segid} |
10 tar -xf ${h}/${segid}/extracts.tar | 10 tar -x --skip-old-files -f ${h}/${segid}/extracts.tar |
11 pfx=$(ls ${h}/${segid}/CC-MAIN-*-00000.warc.gz |\ | 11 pfx=$(ls ${h}/${segid}/CC-MAIN-*-00000.warc.gz |\ |
12 cut -f 6 -d / | cut -f 3,4 -d -) | 12 cut -f 6 -d / | cut -f 3,4 -d -) |
13 cat ../by11s.txt | while read i j | 13 cat ../by11s.txt | while read i j |
14 do ((n=i/11)) | 14 do ((n=i/11)) |
15 tar -cf /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extract_${n}.tar \ | 15 tar -cf /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extract_${n}.tar \ |
16 $(seq $i $j | xargs -I ^ bash -c "{ k=^; printf \"\${0}-%05.0f_* logs/*_%03.0f_log\n\" \$k \$k ; }" $pfx) | 16 $(seq $i $j | xargs -I ^ bash -c "{ k=^; printf \"\${0}-%05.0f_* logs/*_%03.0f_log\n\" \$k \$k ; }" $pfx) |
17 done | 17 done |
18 rm -rf /dev/shm/rex/${segid}/* | |
18 echo $(date) $segid done | 19 echo $(date) $segid done |
19 ' | 20 ' |