Mercurial > hg > cc > cirrus_home
comparison bin/splitTars.sh @ 64:0520ee00e35b
misc
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 24 Apr 2020 19:57:16 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
63:d39fd9c7f1be | 64:0520ee00e35b |
---|---|
#!/bin/bash
# Split each segment's single extracts.tar into several smaller
# extract_<n>.tar archives, one per index range listed in by11s.txt.
#
# For every segment id listed in bigtar.txt this script:
#   1. unpacks <segment>/extracts.tar into a scratch directory under
#      /dev/shm/rex,
#   2. derives the member-name prefix from the segment's *-00000.warc.gz file,
#   3. for each "first last" line of /dev/shm/rex/by11s.txt, packs the
#      members numbered first..last (plus their logs) into
#      <segment>/extract_<first/11>.tar.
#
# Usage: splitTars.sh   (no arguments; all paths are fixed below)
#
# NOTE(review): the scratch directory is left in place after a segment is
# processed; confirm whether something else is expected to clean it up.

readonly ccid=2019-35
readonly crawl_root=/beegfs/common_crawl/CC-MAIN-${ccid}
readonly scratch_root=/dev/shm/rex

# Print a diagnostic on stderr.
warn() {
  printf '%s\n' "$*" >&2
}

#######################################
# Print the 3rd and 4th '-'-separated fields of a file's base name,
# joined by '-' (for CC-MAIN-<a>-<b>-00000.warc.gz this is "<a>-<b>").
# Arguments: $1 - path to a WARC file
# Outputs:   the prefix on stdout
#######################################
warc_prefix() {
  local base=${1##*/}
  local start end
  IFS=- read -r _ _ start end _ <<<"$base"
  printf '%s-%s\n' "$start" "$end"
}

#######################################
# Print the glob patterns selecting the archive members for one index range:
# for every k in first..last, "<prefix>-<k, 5 digits>_*" and
# "logs/*_<k, 3 digits>_log", one pattern per line.
# The index is handed to printf as a real argument; interpolating it into a
# double-quoted "bash -c" string lets the calling shell expand it too early
# (to nothing), which formats every index as zero.
# Arguments: $1 - member-name prefix, $2 - first index, $3 - last index
# Outputs:   patterns on stdout; nothing when first > last
#######################################
member_patterns() {
  local prefix=$1 first=$2 last=$3
  local k
  for ((k = first; k <= last; k++)); do
    printf '%s-%05d_*\nlogs/*_%03d_log\n' "$prefix" "$k" "$k"
  done
}

#######################################
# Unpack one segment's extracts.tar and re-pack it as extract_<n>.tar files.
# Globals:   crawl_root, scratch_root (read)
# Arguments: $1 - segment id
# Returns:   0 if every archive was written, non-zero otherwise
#######################################
split_segment() {
  local segid=$1
  local seg_dir=${crawl_root}/${segid}
  local work_dir=${scratch_root}/${segid}
  local ranges=${scratch_root}/by11s.txt
  local first last pattern pfx n status=0
  local -a warcs members

  mkdir -p -- "$work_dir" || return
  # Never unpack into whatever directory we happen to be in.
  cd -- "$work_dir" || return
  tar -xf "${seg_dir}/extracts.tar" || return

  warcs=("$seg_dir"/CC-MAIN-*-00000.warc.gz)
  if [[ ! -e ${warcs[0]} ]]; then
    warn "$segid: no CC-MAIN-*-00000.warc.gz in $seg_dir"
    return 1
  fi
  pfx=$(warc_prefix "${warcs[0]}")

  if [[ ! -r $ranges ]]; then
    warn "cannot read range list: $ranges"
    return 1
  fi

  while read -r first last || [[ -n $first ]]; do
    [[ -n $first ]] || continue # blank line
    if ! [[ $first =~ ^[0-9]+$ && $last =~ ^[0-9]+$ ]]; then
      warn "$segid: malformed range line: $first $last"
      status=1
      continue
    fi
    # Force base 10 so zero-padded indices are not read as octal.
    first=$((10#$first))
    last=$((10#$last))
    n=$((first / 11))

    members=()
    while IFS= read -r pattern; do
      # Unquoted on purpose: the pattern must be glob-expanded against the
      # unpacked files in the current directory. A pattern that matches
      # nothing stays literal, so tar reports the missing member.
      # shellcheck disable=SC2206
      members+=($pattern)
    done < <(member_patterns "$pfx" "$first" "$last")

    if ((${#members[@]} == 0)); then
      warn "$segid: empty range $first..$last"
      status=1
      continue
    fi
    tar -cf "${seg_dir}/extract_${n}.tar" -- "${members[@]}" || status=1
  done <"$ranges"

  return "$status"
}

#######################################
# Process every segment id listed in bigtar.txt.
# Globals:   crawl_root (read)
# Outputs:   "<segid> starting" / "<date> <segid> done" progress on stdout
# Returns:   0 if all segments succeeded, 1 otherwise
#######################################
main() {
  local list=${crawl_root}/bigtar.txt
  local segid status=0

  if [[ ! -r $list ]]; then
    warn "cannot read segment list: $list"
    return 1
  fi

  # The list is read on fd 3 so commands in the loop body cannot consume it.
  while read -r -u 3 segid || [[ -n $segid ]]; do
    [[ -n $segid ]] || continue
    printf '%s starting\n' "$segid"
    if split_segment "$segid"; then
      printf '%s %s done\n' "$(date)" "$segid"
    else
      warn "$segid failed"
      status=1
    fi
  done 3<"$list"

  return "$status"
}

main "$@"