#!/bin/bash
#
# Re-bundle the unpacked contents of each crawl segment into per-range tar
# archives.
#
# For every segment id listed in ${h}/bigtar.txt:
#   1. unpack ${h}/<segid>/extracts.tar into /dev/shm/rex/<segid>;
#   2. derive a file-name prefix from the segment's CC-MAIN-*-00000.warc.gz
#      file (the 3rd and 4th '-'-separated fields of its base name);
#   3. for each "first last" pair in /dev/shm/rex/by11s.txt, write
#      ${h}/<segid>/extract_<first/11>.tar containing, for every index k in
#      first..last, the files matching <prefix>-<k as 5 digits>_* and
#      logs/*_<k as 3 digits>_log.
#
# A segment that fails is reported on stderr and skipped; the remaining
# segments are still processed. Exit status is 0 only if every segment
# succeeded.

ccid=2019-35
h=/beegfs/common_crawl/CC-MAIN-${ccid}
work=/dev/shm/rex

# Print a diagnostic on stderr.
warn() {
  printf '%s\n' "$*" >&2
}

#######################################
# Unpack one segment and write its extract_<n>.tar archives.
# Globals:   h, work (read)
# Arguments: $1 - segment id
# Outputs:   "<date> <segid> done" on stdout when everything succeeded;
#            diagnostics on stderr otherwise
# Returns:   0 on success, 1 if any step failed
# The body is a subshell so the cd (and every variable set here) stays
# local to this one segment.
#######################################
process_segment() (
  segid=$1
  src=${h}/${segid}
  dir=${work}/${segid}
  ranges=${work}/by11s.txt

  # Never unpack into some other directory if the work dir is unusable.
  if ! mkdir -p -- "$dir" || ! cd -- "$dir"; then
    warn "${segid}: cannot enter ${dir}"
    return 1
  fi

  # NOTE(review): the unpacked tree is left in place afterwards; confirm
  # that something else reclaims the space under ${work}.
  if ! tar -xf "${src}/extracts.tar"; then
    warn "${segid}: failed to unpack ${src}/extracts.tar"
    return 1
  fi

  # Prefix = fields 3 and 4 (split on '-') of the base name of the first
  # CC-MAIN-*-00000.warc.gz file in the segment directory.
  first_warc=("${src}"/CC-MAIN-*-00000.warc.gz)
  if [[ ! -e ${first_warc[0]} ]]; then
    warn "${segid}: no CC-MAIN-*-00000.warc.gz in ${src}"
    return 1
  fi
  IFS=- read -r _ _ part3 part4 _ <<<"${first_warc[0]##*/}"
  pfx=${part3}-${part4}

  if [[ ! -r $ranges ]]; then
    warn "${segid}: cannot read ${ranges}"
    return 1
  fi

  status=0
  # The ranges file is read on fd 3 so nothing in the loop body can
  # consume it through stdin.
  while read -r i j <&3 || [[ -n $i ]]; do
    [[ -n $i ]] || continue # blank line
    if [[ ! $i =~ ^[0-9]+$ || ! $j =~ ^[0-9]+$ ]]; then
      warn "${segid}: malformed range '${i} ${j}' in ${ranges}"
      status=1
      continue
    fi
    # Force base 10 so zero-padded numbers are not taken as octal.
    i=$((10#$i))
    j=$((10#$j))
    n=$((i / 11))

    # Build the member list in this shell so each index k really ends up
    # in the file names. The globs are expanded here, relative to ${dir};
    # a pattern that matches nothing stays literal, so tar reports the
    # missing file instead of silently omitting it.
    members=()
    for ((k = i; k <= j; k++)); do
      printf -v file_no '%05d' "$k"
      printf -v log_no '%03d' "$k"
      members+=("${pfx}-${file_no}_"* logs/*_"${log_no}"_log)
    done

    if ! tar -cf "${src}/extract_${n}.tar" "${members[@]}"; then
      warn "${segid}: failed to write ${src}/extract_${n}.tar"
      status=1
    fi
  done 3<"$ranges"

  ((status == 0)) || return 1
  echo "$(date) ${segid} done"
)

main() {
  local list=${h}/bigtar.txt
  local -a segids=()
  local segid rc=0

  if [[ ! -r $list ]]; then
    warn "cannot read segment list ${list}"
    return 1
  fi

  # Split the list on any whitespace, without glob expansion. With
  # -d '' read returns non-zero at end of file even though the array has
  # been filled, so its status is deliberately ignored.
  read -r -d '' -a segids <"$list"

  for segid in "${segids[@]}"; do
    echo "${segid} starting"
    process_segment "$segid" || rc=1
  done
  return "$rc"
}

main "$@"