annotate bin/splitTars.sh @ 174:bfe9085a1d39

change account back
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 10 Jan 2023 17:48:26 +0000
parents 0520ee00e35b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
64
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
#!/bin/bash
# splitTars.sh -- re-pack each segment's single extracts.tar into several
# smaller archives, one per range of file numbers.
#
# Original author: Henry S. Thompson <ht@inf.ed.ac.uk>
#
# Inputs (all locations are fixed below):
#   ${h}/bigtar.txt             whitespace-separated list of segment ids
#   ${h}/<segid>/extracts.tar   the archive to split
#   ${scratch}/by11s.txt        one "first last" pair of file numbers per line
# Output:
#   ${h}/<segid>/extract_<n>.tar   where n = first / 11 (integer division)
#
# NOTE(review): the tree unpacked under ${scratch}/<segid> is left in place,
# exactly as before -- confirm whether something else is expected to remove it.

ccid=2019-35
h=/beegfs/common_crawl/CC-MAIN-${ccid}
scratch=/dev/shm/rex

# Word-splitting of the list file is intentional: one segment id per word.
for segid in $(cat "${h}/bigtar.txt")
do
  echo "${segid} starting"

  # Unpack into a per-segment scratch directory.  Every step is checked so
  # that a failure cannot leave us working in the previous segment's tree.
  work=${scratch}/${segid}
  if ! mkdir -p "${work}" || ! cd "${work}"
  then
    echo "${segid}: cannot use ${work}, skipping" >&2
    continue
  fi
  if ! tar -xf "${h}/${segid}/extracts.tar"
  then
    echo "${segid}: failed to unpack extracts.tar, skipping" >&2
    continue
  fi

  # The member-name prefix is fields 3 and 4 of the dash-separated name of
  # the segment's file number 00000:  CC-MAIN-<a>-<b>-00000.warc.gz -> <a>-<b>
  warcs=( "${h}/${segid}"/CC-MAIN-*-00000.warc.gz )
  if [[ ! -e ${warcs[0]} ]]
  then
    echo "${segid}: no CC-MAIN-*-00000.warc.gz found, skipping" >&2
    continue
  fi
  pfx=$(basename "${warcs[0]}" | cut -f 3,4 -d -)

  while read -r i j
  do
    # Ignore blank or incomplete lines in the range file.
    [[ -n ${i} && -n ${j} ]] || continue
    n=$((i / 11))

    # Build the member patterns in this shell.  Routing them through a
    # double-quoted `bash -c` string lets the outer shell expand the counter
    # before it has a value, which makes every range select file 00000.
    members=()
    for k in $(seq "${i}" "${j}")
    do
      printf -v warc_no '%05.0f' "${k}"
      printf -v log_no '%03.0f' "${k}"
      # The unquoted parts are globs, expanded against the unpacked tree; a
      # pattern with no match stays literal so that tar reports it.
      members+=( "${pfx}-${warc_no}"_* logs/*_"${log_no}"_log )
    done

    tar -cf "${h}/${segid}/extract_${n}.tar" "${members[@]}"
  done < "${scratch}/by11s.txt" &&
    echo $(date) "${segid}" done   # only reported when the last archive succeeded
done