Mercurial > hg > cc > cirrus_home
annotate bin/psplitTars.sh @ 175:d123ef7fdb82
working on implementing types and parts:
1, 2, 4 working, 3 not
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 03 Jul 2023 18:16:14 +0100 |
parents | 849ccd30258d |
children |
rev | line source |
---|---|
55
50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/bash |
57
849ccd30258d
final most general version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
56
diff
changeset
|
2 cat /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt |\ |
56
c0c030e8b219
too big for /dev/shm, split in half
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
55
diff
changeset
|
3 parallel --will-cite -j 4 -n 1 ' |
55
50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 segid={} |
50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 echo $segid starting |
50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 ccid=2019-35 |
50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 h=/beegfs/common_crawl/CC-MAIN-${ccid} |
50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 mkdir -p /dev/shm/rex/${segid} |
50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 cd /dev/shm/rex/${segid} |
56
c0c030e8b219
too big for /dev/shm, split in half
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
55
diff
changeset
|
10 tar -x --skip-old-files -f ${h}/${segid}/extracts.tar |
55
50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 pfx=$(ls ${h}/${segid}/CC-MAIN-*-00000.warc.gz |\ |
50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 cut -f 6 -d / | cut -f 3,4 -d -) |
57
849ccd30258d
final most general version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
56
diff
changeset
|
13 echo $segid/$pfx |
849ccd30258d
final most general version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
56
diff
changeset
|
14 if ls logs/*_?_log > /dev/null |
849ccd30258d
final most general version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
56
diff
changeset
|
15 then |
849ccd30258d
final most general version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
56
diff
changeset
|
16 cd logs |
849ccd30258d
final most general version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
56
diff
changeset
|
17 ls | sed "s/^\([0-9]*\)_\([0-9]*\)_log/\1 \2/" | \ |
849ccd30258d
final most general version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
56
diff
changeset
|
18 while read j i; do if [ -z "$j" ]; then k=1; else k=$j; fi ; mv ${j}_${i}_log $(printf %s_%03.0f_log $k $((i - 1))); done |
849ccd30258d
final most general version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
56
diff
changeset
|
19 cd .. |
849ccd30258d
final most general version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
56
diff
changeset
|
20 fi |
55
50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
21 cat ../by11s.txt | while read i j |
50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
22 do ((n=i/11)) |
50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
23 tar -cf /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extract_${n}.tar \ |
57
849ccd30258d
final most general version
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
56
diff
changeset
|
24 $(seq $i $j | xargs -I ^ bash -c "{ k=^; printf \"${pfx}-%05.0f_* logs/*_%03.0f_log\n\" \$k \$k ; }") |
55
50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
25 done |
56
c0c030e8b219
too big for /dev/shm, split in half
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
55
diff
changeset
|
26 rm -rf /dev/shm/rex/${segid}/* |
55
50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
27 echo $(date) $segid done |
50556ac15e88
one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
28 ' |