annotate bin/psplitTars.sh @ 166:afd7879181c9

old style
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 18 Jul 2022 19:15:20 +0100
parents 849ccd30258d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
55
50556ac15e88 one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/bash
57
849ccd30258d final most general versin
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 56
diff changeset
2 cat /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt |\
56
c0c030e8b219 too big for /dev/shm, split in half
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 55
diff changeset
3 parallel --will-cite -j 4 -n 1 '
55
50556ac15e88 one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 segid={}  # one line of bigtar.txt per job, used as a directory name below
50556ac15e88 one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 echo $segid starting
50556ac15e88 one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 ccid=2019-35
50556ac15e88 one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 h=/beegfs/common_crawl/CC-MAIN-${ccid}
50556ac15e88 one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 mkdir -p /dev/shm/rex/${segid}  # per-job scratch directory
50556ac15e88 one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9 cd /dev/shm/rex/${segid} || exit 1  # abort this job rather than untar and re-tar in the wrong directory
56
c0c030e8b219 too big for /dev/shm, split in half
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 55
diff changeset
10 tar -x --skip-old-files -f ${h}/${segid}/extracts.tar  # files already present in the scratch dir are not overwritten
55
50556ac15e88 one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 pfx=$(ls ${h}/${segid}/CC-MAIN-*-00000.warc.gz |\
50556ac15e88 one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12 cut -f 6 -d / | cut -f 3,4 -d -)  # 6th slash-separated path field, then its 3rd and 4th dash-separated fields
57
849ccd30258d final most general versin
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 56
diff changeset
13 echo $segid/$pfx
849ccd30258d final most general versin
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 56
diff changeset
14 if ls logs/*_?_log > /dev/null  # only when log names with a one-character index exist
849ccd30258d final most general versin
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 56
diff changeset
15 then
849ccd30258d final most general versin
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 56
diff changeset
16 cd logs
849ccd30258d final most general versin
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 56
diff changeset
17 ls | sed "s/^\([0-9]*\)_\([0-9]*\)_log/\1:\2/" | \
849ccd30258d final most general versin
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 56
diff changeset
18 while IFS=: read j i; do if [ -z "$j" ]; then k=1; else k=$j; fi ; mv ${j}_${i}_log $(printf %s_%03.0f_log $k $((i - 1))); done  # colon separator keeps an empty prefix as an empty j so k falls back to 1; new name is k_NNN_log with NNN = i-1 padded to 3 digits
849ccd30258d final most general versin
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 56
diff changeset
19 cd ..
849ccd30258d final most general versin
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 56
diff changeset
20 fi
55
50556ac15e88 one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
21 while read i j
50556ac15e88 one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
22 do ((n=i/11))  # output tar number: first index of the range, integer-divided by 11
50556ac15e88 one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
23 tar -cf /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extract_${n}.tar \
57
849ccd30258d final most general versin
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 56
diff changeset
24 $(for k in $(seq $i $j); do printf "${pfx}-%05.0f_* logs/*_%03.0f_log\n" $k $k; done)  # left unquoted on purpose: the printed glob patterns are expanded into the member list
55
50556ac15e88 one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
25 done < ../by11s.txt  # each line gives the first and last index of one range
56
c0c030e8b219 too big for /dev/shm, split in half
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 55
diff changeset
26 rm -rf /dev/shm/rex/${segid:?}/*  # the :? guard aborts the job if segid is empty, instead of wiping the scratch dirs of the other jobs
55
50556ac15e88 one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
27 echo $(date) $segid done
50556ac15e88 one-off to convert big extracts.tar into lots of smaller ones
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
28 '