Mercurial > hg > cc > cirrus_home
annotate bin/doExtract.sh @ 192:4275eb6484da
maybe triggers jdb on tests with -DdebugTest=true on command line
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 25 Sep 2024 13:51:15 +0100 |
parents | ef3533d3ac4b |
children |
rev | line source |
---|---|
27
7a2bc060230d
first cut at doing extraction here
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/bin/bash |
29
a95a1b31f5e9
try to fix multi-line lossage
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
27
diff
changeset
|
2 # Usage: doExtract.sh ccid jobid segid |
27
7a2bc060230d
first cut at doing extraction here
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 ccid=$1 |
29
a95a1b31f5e9
try to fix multi-line lossage
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
27
diff
changeset
|
4 export jobid=$2 |
27
7a2bc060230d
first cut at doing extraction here
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 segid=$3 |
7a2bc060230d
first cut at doing extraction here
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 echo $(date) $(hostname) $jobid $segid |
7a2bc060230d
first cut at doing extraction here
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 mkdir -p $segid/logs |
7a2bc060230d
first cut at doing extraction here
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 cd "$segid" || exit 1  # abort: all later paths (job log, logs/, ../by11s.txt) are relative to the segment directory |
29
a95a1b31f5e9
try to fix multi-line lossage
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
27
diff
changeset
|
9 ls /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/CC-MAIN-*.warc.gz | \ |
33
4c23b1766692
towards sub-division of resulting tar files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
30
diff
changeset
|
10 parallel --joblog job_${jobid}.log -j 18 -N 1 'id=$(echo {} | cut -f 6 -d / | cut -f 3- -d - | cut -f 1 -d .) ; \ |
35
ec99b2d1d2fc
sync up filenames and log names,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
11 fid=$(printf "%03.0f" $(({#} - 1))) ; \ |
ec99b2d1d2fc
sync up filenames and log names,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
12 lf=logs/${jobid}_${fid}_log ; \ |
ec99b2d1d2fc
sync up filenames and log names,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
13 echo starting $id $(date) >> $lf ; \ |
ec99b2d1d2fc
sync up filenames and log names,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
14 unpigz -dp 1 -c {} | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf ; \ |
ec99b2d1d2fc
sync up filenames and log names,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
15 echo finished ${id} $(date) >> $lf' |
29
a95a1b31f5e9
try to fix multi-line lossage
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
27
diff
changeset
|
16 res=$? |
a95a1b31f5e9
try to fix multi-line lossage
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
27
diff
changeset
|
17 echo $(date) $(hostname) $jobid $segid $res |
a95a1b31f5e9
try to fix multi-line lossage
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
27
diff
changeset
|
18 if [ $res = 0 ] |
a95a1b31f5e9
try to fix multi-line lossage
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
27
diff
changeset
|
19 then |
35
ec99b2d1d2fc
sync up filenames and log names,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
20 pfx=$(ls /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/CC-MAIN-*-00000.warc.gz |\ |
ec99b2d1d2fc
sync up filenames and log names,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
21 cut -f 6 -d / | cut -f 3,4 -d -) |
ec99b2d1d2fc
sync up filenames and log names,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
22 |
ec99b2d1d2fc
sync up filenames and log names,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
23 cat ../by11s.txt | while read i j |
ec99b2d1d2fc
sync up filenames and log names,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
24 do ((n=i/11)) |
36
e912ed51146a
fixed scope pblm in tar step
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
35
diff
changeset
|
25 tar -cf /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extract_${n}.tar \ |
e912ed51146a
fixed scope pblm in tar step
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
35
diff
changeset
|
26 $(seq $i $j | xargs -I ^ bash -c '{ k=^; printf "${0}-%05.0f_* logs/*_%03.0f_log\n" $k $k ; }' $pfx) |
35
ec99b2d1d2fc
sync up filenames and log names,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
33
diff
changeset
|
27 done && |
37
ef3533d3ac4b
clean up after ourselves
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
36
diff
changeset
|
28 echo $(date) $(hostname) $jobid /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extracts_\{0..50\}.tar && |
ef3533d3ac4b
clean up after ourselves
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
36
diff
changeset
|
29 cd .. && |
ef3533d3ac4b
clean up after ourselves
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
36
diff
changeset
|
30 rm -rf -- "${segid:?}"  # quoted, option-safe, and refuses to run if segid is empty or unset |
29
a95a1b31f5e9
try to fix multi-line lossage
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
27
diff
changeset
|
31 fi |