Mercurial > hg > cc > cirrus_home
changeset 35:ec99b2d1d2fc
sync up filenames and log names,
attempt to save more smaller tar files
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 26 Mar 2020 12:24:30 +0000 |
parents | e34cda3e7483 |
children | e912ed51146a |
files | bin/doExtract.sh |
diffstat | 1 files changed, 14 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/doExtract.sh Thu Mar 26 12:23:33 2020 +0000
+++ b/bin/doExtract.sh Thu Mar 26 12:24:30 2020 +0000
@@ -8,14 +8,22 @@
 cd $segid
 ls /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/CC-MAIN-*.warc.gz | \
 parallel --joblog job_${jobid}.log -j 18 -N 1 'id=$(echo {} | cut -f 6 -d / | cut -f 3- -d - | cut -f 1 -d .) ; \
- echo starting $id $(date) >> logs/${jobid}_{#}_log ; \
- unpigz -dp 1 -c {} | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> logs/${jobid}_{#}_log ; \
- echo finished ${id} $(date) >> logs/${jobid}_{#}_log'
+ fid=$(printf "%03.0f" $(({#} - 1))) ; \
+ lf=logs/${jobid}_${fid}_log ; \
+ echo starting $id $(date) >> $lf ; \
+ unpigz -dp 1 -c {} | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf ; \
+ echo finished ${id} $(date) >> $lf'
 res=$?
 echo $(date) $(hostname) $jobid $segid $res
 if [ $res = 0 ]
 then
- cat ../by11s.txt | while read
- tar -cf /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extracts.tar * &&
- echo $(date) $(hostname) $jobid /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extracts.tar
+ pfx=$(ls /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/CC-MAIN-*-00000.warc.gz |\
+ cut -f 6 -d / | cut -f 3,4 -d -)
+
+ cat ../by11s.txt | while read i j
+ do ((n=i/11))
+ tar -cf /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extract_${n}.tar.gz \
+ $(seq $i $j | xargs -I ^ bash -c '{ k=^; printf "${pfx}-%05.0f_* logs/?_%03.0f_log\n" $k $k ; }')
+ done &&
+ echo $(date) $(hostname) $jobid /beegfs/common_crawl/CC-MAIN-${ccid}/${segid}/extracts_\{0..${n}\}.tar
 fi
```