Mercurial > hg > cc > cirrus_home
changeset 44:abc1b05996c9
Completely changed the construction of the array variable and used the same approach for the log file names too; tar update is now enabled, so the script may be complete, but without any parallelism.
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 07 Apr 2020 18:00:29 +0100 |
parents | 4b574613200c |
children | bd0010ac88ce |
files | bin/reExtract.sh |
diffstat | 1 files changed, 20 insertions(+), 11 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/reExtract.sh Sat Apr 04 15:31:58 2020 +0100
+++ b/bin/reExtract.sh Tue Apr 07 18:00:29 2020 +0100
@@ -41,24 +41,30 @@
 # Sigh, shouldn't have used this in the extraction ...
 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
 cut -f 2 -d / |cut -f 1 -d _)
- egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u | \
- while read e # this could be parallel
+ for e in $(egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u)
+ # this could be parallel
 do
 lsf=lsl${e}.txt
 rm -f $lsf
- set -f
- ii=($(egrep "^$e\\s$s\\b" ../in.txt|cut -f 4 | \
- while read i # this could be parallel
+ lff=()
+ ii=()
+ for i in $(egrep "^$e\\s$s\\b" ../in.txt|cut -f 4)
+ # this could be parallel
 do
- set +f
 id=${p#CC-MAIN-*}-00$i
+ echo "$id" 1>&2
 lf=logs/${jobid}_${i}_log
+ lff+=("${lf}") # accumulate list of log files
+ echo starting ${id} $(date) > $lf
 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
+ echo finished ${id} $(date) >> $lf
 ls -l ${id}_* | tr -s ' ' '\011' |cut -f 5,9 >> $lsf
- set -f
- echo "*-00${i}_*"
- done))
+ ii+=("*-00${i}_*")
+ echo "$i" "${ii[@]}" ${#ii[@]} 1>&2
+ done
 # now compare ls vs. tar
+ echo "${ii[@]}" ${#ii[@]} 1>&2
+ echo lff "${lff[@]}" 1>&2
 tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \
 tr -s ' ' '\011' |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \
 - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt
@@ -67,15 +73,18 @@
 if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
 then
 echo "extra lines in ${e}_check.txt" 1>&2
+ cd ..
 break
 fi
 if [ $(fgrep -c a ${e}_check.txt) -ne $ni ]
 then
 echo "non-addition lines in ${e}_check.txt" 1>&2
+ cd ..
 break
 fi
 egrep '^> ' ${e}_diff.txt | cut -f 2 > ${e}_new.txt
- set +f
+ tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}"
+ tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}"
 done
- cd ..
+ cd ..
 done
```