Mercurial > hg > cc > cirrus_home
changeset 48:307e0c44925a
log more, work around more glitches
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 10 Apr 2020 18:22:48 +0100 |
parents | 81ff28478276 |
children | 18f8bcc779e8 |
files | bin/preExtract.sh |
diffstat | 1 files changed, 22 insertions(+), 10 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/preExtract.sh Fri Apr 10 18:22:24 2020 +0100
+++ b/bin/preExtract.sh Fri Apr 10 18:22:48 2020 +0100
@@ -38,12 +38,14 @@
  p={2}
  mkdir -p $s/logs
  cd $s
+ echo $(date) starting $s/$p > log
  # Sigh, should not have used this in the extraction ...
  jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
  cut -f 2 -d / |cut -f 1 -d _)
  for e in $(egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u)
  # this could be parallel
  do
+ echo $(date) begin extract: $e >> log
  lsf=lsl${e}.txt
  rm -f $lsf
  lff=()
@@ -52,24 +54,32 @@
  # this could be parallel
  do
  id=${p#CC-MAIN-*}-00$i
- echo "$id" 1>&2
+ echo " " "$id" >> log
  lf=logs/${jobid}_${i}_log
  lff+=("${lf}") # accumulate list of log files
- #echo starting ${id} $(date) > $lf
- #unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
- #echo finished ${id} $(date) >> $lf
+ if [ -s $lf ]
+ then
+ echo " " $lf empty, skipping >> log
+ else
+ echo " " extracting from $id >> log
+ echo starting ${id} $(date) > $lf
+ unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
+ echo finished ${id} $(date) >> $lf
+ fi
  ls -l ${id}_* | tr -s " " "\t" |cut -f 5,9 >> $lsf
  ii+=("*-00${i}_*")
- echo "$i" "${ii[@]}" ${#ii[@]} 1>&2
+ echo " " "$i" "${ii[@]}" ${#ii[@]} >> log
  done
+ echo " " extractions done
  # now compare ls vs. tar
- echo "${ii[@]}" ${#ii[@]} 1>&2
- echo lff "${lff[@]}" 1>&2
+ echo " " "${ii[@]}" ${#ii[@]} >> log
+ echo " " lff "${lff[@]}" >> log
  tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \
  tr -s " " "\t" |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \
  - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt
  if [ -s ${e}_diff.txt ]
  then
+ echo " " checking...
  cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt
  ni=${#ii[@]}
  if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
@@ -79,22 +89,24 @@
  then
  :
  else
- echo "extra lines in ${e}_check.txt" 1>&2
+ echo " " "extra lines in ${e}_check.txt" >> log
  cd ..
  continue
  fi
  elif [ $(fgrep -c a ${e}_check.txt) -ne $ni ]
  then
- echo "non-addition lines in ${e}_check.txt" 1>&2
+ echo " " "non-addition lines in ${e}_check.txt" >> log
  cd ..
  continue
  fi
+ echo " " starting tar update
  egrep "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt
  tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}"
  tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}"
  else
- echo "no diff, no update" $e
+ echo "no diff, no update" $e >> log
  fi
+ echo end extract: $e >> log
  done
  cd ..
  '
```