# HG changeset patch # User Henry S. Thompson # Date 1586351464 -3600 # Node ID 2e5b3439a2ed9afb45f690f9c2c118b189f86b2a # Parent bd0010ac88ce519754040cd95e52c36b5e3d1f4a start try to work around failures diff -r bd0010ac88ce -r 2e5b3439a2ed bin/preExtract.sh --- a/bin/preExtract.sh Wed Apr 08 11:27:33 2020 +0100 +++ b/bin/preExtract.sh Wed Apr 08 14:11:04 2020 +0100 @@ -55,9 +55,9 @@ echo "$id" 1>&2 lf=logs/${jobid}_${i}_log lff+=("${lf}") # accumulate list of log files - echo starting ${id} $(date) > $lf - unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf - echo finished ${id} $(date) >> $lf + #echo starting ${id} $(date) > $lf + #unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf + #echo finished ${id} $(date) >> $lf ls -l ${id}_* | tr -s " " "\t" |cut -f 5,9 >> $lsf ii+=("*-00${i}_*") echo "$i" "${ii[@]}" ${#ii[@]} 1>&2 @@ -68,23 +68,33 @@ tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \ tr -s " " "\t" |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \ - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt - cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt - ni=${#ii[@]} - if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ] - then - echo "extra lines in ${e}_check.txt" 1>&2 - cd .. - break - fi - if [ $(fgrep -c a ${e}_check.txt) -ne $ni ] - then - echo "non-addition lines in ${e}_check.txt" 1>&2 - cd .. - break - fi - egrep "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt - tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}" - tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}" - done + if [ -s ${e}_diff.txt ] + then + cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt + ni=${#ii[@]} + if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ] + then + if [[ "$(tr -s '\n\t ' ' ' < ${e}_check.txt)" =~ \ + ^' '[0-9]*' > 1 < 1 --- 1 '[0-9]*c[0-9]*' 1 '[0-9]*'a'[0-9,]*' '$ ]] + then + : + else + echo "extra lines in ${e}_check.txt" 1>&2 + cd .. + continue + fi + elif [ $(fgrep -c a ${e}_check.txt) -ne $ni ] + then + echo "non-addition lines in ${e}_check.txt" 1>&2 + cd .. + continue + fi + egrep "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt + tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}" + tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}" + else + echo "no diff, no update" $e + fi + done cd .. '