changeset 46:2e5b3439a2ed

start trying to work around failures
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 08 Apr 2020 14:11:04 +0100
parents bd0010ac88ce
children 81ff28478276
files bin/preExtract.sh
diffstat 1 files changed, 31 insertions(+), 21 deletions(-) [+]
line wrap: on
line diff
--- a/bin/preExtract.sh	Wed Apr 08 11:27:33 2020 +0100
+++ b/bin/preExtract.sh	Wed Apr 08 14:11:04 2020 +0100
@@ -55,9 +55,9 @@
 	    echo "$id" 1>&2
 	    lf=logs/${jobid}_${i}_log
 	    lff+=("${lf}") # accumulate list of log files
-	    echo starting ${id} $(date) > $lf
-	    unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
-	    echo finished ${id} $(date) >> $lf
+	    #echo starting ${id} $(date) > $lf
+	    #unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
+	    #echo finished ${id} $(date) >> $lf
 	    ls -l ${id}_* | tr -s " " "\t" |cut -f 5,9 >> $lsf
 	    ii+=("*-00${i}_*")
 	    echo "$i" "${ii[@]}" ${#ii[@]} 1>&2
@@ -68,23 +68,33 @@
 	tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \
 	       tr -s " " "\t" |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \
 	    - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt
-        cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt
-	ni=${#ii[@]}
-	if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
-	then
-	    echo "extra lines in ${e}_check.txt" 1>&2
-	    cd ..
-	    break
-	fi
-	if [ $(fgrep -c a ${e}_check.txt) -ne $ni ]
-	then
-	    echo "non-addition lines in ${e}_check.txt" 1>&2
-	    cd ..
-	    break
-	fi
-	egrep "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt
-	tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}"
-	tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}"
-     done
+        if [ -s ${e}_diff.txt ]
+        then
+	  cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt
+	  ni=${#ii[@]}
+	  if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
+	  then
+              if [[ "$(tr -s '\n\t ' ' ' < ${e}_check.txt)" =~ \
+           ^' '[0-9]*' > 1 < 1 --- 1 '[0-9]*c[0-9]*' 1 '[0-9]*'a'[0-9,]*' '$ ]]
+              then
+               :
+              else
+		echo "extra lines in ${e}_check.txt" 1>&2
+		cd ..
+		continue
+              fi
+	  elif [ $(fgrep -c a ${e}_check.txt) -ne $ni ]
+	  then
+	      echo "non-addition lines in ${e}_check.txt" 1>&2
+	      cd ..
+	      continue
+	  fi
+	  egrep "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt
+	  tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}"
+	  tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}"
+        else
+          echo "no diff, no update" $e
+        fi
+   done
     cd ..
 '