changeset 48:307e0c44925a

log more, work around more glitches
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 10 Apr 2020 18:22:48 +0100
parents 81ff28478276
children 18f8bcc779e8
files bin/preExtract.sh
diffstat 1 files changed, 22 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/bin/preExtract.sh	Fri Apr 10 18:22:24 2020 +0100
+++ b/bin/preExtract.sh	Fri Apr 10 18:22:48 2020 +0100
@@ -38,12 +38,14 @@
     p={2}
     mkdir -p $s/logs
     cd $s
+    echo $(date) starting $s/$p > log
     # Sigh, should not have used this in the extraction ...
     jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
 	     cut -f 2 -d / |cut -f 1 -d _)
     for e in $(egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u)
      # this could be parallel
      do
+        echo $(date) begin extract: $e >> log
 	lsf=lsl${e}.txt
 	rm -f $lsf
 	lff=()
@@ -52,24 +54,32 @@
 	 # this could be parallel
 	 do
 	    id=${p#CC-MAIN-*}-00$i
-	    echo "$id" 1>&2
+	    echo " " "$id" >> log
 	    lf=logs/${jobid}_${i}_log
 	    lff+=("${lf}") # accumulate list of log files
-	    #echo starting ${id} $(date) > $lf
-	    #unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
-	    #echo finished ${id} $(date) >> $lf
+            if [ -s $lf ]
+            then
+              echo "  " $lf empty, skipping >> log
+            else
+              echo "  " extracting from $id >> log
+	      echo starting ${id} $(date) > $lf
+	      unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
+              echo finished ${id} $(date) >> $lf
+            fi
 	    ls -l ${id}_* | tr -s " " "\t" |cut -f 5,9 >> $lsf
 	    ii+=("*-00${i}_*")
-	    echo "$i" "${ii[@]}" ${#ii[@]} 1>&2
+	    echo "  " "$i" "${ii[@]}" ${#ii[@]} >> log
 	 done
+        echo " " extractions done
 	# now compare ls vs. tar
-	echo "${ii[@]}"  ${#ii[@]} 1>&2
-	echo lff "${lff[@]}" 1>&2
+	echo " " "${ii[@]}"  ${#ii[@]} >> log
+	echo " " lff "${lff[@]}" >> log
 	tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \
 	       tr -s " " "\t" |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \
 	    - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt
         if [ -s ${e}_diff.txt ]
         then
+          echo " " checking...
 	  cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt
 	  ni=${#ii[@]}
 	  if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
@@ -79,22 +89,24 @@
               then
                :
               else
-		echo "extra lines in ${e}_check.txt" 1>&2
+		echo " " "extra lines in ${e}_check.txt" >> log
 		cd ..
 		continue
               fi
 	  elif [ $(fgrep -c a ${e}_check.txt) -ne $ni ]
 	  then
-	      echo "non-addition lines in ${e}_check.txt" 1>&2
+	      echo " " "non-addition lines in ${e}_check.txt" >> log
 	      cd ..
 	      continue
 	  fi
+          echo " " starting tar update
 	  egrep "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt
 	  tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}"
 	  tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}"
         else
-          echo "no diff, no update" $e
+          echo "no diff, no update" $e >> log
         fi
+        echo end extract: $e >> log
    done
     cd ..
 '