changeset 44:abc1b05996c9

Completely changed how the array variables are constructed, and used the same approach for the log file names too; the tar update is now enabled, so this may be complete, but without any parallelism.
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 07 Apr 2020 18:00:29 +0100
parents 4b574613200c
children bd0010ac88ce
files bin/reExtract.sh
diffstat 1 files changed, 20 insertions(+), 11 deletions(-) [+]
line wrap: on
line diff
--- a/bin/reExtract.sh	Sat Apr 04 15:31:58 2020 +0100
+++ b/bin/reExtract.sh	Tue Apr 07 18:00:29 2020 +0100
@@ -41,24 +41,30 @@
     # Sigh, shouldn't have used this in the extraction ...
     jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
 	     cut -f 2 -d / |cut -f 1 -d _)
-    egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u | \
-     while read e # this could be parallel
+    for e in $(egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u)
+     # this could be parallel
      do
 	lsf=lsl${e}.txt
 	rm -f $lsf
-	set -f
-	ii=($(egrep "^$e\\s$s\\b" ../in.txt|cut -f 4 | \
-	 while read i # this could be parallel
+	lff=()
+	ii=()
+	for i in $(egrep "^$e\\s$s\\b" ../in.txt|cut -f 4)
+	 # this could be parallel
 	 do
-	    set +f
 	    id=${p#CC-MAIN-*}-00$i
+	    echo "$id" 1>&2
 	    lf=logs/${jobid}_${i}_log
+	    lff+=("${lf}") # accumulate list of log files
+	    echo starting ${id} $(date) > $lf
 	    unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
+	    echo finished ${id} $(date) >> $lf
 	    ls -l ${id}_* | tr -s ' ' '\011' |cut -f 5,9 >> $lsf
-	    set -f
-	    echo "*-00${i}_*"
-	 done))
+	    ii+=("*-00${i}_*")
+	    echo "$i" "${ii[@]}" ${#ii[@]} 1>&2
+	 done
 	# now compare ls vs. tar
+	echo "${ii[@]}"  ${#ii[@]} 1>&2
+	echo lff "${lff[@]}" 1>&2
 	tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \
 	       tr -s ' ' '\011' |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \
 	    - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt
@@ -67,15 +73,18 @@
 	if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
 	then
 	    echo "extra lines in ${e}_check.txt" 1>&2
+	    cd ..
 	    break
 	fi
 	if [ $(fgrep -c a ${e}_check.txt) -ne $ni ]
 	then
 	    echo "non-addition lines in ${e}_check.txt" 1>&2
+	    cd ..
 	    break
 	fi
 	egrep '^> ' ${e}_diff.txt | cut -f 2 > ${e}_new.txt
-	set +f
+	tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}"
+	tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}"
      done
-     cd ..
+    cd ..
  done