diff bin/reExtract.sh @ 42:8ff97ea0ba2c

refactored, not tested
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 03 Apr 2020 19:04:06 +0100
parents 64227ff87e4e
children 4b574613200c
line wrap: on
line diff
--- a/bin/reExtract.sh	Fri Apr 03 17:35:17 2020 +0100
+++ b/bin/reExtract.sh	Fri Apr 03 19:04:06 2020 +0100
@@ -26,12 +26,18 @@
     cd $s
     # Sigh, shouldn't have used this in the extraction ...
     jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
-	     cut -f 2 -d / cut -f 1 -d _)
-    egrep "\b$s\b" in.txt |cut -f 1,4 | \
-    while read e i
-    do
-	id=${p}-00$i
-	lf=logs/${jobid}_${i}_log
-	unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
-    done
+	     cut -f 2 -d / |cut -f 1 -d _)
+    egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u | \
+     while read e # one pass per distinct first field among the lines mentioning $s; this could be parallel
+     do
+	egrep "^$e\b.*\b$s\b" ../in.txt|cut -f 4 | \
+	 while read i # 4th field of lines starting with $e that mention $s (".*" skips the fields in between); this could be parallel
+	 do
+	    id=${p#CC-MAIN-*}-00$i # drop a leading CC-MAIN- from $p; the file name below puts it back
+	    lf=logs/${jobid}_${i}_log
+	    unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
+	 done
+	# now compare (logs? ls vs. tar -tvf ?)
+     done
+     cd ..
  done