changeset 41:64227ff87e4e

done through re-extraction, fixing tars still to come
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 03 Apr 2020 17:35:17 +0100
parents 0d1f6c971d5e
children 8ff97ea0ba2c
files bin/reExtract.sh
diffstat 1 files changed, 14 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/bin/reExtract.sh	Thu Apr 02 19:21:21 2020 +0100
+++ b/bin/reExtract.sh	Fri Apr 03 17:35:17 2020 +0100
@@ -19,12 +19,19 @@
  do
     printf "%s\t%s\t%s\t%s\n" $(edex $i) $s $p $i
  done > in.txt
-for s in $(cut -f 2 in.txt| sort -u)
-do
-    mkdir -p $s
+cut -f 2,3 in.txt| sort -u | \
+ while read s p
+ do
+    mkdir -p $s/logs
     cd $s
-    for e in $(egrep "\b$s\b" in.txt |cut -f 1 | sort -u)
+    # Sigh, shouldn't have used this in the extraction ...
+    jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
+	     cut -f 2 -d / | cut -f 1 -d _)
+    egrep "\b$s\b" in.txt |cut -f 1,4 | \
+    while read e i
     do
-	tar -xf $h/$s/extract_$e.tar [files for p i in $(egrep "^$e\s$s\b" in.txt | cut -f 3,4)
-	
-
+	id=${p}-00$i
+	lf=logs/${jobid}_${i}_log
+	unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
+    done
+ done