changeset 43:4b574613200c

Added computation of the files that need to be added to the tar file, but they are not actually added yet
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sat, 04 Apr 2020 15:31:58 +0100
parents 8ff97ea0ba2c
children abc1b05996c9
files bin/reExtract.sh
diffstat 1 files changed, 41 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/bin/reExtract.sh	Fri Apr 03 19:04:06 2020 +0100
+++ b/bin/reExtract.sh	Sat Apr 04 15:31:58 2020 +0100
@@ -3,12 +3,26 @@
 # Input is list of paths to relative path of warc files
 #  under /beegfs/common_crawl/CC-MAIN-$1
 
+function sus () {
+    # Tally identical input lines, most frequent first; all arguments are handed to the first sort.
+    sort "$@" | uniq -c | sort -k 1,1nr
+}
+
 function edex () {
     echo "$1" | \
 	cut -f 5 -d - | cut -f 1 -d .|sed 's/^00//' | \
 	echo $(($(fgrep -n -f - ~/by11n.txt | cut -f 1 -d :) - 1))
     }
 
+function join_by () {
+    # Print $2..$N joined by delimiter $1, with no trailing newline; prints nothing if no items.
+    # Courtesy of https://stackoverflow.com/a/17841619/2595465
+    local d=${1-} first=${2-}
+    (( $# >= 2 )) || return 0
+    shift 2
+    printf '%s' "$first" "${@/#/$d}"  # printf, not echo -n: echo would eat items like -n or -e
+}
+
 h=/beegfs/common_crawl/CC-MAIN-$1
 
 mkdir -p /dev/shm/rex
@@ -30,14 +44,38 @@
     egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u | \
      while read e # this could be parallel
      do
-	egrep "^$e\b$s\b" ../in.txt|cut -f 4 | \
+	lsf=lsl${e}.txt # for extract $e: listing of the files re-extracted below, later diffed against the tar listing
+	rm -f $lsf # start from an empty listing; the inner loop only appends to it
+	set -f # globbing off, so the patterns captured into ii are stored unexpanded
+	ii=($(egrep "^$e\\s$s\\b" ../in.txt|cut -f 4 | \
 	 while read i # this could be parallel
 	 do
+	    set +f # globbing back on so the ls pattern on the next added line expands
 	    id=${p#CC-MAIN-*}-00$i
 	    lf=logs/${jobid}_${i}_log
 	    unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
-	 done
-	# now compare (logs? ls vs. tar -tvf ?)
+	    ls -l ${id}_* | tr -s ' ' '\011' |cut -f 5,9 >> $lsf # keep columns 5 and 9, size and name in the usual ls -l layout
+	    set -f # globbing off again before the pattern is emitted
+	    echo "*-00${i}_*" # this stdout line becomes one element of ii
+	 done)) # ii: one member pattern per value of i; NOTE(review): any stdout from warc.sh would be captured into ii too - confirm it writes none
+	# now compare ls vs. tar; NOTE(review): ii holds glob patterns - GNU tar matches member-name patterns only with --wildcards, confirm this is enabled
+	tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \
+	       tr -s ' ' '\011' |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \
+	    - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt # left side: tar listing, right side: ls listing, both sorted with the same keys
+        cut -f 1 -d ' ' ${e}_diff.txt | sus > ${e}_check.txt # tally the first space-delimited token of every diff line
+	ni=${#ii[@]} # number of patterns collected above
+	if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ] # expect ni+1 distinct tokens; presumably one add command per pattern plus the > marker
+	then
+	    echo "extra lines in ${e}_check.txt" 1>&2
+	    break # abandons the remaining values of e
+	fi
+	if [ $(fgrep -c a ${e}_check.txt) -ne $ni ] # expect exactly ni tally lines containing the letter a
+	then
+	    echo "non-addition lines in ${e}_check.txt" 1>&2
+	    break # abandons the remaining values of e
+	fi
+	egrep '^> ' ${e}_diff.txt | cut -f 2 > ${e}_new.txt # names present only in the ls listing, i.e. not yet in the tar file
+	set +f # restore globbing for the next iteration
      done
      cd ..
  done