changeset 45:bd0010ac88ce

parallelised version of reExtract.sh
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 08 Apr 2020 11:27:33 +0100
parents abc1b05996c9
children 2e5b3439a2ed
files bin/preExtract.sh
diffstat 1 files changed, 90 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/preExtract.sh	Wed Apr 08 11:27:33 2020 +0100
@@ -0,0 +1,90 @@
+#!/usr/bin/bash
+# Usage: preExtract.sh 20..-.. < files...
+# Input is a list of relative paths of warc files
+#  under /beegfs/common_crawl/CC-MAIN-$1
+
+sus () { # tally identical input lines (stdin or the files in "$@"), commonest first
+    sort "$@" |
+        uniq -c | sort -k1nr,1
+}
+
+function edex () { # print the 0-based number of the ~/by11n.txt line containing the fixed string $1 (-1 if none)
+    echo $(($(grep -F -n -e "$1" ~/by11n.txt | cut -f 1 -d :) - 1))
+}
+
+function join_by () {
+    # Adapted from https://stackoverflow.com/a/17841619/2595465
+    # join_by DELIM [ITEM...]: print the ITEMs separated by DELIM, no trailing newline
+    local d=$1 x
+    shift
+    printf "%s" "$1" # printf, not echo -n, so a first ITEM of -n or -e is printed
+    for x in "${@:2}"; do printf "%s%s" "$d" "$x"; done # DELIM is used literally
+}
+
+h=/beegfs/common_crawl/CC-MAIN-$1 # crawl directory selected by the first argument
+
+mkdir -p /dev/shm/rex # working directory; in.txt and all later output are relative to it
+cd /dev/shm/rex || exit 1 # never write in.txt or extracted files into some other directory
+
+sed 's/-00/ /;s/\.warc\.gz//' | \
+ while read -r s p i # the three whitespace-separated fields of each rewritten stdin line
+ do
+    printf "%s\t%s\t%s\t%s\n" "$(edex "$i")" "$s" "$p" "$i" # quoted: always four columns
+ done > in.txt
+cut -f 2,3 in.txt | sort -u | tr '\011' '\012' | # one job per distinct (column 2, column 3) pair
+ parallel --will-cite -j 16 -N 2 h="$h"'
+    function sus () { sort "$@" | uniq -c | sort -k1nr,1 ; } # redefined: this text runs in a separate shell
+    s={1}
+    p={2}
+    mkdir -p $s/logs
+    cd $s || exit 1 # do not extract into the shared parent directory
+    # Sigh, should not have used this in the extraction ...
+    jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
+	     cut -f 2 -d / |cut -f 1 -d _)
+    for e in $(grep -E "\b$s\b" ../in.txt |cut -f 1 | sort -u)
+     # this could be parallel
+     do
+	lsf=lsl${e}.txt
+	rm -f $lsf
+	lff=()
+	ii=()
+	for i in $(grep -E "^$e\\s$s\\b" ../in.txt|cut -f 4)
+	 # this could be parallel
+	 do
+	    id=${p#CC-MAIN-*}-00$i
+	    echo "$id" 1>&2
+	    lf=logs/${jobid}_${i}_log
+	    lff+=("${lf}") # accumulate list of log files
+	    echo starting ${id} $(date) > $lf
+	    unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
+	    echo finished ${id} $(date) >> $lf
+	    ls -l ${id}_* | tr -s " " "\t" |cut -f 5,9 >> $lsf
+	    ii+=("*-00${i}_*") # member-name pattern for the tar listing below
+	    echo "$i" "${ii[@]}" ${#ii[@]} 1>&2
+	 done
+	# now compare ls vs. tar; --wildcards because GNU tar takes member names literally by default
+	echo "${ii[@]}"  ${#ii[@]} 1>&2
+	echo lff "${lff[@]}" 1>&2
+	tar --wildcards -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \
+	       tr -s " " "\t" |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \
+	    - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt
+        cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt
+	ni=${#ii[@]}
+	if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ] # want ni hunk headers plus the tally of > lines
+	then
+	    echo "extra lines in ${e}_check.txt" 1>&2
+	    # no cd here: the cd .. after the loop still runs once break leaves it
+	    break
+	fi
+	if [ $(grep -F -c a ${e}_check.txt) -ne $ni ] # every hunk header must be an addition
+	then
+	    echo "non-addition lines in ${e}_check.txt" 1>&2
+	    # no cd here: the cd .. after the loop still runs once break leaves it
+	    break
+	fi
+	grep -E "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt
+	tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}"
+	tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}"
+     done
+    cd ..
+'