changeset 0:fdd3f8a16fd4 default tip

shared scripts on valhalla cluster
author Henry Thompson <ht@markup.co.uk>
date Sat, 14 Mar 2020 11:00:58 +0000
parents
children
files bin/driver.sh bin/extract.sh bin/hack2.sh bin/setup bin/warc.sh
diffstat 5 files changed, 209 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/driver.sh	Sat Mar 14 11:00:58 2020 +0000
@@ -0,0 +1,2 @@
+#!/bin/bash
+cat $1 | parallel -j40 -N1 --pipe shared/bin/extract.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/extract.sh	Sat Mar 14 11:00:58 2020 +0000
@@ -0,0 +1,11 @@
+#!/bin/bash
+me=$$
+SHARED=/home/shared/ht
+cd $SHARED/data/$(hostname)
+mkdir -p logs
+while read id
+do
+    echo starting ${id} $(date) >> logs/${me}_log
+    unpigz -dp 1 -c /data/common_crawl/CC-MAIN-2019-35/CC-MAIN-${id}.warc.gz|$SHARED/bin/warc.sh ${id} application/pdf 2>> logs/${me}_log
+    echo finished ${id} $(date) >> logs/${me}_log
+done
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/hack2.sh	Sat Mar 14 11:00:58 2020 +0000
@@ -0,0 +1,2 @@
+#!/bin/bash
+cat $1 | parallel -j40 -N1 --pipe '{ sleep 1 ; hostname ; cut -f 39 -d \  /proc/$$/stat ; cat ; date +"%M:%S" ; }|paste -sd:'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/setup	Sat Mar 14 11:00:58 2020 +0000
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Set up the invoking user's account to use the shared area under $SHARED:
+# symbolic links for ~/shared, ~/.profile and ~/.ia, and the shared ssh keys
+# in ~/.ssh/authorized_keys.
+
+SHARED=/home/shared/ht
+
+# link TARGET NAME
+# Make NAME a symbolic link to TARGET, replacing an existing file or symlink
+# called NAME.  Returns non-zero without touching anything when NAME is a
+# real directory.
+link () {
+    # rm -f cannot remove a directory, and ln -s would then quietly create
+    # the link inside it instead of at NAME, so refuse up front.
+    if [ -d "$2" ] && [ ! -L "$2" ]; then
+        echo "link: $2 is a directory, not replacing it" 1>&2
+        return 1
+    fi
+    rm -f "$2"
+    ln -s "$1" "$2"
+}
+
+link $SHARED $HOME/shared
+link $SHARED/.profile $HOME/.profile
+link $SHARED/.ia $HOME/.ia
+
+mkdir -p $HOME/.ssh
+chmod 700 $HOME/.ssh
+
+if [ -f $HOME/.ssh/authorized_keys ]
+then
+    if cmp -s $SHARED/.ssh/authorized_keys $HOME/.ssh/authorized_keys
+    then
+	:
+    else
+	cat $SHARED/.ssh/authorized_keys >> $HOME/.ssh/authorized_keys
+    fi
+else
+    cp $SHARED/.ssh/authorized_keys $HOME/.ssh/authorized_keys
+    chmod 600 $HOME/.ssh/authorized_keys
+fi
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/warc.sh	Sat Mar 14 11:00:58 2020 +0000
@@ -0,0 +1,164 @@
+#!/bin/bash
+# Try to fillet warc payloads with just a shell script
+# Usage warc.sh outfilePrefix [-n startnum]
+
+LANG=C # count bytes
+LC_ALL=C # count bytes
+IFS=$'\n'
+shopt -qs nocasematch
+shopt -qs extglob
+
+# handle_body N
+# Copy the next N bytes of standard input to standard output.
+# An external head is used because a shell read cannot cope with bodies that
+# contain NUL bytes.
+handle_body () {
+    ## read -r -N $l L doesn't work for binary bodies that contain a \000 because of Bash 'feature'
+    local l=$1 r
+    head -c "$l"
+    r=$?
+    # head exits 0 even when fewer than N bytes were available, so this
+    # reports a failure of head itself rather than a short body.
+    if [ "$r" -ne 0 ]; then
+        echo "truncated \$? = $r" 1>&2
+    fi
+}
+
+handle_payload () {
+    n=$1
+    l=$2
+    ol=$2
+    f=$3
+    tr=$4
+    tu="$5"
+    t=' Unknown'
+    unset z
+    unset bl
+    unset xl
+    unset hdr
+    hn=0
+    while read -r L; do
+	((l = l - (${#L} + 1)))
+	#((tot = tot + (${#L} + 1)))
+	#echo p $l 1>&2
+	hdr="${hdr}"$'\n'"${L%%
}"
+	((hn+=1))
+	case "$L" in
+	    Content-Type:\ *) t=${L##*: }
+			    t=${t%%;*}
+			    t=${t%%*([
[:space:]])}
+                            #echo $t 1>&2
+                            ;;
+	    Content-Length:\ *) bl=${L##*: }
+			      bl=${bl%%*([
[:space:]])}
+			      ;;
+	    X-Crawler-Content-Length:\ *) xl=${L##*: } # introduced btw 2015&2018???
+			      xl=${xl%%*([
[:space:]])}
+			      ;;
+	    X-Crawler-Content-Encoding:\ *|Content-Encoding:\ *) # one or the other, change btw 2015&2018???
+		              z=${L##*: }
+			      ((cec[${z%%*([
[:space:]])}]+=1))
+		                    ;;
+	    
) if [ $l -gt 0 ]; then
+		    if [[ "$f" && ( "$f" != "$t" ) ]]; then
+			echo "$t" \!= "$f", skipping starting at $((tot + (ol - l))) 1>&2
+			head -c $l >/dev/null
+			return
+		    fi
+		    if [ "$xl" ]; then
+			bl=$xl
+			xx=x
+		    else
+			unset xx
+	            fi
+		    case "$t" in
+			application/pdf) s=.pdf ;;
+			text/html) s=.html ;;
+			*) s=''
+		    esac
+		    if [ "$bl" ]; then
+			if [ $bl -ne $l -a -z "$z" ]; then
+			    echo length mismatch$xx: $n here: $l given: $bl trunc: $tr 1>&2
+			fi
+		    fi
+		    echo "reading $l bytes into ${pprefix}_$n$s as $t starting at $((tot + (ol - l)))" 1>&2
+		    { echo "$hdr" | head -$((hn-1)) | tail -n +2
+		      if [ "$tr" ] ; then echo "X-HST-Truncated: $tr"; fi
+                      echo "X-HST-Target-URI: $tu"
+		    } > ${pprefix}_$n.hdr #
+		    handle_body $l  > ${pprefix}_$n$s
+		else
+		    echo "empty body, skipping" 1>&2
+		fi
+		return;;
+        esac
+    done
+}
+
+handle_resp () {
+    n=$1
+    f=$2
+    unset tr
+    while read -r L; do
+	tot=$((tot + ${#L} + 1))
+	case "$L" in
+	    Content-Length:\ *) l=${L##*: }
+                              #surrounding spaces don't matter for arithmetic
+                              ;;
+	    WARC-Truncated:\ *) # echo $n $L
+		              tr=${L##*: }
+		              tr=${tr%%*([
[:space:]])}
+			      tr=${tr:-EMPTY}
+		                    ;;
+	    WARC-Target-URI:\ *) tu=${L##*: }
+		              tu=${tu%%*([
[:space:]])}
+			      # echo "|$L|$tu|"
+			      ;;
+	    
) ll=${l%%*([
[:space:]])} # but the \r has to go
+		#echo "h_p at $tot" 1>&2
+		#echo "|$tu|${tu# }|"
+		handle_payload $n $ll "$f" "${tr# }" "${tu# }"
+		tot=$((tot + ll))
+		#echo "h_p done: $tot" 1>&2
+		return
+		;;
+        esac
+    done
+}
+
+# outer loop
+pprefix="$1"
+shift
+if [ "$1" = "-n" ]; then
+    n=$2
+    shift; shift
+else
+    n=0
+fi
+tot=0
+c=0
+f=$1
+wc=0
+declare -A cec
+while read -r L; do
+    tot=$((tot + ${#L} + 1))
+    case ${L%
} in
+	WARC/1.0)
+	    if [ $wc -eq 0 -a $c -gt 0 ]; then
+		echo "WARC/1.0 after $c non-blank lines record $n char $tot" 1>&2
+	    fi
+	    ((wc++))
+	    ;;
+	"")
+	    :
+	    ;;
+	WARC-Type:\ response)
+	    echo tot at resp prop: $tot 1>&2
+	    handle_resp $((n = n + 1)) $f
+	    c=0
+	    wc=0
+	    ;;
+	*)
+            c=$((c + 1))
+            ;;
+    esac
+done
+echo "Last response #: $n" 1>&2
+echo "Compression stats:" 1>&2
+for i in "${!cec[@]}"; do
+    printf " %10s: %s\n" $i ${cec[$i]} 1>&2
+done