Mercurial > hg > cc > valhalla
changeset 0:fdd3f8a16fd4 default tip
shared scripts on valhalla cluster
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Sat, 14 Mar 2020 11:00:58 +0000 |
parents | |
children | |
files | bin/driver.sh bin/extract.sh bin/hack2.sh bin/setup bin/warc.sh |
diffstat | 5 files changed, 209 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/bin/bash
# bin/driver.sh -- fan a list of work items out to extract.sh workers.
#
# Usage: driver.sh [listfile]
#   listfile  file holding one item per line; stdin is used when omitted.
#
# GNU parallel keeps up to 40 jobs running (-j40).  With --pipe -N1 every
# job receives exactly one line of the list on its standard input.
# shared/bin/extract.sh is resolved relative to the current directory.
#
# The list is attached with a quoted redirection instead of `cat $1 |` so a
# path containing spaces or glob characters is passed through intact, and a
# missing list file is reported before any job is started.
parallel -j40 -N1 --pipe shared/bin/extract.sh < "${1:-/dev/stdin}"
#!/bin/bash
# bin/extract.sh -- worker: unpack WARC archives and extract PDF responses.
#
# Reads ids from stdin, one per line.  For each id the matching
# CC-MAIN-2019-35 archive is decompressed and piped through warc.sh, which
# is given the id as its output prefix and application/pdf as the content
# type to keep.  Output files and logs are written under
# $SHARED/data/<hostname>/; the log file is named after this shell's PID.
me=$$
SHARED=/home/shared/ht

# Stop if the per-host directory is unusable: otherwise the logs and every
# extracted file would be written into whatever directory we started in.
cd "$SHARED/data/$(hostname)" || exit 1
mkdir -p logs || exit 1
log="logs/${me}_log"

# -r keeps any backslash in an id verbatim.
while read -r id; do
  # $(date) is deliberately left unquoted so the log lines keep the
  # single-space formatting they have always had.
  echo starting "${id}" $(date) >> "$log"
  # Only warc.sh's stderr is captured in the log; unpigz diagnostics stay
  # on this script's stderr.
  unpigz -dp 1 -c "/data/common_crawl/CC-MAIN-2019-35/CC-MAIN-${id}.warc.gz" \
    | "$SHARED/bin/warc.sh" "${id}" application/pdf 2>> "$log"
  echo finished "${id}" $(date) >> "$log"
done
#!/bin/bash
# bin/hack2.sh -- experiment: show where and when parallel runs each job.
#
# Usage: hack2.sh [listfile]
#   listfile  file holding one item per line; stdin is used when omitted.
#
# Each job gets one input line on stdin (--pipe -N1), sleeps for a second
# and then prints, joined by ':' on a single line (paste -sd:):
#   - the host name,
#   - field 39 of the job shell's own /proc/<pid>/stat
#     (NOTE(review): per proc(5) this is the CPU last run on -- confirm),
#   - the input line itself,
#   - the current minute and second.
# The command is single-quoted so that $$ is expanded by the shell that
# parallel starts for the job, not by this script.
#
# The list is attached with a quoted redirection instead of `cat $1 |` so a
# path containing spaces or glob characters is passed through intact.
parallel -j40 -N1 --pipe \
  '{ sleep 1 ; hostname ; cut -f 39 -d " " /proc/$$/stat ; cat ; date +"%M:%S" ; }|paste -sd:' \
  < "${1:-/dev/stdin}"
#!/bin/bash
# bin/setup -- per-user setup: link the shared files into $HOME and make
# sure the shared SSH public keys are authorised for this account.
# Safe to run repeatedly.

SHARED=/home/shared/ht

# link TARGET NAME
# (Re)create NAME as a symbolic link pointing at TARGET, replacing any
# existing file or link of that name.
link () {
  rm -f "$2"
  ln -s "$1" "$2"
}

link "$SHARED" "$HOME/shared"
link "$SHARED/.profile" "$HOME/.profile"
link "$SHARED/.ia" "$HOME/.ia"

mkdir -p "$HOME/.ssh"
chmod 700 "$HOME/.ssh"

shared_keys=$SHARED/.ssh/authorized_keys
my_keys=$HOME/.ssh/authorized_keys

if [ -f "$my_keys" ]; then
  # Add only the shared lines that are not already present.  Comparing the
  # two files as a whole (cmp) is not enough: once the user's file holds
  # the shared keys plus keys of their own it always differs, and every
  # further run would append the shared keys again.
  # The result is captured before writing, so the file is never read and
  # appended to at the same time.
  missing=$(grep -vxFf "$my_keys" -- "$shared_keys")
  if [ -n "$missing" ]; then
    # If the last line is unterminated, finish it first so the first new
    # key does not get glued onto it.
    if [ -n "$(tail -c 1 "$my_keys")" ]; then
      echo >> "$my_keys"
    fi
    printf '%s\n' "$missing" >> "$my_keys"
  fi
else
  cp "$shared_keys" "$my_keys"
  chmod 600 "$my_keys"
fi
#!/bin/bash
# bin/warc.sh -- try to fillet WARC payloads with just a shell script.
#
# Usage: warc.sh outfilePrefix [-n startnum] [contentType] < file.warc
#
# Reads an uncompressed WARC stream on stdin.  For every response record
# whose HTTP Content-Type equals contentType (every record when it is
# omitted) two files are written:
#   <outfilePrefix>_<n>.hdr          HTTP status line and headers, followed
#                                    by X-HST-Truncated (if the record was
#                                    truncated) and X-HST-Target-URI
#   <outfilePrefix>_<n>[.pdf|.html]  the HTTP body
# <n> is the response number; numbering starts at startnum + 1 (default 1).
# Progress messages, diagnostics and Content-Encoding statistics go to
# stderr.

LANG=C   # count bytes
LC_ALL=C # count bytes
IFS=$'\n'
shopt -qs nocasematch # header names and content types compare caselessly
shopt -qs extglob     # needed for the *([[:space:]]) trailing-space trims

# `read` removes only the LF of a CRLF line ending, so every header line
# still carries a trailing CR and a "blank" line is a lone CR.
CR=$'\r'

# Globals shared by the functions below:
#   pprefix  output file prefix
#   tot      number of input bytes accounted for so far
#   cec      number of times each Content-Encoding value was seen
declare -A cec

# handle_body LEN
# Copy the next LEN bytes of stdin to stdout.
handle_body () {
  ## read -r -N $l L doesn't work for binary bodies that contain a \000 because of Bash 'feature'
  local len="$1" status
  head -c "$len"
  status=$?
  if [ "$status" -ne 0 ]; then
    echo "truncated \$? = $status" 1>&2
  fi
}

# handle_payload N LEN FILTER TRUNC URI
# Consume one HTTP response of LEN bytes (headers plus body) from stdin.
#   N       response number, used in the output file names
#   LEN     payload length announced by the WARC record header
#   FILTER  content type to keep; empty keeps everything
#   TRUNC   WARC-Truncated reason, empty if the record was not truncated
#   URI     WARC-Target-URI of the record
# Reads pprefix and tot, updates cec.  Records of another content type are
# read and discarded so the stream stays in step.
handle_payload () {
  local n="$1" l="$2" ol="$2" f="$3" tr="$4" tu="$5"
  local t=' Unknown' z='' bl='' xl='' hdr='' s xx key count line L

  while read -r L; do
    l=$((l - ${#L} - 1)) # bytes of payload still unread (+1 for the LF)
    line=${L%"$CR"}

    if [ -z "$line" ]; then
      # Blank line (CRLF, or a bare LF from a sloppy server): the headers
      # are complete and the remaining l bytes are the body.
      if [ "$l" -gt 0 ]; then
        if [[ -n "$f" && "$f" != "$t" ]]; then
          echo "$t != $f, skipping starting at $((tot + (ol - l)))" 1>&2
          head -c "$l" >/dev/null
          return
        fi
        # The crawler's own length header, when present, overrides the
        # server's Content-Length for the consistency check below.
        if [ "$xl" ]; then
          bl=$xl
          xx=x
        else
          xx=''
        fi
        case "$t" in
          application/pdf) s=.pdf ;;
          text/html) s=.html ;;
          *) s='' ;;
        esac
        if [ "$bl" ]; then
          # Only meaningful when no content encoding was declared.
          if [ "$bl" -ne "$l" ] && [ -z "$z" ]; then
            echo "length mismatch$xx: $n here: $l given: $bl trunc:${tr:+ $tr}" 1>&2
          fi
        fi
        echo "reading $l bytes into ${pprefix}_$n$s as $t starting at $((tot + (ol - l)))" 1>&2
        {
          # hdr holds every header line read, each newline-terminated, so
          # the final header is written too (the former head/tail trimming
          # dropped it).
          printf '%s' "$hdr"
          if [ "$tr" ]; then echo "X-HST-Truncated: $tr"; fi
          echo "X-HST-Target-URI: $tu"
        } > "${pprefix}_$n.hdr"
        handle_body "$l" > "${pprefix}_$n$s"
      else
        echo "empty body, skipping" 1>&2
      fi
      return
    fi

    hdr+="$line"$'\n'
    # Values are taken after the FIRST ": " so that a value which itself
    # contains ": " is kept whole.
    case "$L" in
      Content-Type:\ *)
        t=${L#*: }
        t=${t%%;*} # drop parameters such as charset
        t=${t%%*([[:space:]])}
        ;;
      Content-Length:\ *)
        bl=${L#*: }
        bl=${bl%%*([[:space:]])}
        ;;
      X-Crawler-Content-Length:\ *) # introduced btw 2015&2018???
        xl=${L#*: }
        xl=${xl%%*([[:space:]])}
        ;;
      X-Crawler-Content-Encoding:\ *|Content-Encoding:\ *) # one or the other, change btw 2015&2018???
        z=${L#*: }
        key=${z%%*([[:space:]])}
        key=${key:-EMPTY} # an associative array cannot have an empty key
        # The value comes from an arbitrary web server.  Counting with
        # ((cec[...]+=1)) would expand it a second time inside arithmetic
        # and run any $(...) it contains, so use a plain assignment.
        count=${cec["$key"]:-0}
        cec["$key"]=$((count + 1))
        ;;
    esac
  done
}

# handle_resp N FILTER
# Read the rest of a response record's WARC header from stdin, then hand
# the payload to handle_payload.  N is the response number and FILTER the
# content type to keep.  Adds every byte consumed to tot.
handle_resp () {
  local n="$1" f="$2"
  local tr='' tu='' l='' ll L

  while read -r L; do
    tot=$((tot + ${#L} + 1))
    case "$L" in
      Content-Length:\ *)
        l=${L#*: }
        #surrounding spaces don't matter for arithmetic
        ;;
      WARC-Truncated:\ *)
        tr=${L#*: }
        tr=${tr%%*([[:space:]])}
        tr=${tr:-EMPTY}
        ;;
      WARC-Target-URI:\ *)
        tu=${L#*: }
        tu=${tu%%*([[:space:]])}
        ;;
      "$CR"|'') # blank line: end of the WARC header
        ll=${l%%*([[:space:]])} # but the \r has to go
        ll=${ll:-0}             # no Content-Length seen: nothing to read
        handle_payload "$n" "$ll" "$f" "${tr# }" "${tu# }"
        tot=$((tot + ll))
        return
        ;;
    esac
  done
}

# main outfilePrefix [-n startnum] [contentType]
# Outer loop: scan stdin for response records and process each in turn.
# Returns 2 when the output prefix is missing.
main () {
  if [ $# -eq 0 ]; then
    echo "Usage: warc.sh outfilePrefix [-n startnum] [contentType]" 1>&2
    return 2
  fi
  pprefix="$1"
  shift
  local n f c=0 wc=0 L i
  if [ "$1" = "-n" ]; then
    n=$2
    shift; shift
  else
    n=0
  fi
  tot=0
  f=$1

  while read -r L; do
    tot=$((tot + ${#L} + 1))
    case ${L%"$CR"} in
      WARC/1.0)
        # c counts unrecognised non-blank lines since the last response;
        # wc counts record starts since then.
        if [ "$wc" -eq 0 ] && [ "$c" -gt 0 ]; then
          echo "WARC/1.0 after $c non-blank lines record $n char $tot" 1>&2
        fi
        wc=$((wc + 1))
        ;;
      "")
        :
        ;;
      WARC-Type:\ response)
        echo "tot at resp prop: $tot" 1>&2
        handle_resp $((n = n + 1)) "$f"
        c=0
        wc=0
        ;;
      *)
        c=$((c + 1))
        ;;
    esac
  done

  echo "Last response #: $n" 1>&2
  echo "Compression stats:" 1>&2
  for i in "${!cec[@]}"; do
    printf " %10s: %s\n" "$i" "${cec[$i]}" 1>&2
  done
}

main "$@"