# HG changeset patch # User Henry S. Thompson # Date 1539334310 0 # Node ID bba589cab8373c753866f871057535abe8226a86 # Parent be1034183e03290f8fca825736604033aef122c6 shrinkJSON.sh: minimise "jq ." output test1.sh: fix lrand regression [_]findPDFs.sh: extract responses with application/pdf Content-Type diff -r be1034183e03 -r bba589cab837 master/bin/shrinkJSON.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/master/bin/shrinkJSON.sh Fri Oct 12 08:51:50 2018 +0000 @@ -0,0 +1,3 @@ +#!/bin/bash +# Shrink json output from worker to remove unnecessary characters +egrep -v '^#' $1| jq . |sed 's/^ *//;s/^"\([-0-9A-Za-z]*\)": "\?/\1:/;s/"\?,\?$//' diff -r be1034183e03 -r bba589cab837 workers/bin/_findPDFs.sh --- a/workers/bin/_findPDFs.sh Wed Oct 10 11:28:21 2018 +0000 +++ b/workers/bin/_findPDFs.sh Fri Oct 12 08:51:50 2018 +0000 @@ -1,4 +1,4 @@ #!/bin/bash echo "# $(date) > $ID.$1" -fgrep msgtype=response|egrep '"Headers":{[^{]*"Content-Type":"application/pdf"' | jq . +fgrep msgtype=response|egrep '"Headers":{[^{]*"Content-Type":"application/pdf"' echo "# $(date) < $ID.$1" diff -r be1034183e03 -r bba589cab837 workers/bin/findPDFs.sh --- a/workers/bin/findPDFs.sh Wed Oct 10 11:28:21 2018 +0000 +++ b/workers/bin/findPDFs.sh Fri Oct 12 08:51:50 2018 +0000 @@ -7,9 +7,13 @@ #set -e -o pipefail echo $$ > test1.pid proc=$1 -res=res +res=res$proc home=$2 shift 2 +function lrand { +# cheap bad little random number generator +echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1))) +} if [ "$1" = "-t" ] then shift @@ -19,10 +23,6 @@ fi wp=$1 touch .running -function lrand { -# cheap bad little random number generator -echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1))) -} function tryRead { m=0 set -o pipefail @@ -39,15 +39,13 @@ } trap "{ #set -e -o pipefail - cd $res ln -s ../nohup.cc . - tar -czhf - * | \ + tar -czhf - CC* $res | \ ssh -o StrictHostKeyChecking=no -q $home \"{ cd data mkdir -p pdf/wat cd pdf/wat tar -xzf - ; } 2>>errs\" - cd - rm -rf $res + rm -rf $res CC* ifile.txt *.pid ( sleep 5 ; rm nohup.cc ) & }" EXIT mkdir -p $res @@ -68,11 +66,12 @@ if [ -s crawl$id ] then echo \# $(date) $id $(wc -l crawl$id) >> $log - parallel --round-robin --pipe -j $wp "_findPDFs.sh {#} >> $res/$cci 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2 + parallel --round-robin --pipe --block-size 2M -j $wp "_findPDFs.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2 else echo "crawl$id empty" 1>&2 fi rm crawl$id + cat $res/$cci.* > $cci done < ifile.txt 2>> $res/errs || pRes=$? echo \# $(date) main loop exit code=$pRes >> $log rm .running diff -r be1034183e03 -r bba589cab837 workers/bin/test1.sh --- a/workers/bin/test1.sh Wed Oct 10 11:28:21 2018 +0000 +++ b/workers/bin/test1.sh Fri Oct 12 08:51:50 2018 +0000 @@ -10,6 +10,10 @@ res=res$proc home=$2 shift 2 +function lrand { +# cheap bad little random number generator +echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1))) +} if [ "$1" = "-t" ] then shift @@ -19,10 +23,6 @@ fi wp=$1 touch .running -function lrand { -# cheap bad little random number generator -echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1))) -} function tryRead { m=0 set -o pipefail