changeset 13:bba589cab837

shrinkJSON.sh: minimise "jq ." output; test1.sh: fix lrand regression; [_]findPDFs.sh: extract responses with application/pdf Content-Type
author Henry S. Thompson <ht@markup.co.uk>
date Fri, 12 Oct 2018 08:51:50 +0000
parents be1034183e03
children c1c8275bd194
files master/bin/shrinkJSON.sh workers/bin/_findPDFs.sh workers/bin/findPDFs.sh workers/bin/test1.sh
diffstat 4 files changed, 17 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/master/bin/shrinkJSON.sh	Fri Oct 12 08:51:50 2018 +0000
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Shrink json output from worker to remove unnecessary characters
+egrep -v '^#' $1| jq  .  |sed 's/^  *//;s/^"\([-0-9A-Za-z]*\)": "\?/\1:/;s/"\?,\?$//'
--- a/workers/bin/_findPDFs.sh	Wed Oct 10 11:28:21 2018 +0000
+++ b/workers/bin/_findPDFs.sh	Fri Oct 12 08:51:50 2018 +0000
@@ -1,4 +1,4 @@
 #!/bin/bash
 echo "# $(date) > $ID.$1"
-fgrep msgtype=response|egrep '"Headers":{[^{]*"Content-Type":"application/pdf"' | jq .
+fgrep msgtype=response|egrep '"Headers":{[^{]*"Content-Type":"application/pdf"'
 echo "# $(date) < $ID.$1"
--- a/workers/bin/findPDFs.sh	Wed Oct 10 11:28:21 2018 +0000
+++ b/workers/bin/findPDFs.sh	Fri Oct 12 08:51:50 2018 +0000
@@ -7,9 +7,13 @@
 #set -e -o pipefail
 echo $$ > test1.pid
 proc=$1
-res=res
+res=res$proc
 home=$2
 shift 2
+function lrand {
+# cheap bad little random number generator
+echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1)))
+}
 if [ "$1" = "-t" ]
 then
  shift
@@ -19,10 +23,6 @@
 fi
 wp=$1
 touch .running
-function lrand {
-# cheap bad little random number generator
-echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1)))
-}
 function tryRead {
 m=0
 set -o pipefail
@@ -39,15 +39,13 @@
 }
 trap "{ 
   #set -e -o pipefail
-  cd $res
   ln -s ../nohup.cc .
-  tar -czhf - * | \
+  tar -czhf - CC* $res | \
    ssh -o StrictHostKeyChecking=no -q $home \"{ cd data
                     mkdir -p pdf/wat
                     cd pdf/wat
                     tar -xzf - ; } 2>>errs\"
-  cd
-  rm -rf $res
+  rm -rf $res CC* ifile.txt *.pid
   ( sleep 5 ; rm nohup.cc ) &
   }" EXIT
 mkdir -p $res
@@ -68,11 +66,12 @@
  if [ -s crawl$id ]
  then
   echo \# $(date) $id $(wc -l crawl$id) >> $log
-  parallel --round-robin --pipe -j $wp "_findPDFs.sh {#} >> $res/$cci 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
+  parallel --round-robin --pipe --block-size 2M -j $wp "_findPDFs.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
  else
   echo "crawl$id empty" 1>&2
  fi
  rm crawl$id
+ cat $res/$cci.* > $cci
 done < ifile.txt 2>> $res/errs || pRes=$?
 echo \# $(date) main loop exit code=$pRes >> $log
 rm .running
--- a/workers/bin/test1.sh	Wed Oct 10 11:28:21 2018 +0000
+++ b/workers/bin/test1.sh	Fri Oct 12 08:51:50 2018 +0000
@@ -10,6 +10,10 @@
 res=res$proc
 home=$2
 shift 2
+function lrand {
+# cheap bad little random number generator
+echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1)))
+}
 if [ "$1" = "-t" ]
 then
  shift
@@ -19,10 +23,6 @@
 fi
 wp=$1
 touch .running
-function lrand {
-# cheap bad little random number generator
-echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1)))
-}
 function tryRead {
 m=0
 set -o pipefail