diff workers/bin/findPDFs.sh @ 13:bba589cab837

shrinkJSON.sh: minimise "jq ." output; test1.sh: fix lrand regression; [_]findPDFs.sh: extract responses with application/pdf Content-Type
author Henry S. Thompson <ht@markup.co.uk>
date Fri, 12 Oct 2018 08:51:50 +0000
parents be1034183e03
children
line wrap: on
line diff
--- a/workers/bin/findPDFs.sh	Wed Oct 10 11:28:21 2018 +0000
+++ b/workers/bin/findPDFs.sh	Fri Oct 12 08:51:50 2018 +0000
@@ -7,9 +7,13 @@
 #set -e -o pipefail
 echo $$ > test1.pid
 proc=$1
-res=res
+res=res$proc
 home=$2
 shift 2
+function lrand {
+# cheap bad little random number generator
+echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1)))
+}
 if [ "$1" = "-t" ]
 then
  shift
@@ -19,10 +23,6 @@
 fi
 wp=$1
 touch .running
-function lrand {
-# cheap bad little random number generator
-echo $(( 1 + ($(openssl rand 1 | od -d | head -1 | tr -s ' ' | cut -f 2 -d ' ') % $1)))
-}
 function tryRead {
 m=0
 set -o pipefail
@@ -39,15 +39,13 @@
 }
 trap "{ 
   #set -e -o pipefail
-  cd $res
   ln -s ../nohup.cc .
-  tar -czhf - * | \
+  tar -czhf - CC* $res | \
    ssh -o StrictHostKeyChecking=no -q $home \"{ cd data
                     mkdir -p pdf/wat
                     cd pdf/wat
                     tar -xzf - ; } 2>>errs\"
-  cd
-  rm -rf $res
+  rm -rf $res CC* ifile.txt *.pid
   ( sleep 5 ; rm nohup.cc ) &
   }" EXIT
 mkdir -p $res
@@ -68,11 +66,12 @@
  if [ -s crawl$id ]
  then
   echo \# $(date) $id $(wc -l crawl$id) >> $log
-  parallel --round-robin --pipe -j $wp "_findPDFs.sh {#} >> $res/$cci 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
+  parallel --round-robin --pipe --block-size 2M -j $wp "_findPDFs.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
  else
   echo "crawl$id empty" 1>&2
  fi
  rm crawl$id
+ cat $res/$cci.* > $cci
 done < ifile.txt 2>> $res/errs || pRes=$?
 echo \# $(date) main loop exit code=$pRes >> $log
 rm .running