changeset 21:a214fd3a8001

Use /var/data for results; don't store the downloaded data in a file, but run the processing subprocess _inside_ tryRead
author Henry S. Thompson <ht@markup.co.uk>
date Sun, 21 Oct 2018 12:41:10 +0000
parents 0f4a0f4e38d4
children 60d4042dab26
files workers/bin/timedWhich.sh
diffstat 1 files changed, 18 insertions(+), 19 deletions(-) [+]
line wrap: on
line diff
--- a/workers/bin/timedWhich.sh	Sat Oct 20 16:13:58 2018 +0000
+++ b/workers/bin/timedWhich.sh	Sun Oct 21 12:41:10 2018 +0000
@@ -7,7 +7,7 @@
 #set -e -o pipefail
 echo $$ > test1.pid
 proc=$1
-res=res$proc
+res=/var/data/res$proc
 home=$2
 shift 2
 function lrand {
@@ -28,24 +28,26 @@
 set -o pipefail
 until if [ $((m+=1)) -gt 5 ]; then echo "tried to read $1 5 times w/o success, giving up" 1>&2; return 1; fi && \
   curl -s --insecure -o - "$1"| zcat | fgrep msgtype=response | \
-{ egrep -i '"WARC-Target-URI":"https?:' || : ;} > $2
+{ egrep -i '"WARC-Target-URI":"https?:' || : ;} | doit 
  do
   # try to avoid lockstep retries
   echo ${PIPESTATUS[@]} 1>&2
   sleep $(lrand 10)
-  echo $(date) $2 retry number $m 1>&2
+  echo $(date) retry number $m 1>&2
 done
 set +o pipefail
 }
 trap "{ 
-  #set -e -o pipefail
-  ln -s ../nohup.cc .
-  tar -czhf - CC* $res | \
+  set -e -o pipefail
+  cd /var/data
+  tar -czhf - CC* res* | \
    ssh -o StrictHostKeyChecking=no -q $home \"{ cd data
                     mkdir -p which
                     cd which
                     tar -xzf - ; } 2>>errs\"
-  rm -rf $res CC* ifile.txt *.pid
+  rm -rf res* CC*
+  cd
+  rm ifile.txt *.pid
   ( sleep 5 ; rm nohup.cc ) &
   }" EXIT
 mkdir -p $res
@@ -54,24 +56,21 @@
 sleep $pause
 echo \# $(date) >  $log
 pRes=0
+doit ()
+{
+echo -n "# $(date) $id " >> $log
+tee >(wc -l >> $log) |\
+ parallel --round-robin --pipe --block-size 2M -j $wp "_timedWhich.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
+}
 while read s id
 do
  url="https://commoncrawl.s3.amazonaws.com/$s"
- ccm=${s##*/wat/}
- cci=${ccm%%-ip*}
+ cci=$(echo $s | tr '/-' ' ' | awk '{print $3,$4,$8,$13}' |tr ' ' \-)
  export ID=$id
  echo \# $(date) "running |$proc|$home|$pause|$wp|$id|" >> $log
  # Experimental retry loop
- tryRead "$url" crawl$id
- if [ -s crawl$id ]
- then
-  echo \# $(date) $id $(wc -l crawl$id) >> $log
-  parallel --round-robin --pipe --block-size 2M -j $wp "_timedWhich.sh {#} >> $res/$cci.{#} 2>>$res/errs{#}" < crawl$id || echo "ppfailed $? ${PIPESTATUS[@]}" 1>&2
- else
-  echo "crawl$id empty" 1>&2
- fi
- rm crawl$id
- cat $res/$cci.* > $cci
+ tryRead "$url"
+ cat $res/$cci.* > /var/data/$cci
 done < ifile.txt 2>> $res/errs || pRes=$?
 echo \# $(date) main loop exit code=$pRes >> $log
 rm .running