changeset 47:2a0dab424418

cci path hack changed for 2018.04
author Henry S. Thompson <ht@markup.co.uk>
date Mon, 10 Dec 2018 14:43:18 +0000
parents 7a4e49689935
children 3b951980206d
files workers/bin/ptimedWhich.sh
diffstat 1 files changed, 2 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/workers/bin/ptimedWhich.sh	Mon Dec 03 21:10:02 2018 +0000
+++ b/workers/bin/ptimedWhich.sh	Mon Dec 10 14:43:18 2018 +0000
@@ -66,7 +66,8 @@
 while read s
 do
  url="https://commoncrawl.s3.amazonaws.com/$s"
- cci=$(echo $s | tr '/-' ' ' | awk '{print $3,$4,$8,$13}' |tr ' ' \-)
+ # The last awk field below is $14 for 2018-04 paths; for 2017-04 paths use $13 instead of $14
+ cci=$(echo $s | tr '/-' ' ' | awk '{print $3,$4,$8,$14}' |tr ' ' \-)
  echo $url /var/data/$cci
 done < ifile.txt 2>> $res/errs | \
  parallel --pipe -N$((N / wp)) -j $wp "bash -c \"tryread 2>>$res/errs{#}\"" 2>>$res/errs || pRes=$?