comparison bin/preExtract.sh @ 49:18f8bcc779e8

as running, modulo 1 log output wrong
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 10 Apr 2020 18:42:08 +0100
parents 307e0c44925a
children 5de261eb0deb
comparison
legend: equal | deleted | inserted | replaced
48:307e0c44925a 49:18f8bcc779e8
57 echo " " "$id" >> log 57 echo " " "$id" >> log
58 lf=logs/${jobid}_${i}_log 58 lf=logs/${jobid}_${i}_log
59 lff+=("${lf}") # accumulate list of log files 59 lff+=("${lf}") # accumulate list of log files
60 if [ -s $lf ] 60 if [ -s $lf ]
61 then 61 then
62 echo " " $lf empty, skipping >> log 62 echo " " $lf not empty, skipping extraction >> log
63 else 63 else
64 echo " " extracting from $id >> log 64 echo " " extracting from $id >> log
65 echo starting ${id} $(date) > $lf 65 echo starting ${id} $(date) > $lf
66 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf 66 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
67 echo finished ${id} $(date) >> $lf 67 echo finished ${id} $(date) >> $lf
82 echo " " checking... 82 echo " " checking...
83 cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt 83 cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt
84 ni=${#ii[@]} 84 ni=${#ii[@]}
85 if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ] 85 if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
86 then 86 then
87 if [[ "$(tr -s '\n\t ' ' ' < ${e}_check.txt)" =~ \ 87 if [[ "$(tr -s "\n\t " " " < ${e}_check.txt)" =~ \
88 ^' '[0-9]*' > 1 < 1 --- 1 '[0-9]*c[0-9]*' 1 '[0-9]*'a'[0-9,]*' '$ ]] 88 ^" "[0-9]*" > 1 < 1 --- 1 "[0-9]*c[0-9]*" 1 "[0-9]*"a"[0-9,]*" "$ ]]
89 then 89 then
90 : 90 :
91 else 91 else
92 echo " " "extra lines in ${e}_check.txt" >> log 92 echo " " "extra lines in ${e}_check.txt" >> log
93 cd .. 93 cd ..