Mercurial > hg > cc > cirrus_home
comparison bin/preExtract.sh @ 49:18f8bcc779e8
as running, modulo 1 log output wrong
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 10 Apr 2020 18:42:08 +0100 |
parents | 307e0c44925a |
children | 5de261eb0deb |
comparison legend: equal | deleted | inserted | replaced
48:307e0c44925a | 49:18f8bcc779e8 |
---|---|
57 echo " " "$id" >> log | 57 echo " " "$id" >> log |
58 lf=logs/${jobid}_${i}_log | 58 lf=logs/${jobid}_${i}_log |
59 lff+=("${lf}") # accumulate list of log files | 59 lff+=("${lf}") # accumulate list of log files |
60 if [ -s $lf ] | 60 if [ -s $lf ] |
61 then | 61 then |
62 echo " " $lf empty, skipping >> log | 62 echo " " $lf not empty, skipping extraction >> log |
63 else | 63 else |
64 echo " " extracting from $id >> log | 64 echo " " extracting from $id >> log |
65 echo starting ${id} $(date) > $lf | 65 echo starting ${id} $(date) > $lf |
66 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf | 66 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf |
67 echo finished ${id} $(date) >> $lf | 67 echo finished ${id} $(date) >> $lf |
82 echo " " checking... | 82 echo " " checking... |
83 cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt | 83 cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt |
84 ni=${#ii[@]} | 84 ni=${#ii[@]} |
85 if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ] | 85 if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ] |
86 then | 86 then |
87 if [[ "$(tr -s '\n\t ' ' ' < ${e}_check.txt)" =~ \ | 87 if [[ "$(tr -s "\n\t " " " < ${e}_check.txt)" =~ \ |
88 ^' '[0-9]*' > 1 < 1 --- 1 '[0-9]*c[0-9]*' 1 '[0-9]*'a'[0-9,]*' '$ ]] | 88 ^" "[0-9]*" > 1 < 1 --- 1 "[0-9]*c[0-9]*" 1 "[0-9]*"a"[0-9,]*" "$ ]] |
89 then | 89 then |
90 : | 90 : |
91 else | 91 else |
92 echo " " "extra lines in ${e}_check.txt" >> log | 92 echo " " "extra lines in ${e}_check.txt" >> log |
93 cd .. | 93 cd .. |