comparison bin/preExtract.sh @ 48:307e0c44925a

log more, work around more glitches
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 10 Apr 2020 18:22:48 +0100
parents 2e5b3439a2ed
children 18f8bcc779e8
comparison
equal deleted inserted replaced
47:81ff28478276 48:307e0c44925a
36 function sus () { sort "$@" | uniq -c | sort -k1nr,1 ; } 36 function sus () { sort "$@" | uniq -c | sort -k1nr,1 ; }
37 s={1} 37 s={1}
38 p={2} 38 p={2}
39 mkdir -p $s/logs 39 mkdir -p $s/logs
40 cd $s 40 cd $s
41 echo $(date) starting $s/$p > log
41 # Sigh, should not have used this in the extraction ... 42 # Sigh, should not have used this in the extraction ...
42 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \ 43 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
43 cut -f 2 -d / |cut -f 1 -d _) 44 cut -f 2 -d / |cut -f 1 -d _)
44 for e in $(egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u) 45 for e in $(egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u)
45 # this could be parallel 46 # this could be parallel
46 do 47 do
48 echo $(date) begin extract: $e >> log
47 lsf=lsl${e}.txt 49 lsf=lsl${e}.txt
48 rm -f $lsf 50 rm -f $lsf
49 lff=() 51 lff=()
50 ii=() 52 ii=()
51 for i in $(egrep "^$e\\s$s\\b" ../in.txt|cut -f 4) 53 for i in $(egrep "^$e\\s$s\\b" ../in.txt|cut -f 4)
52 # this could be parallel 54 # this could be parallel
53 do 55 do
54 id=${p#CC-MAIN-*}-00$i 56 id=${p#CC-MAIN-*}-00$i
55 echo "$id" 1>&2 57 echo " " "$id" >> log
56 lf=logs/${jobid}_${i}_log 58 lf=logs/${jobid}_${i}_log
57 lff+=("${lf}") # accumulate list of log files 59 lff+=("${lf}") # accumulate list of log files
58 #echo starting ${id} $(date) > $lf 60 if [ -s $lf ]
59 #unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf 61 then
60 #echo finished ${id} $(date) >> $lf 62 echo " " $lf empty, skipping >> log
63 else
64 echo " " extracting from $id >> log
65 echo starting ${id} $(date) > $lf
66 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
67 echo finished ${id} $(date) >> $lf
68 fi
61 ls -l ${id}_* | tr -s " " "\t" |cut -f 5,9 >> $lsf 69 ls -l ${id}_* | tr -s " " "\t" |cut -f 5,9 >> $lsf
62 ii+=("*-00${i}_*") 70 ii+=("*-00${i}_*")
63 echo "$i" "${ii[@]}" ${#ii[@]} 1>&2 71 echo " " "$i" "${ii[@]}" ${#ii[@]} >> log
64 done 72 done
73 echo " " extractions done
65 # now compare ls vs. tar 74 # now compare ls vs. tar
66 echo "${ii[@]}" ${#ii[@]} 1>&2 75 echo " " "${ii[@]}" ${#ii[@]} >> log
67 echo lff "${lff[@]}" 1>&2 76 echo " " lff "${lff[@]}" >> log
68 tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \ 77 tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \
69 tr -s " " "\t" |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \ 78 tr -s " " "\t" |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \
70 - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt 79 - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt
71 if [ -s ${e}_diff.txt ] 80 if [ -s ${e}_diff.txt ]
72 then 81 then
82 echo " " checking...
73 cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt 83 cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt
74 ni=${#ii[@]} 84 ni=${#ii[@]}
75 if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ] 85 if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
76 then 86 then
77 if [[ "$(tr -s '\n\t ' ' ' < ${e}_check.txt)" =~ \ 87 if [[ "$(tr -s '\n\t ' ' ' < ${e}_check.txt)" =~ \
78 ^' '[0-9]*' > 1 < 1 --- 1 '[0-9]*c[0-9]*' 1 '[0-9]*'a'[0-9,]*' '$ ]] 88 ^' '[0-9]*' > 1 < 1 --- 1 '[0-9]*c[0-9]*' 1 '[0-9]*'a'[0-9,]*' '$ ]]
79 then 89 then
80 : 90 :
81 else 91 else
82 echo "extra lines in ${e}_check.txt" 1>&2 92 echo " " "extra lines in ${e}_check.txt" >> log
83 cd .. 93 cd ..
84 continue 94 continue
85 fi 95 fi
86 elif [ $(fgrep -c a ${e}_check.txt) -ne $ni ] 96 elif [ $(fgrep -c a ${e}_check.txt) -ne $ni ]
87 then 97 then
88 echo "non-addition lines in ${e}_check.txt" 1>&2 98 echo " " "non-addition lines in ${e}_check.txt" >> log
89 cd .. 99 cd ..
90 continue 100 continue
91 fi 101 fi
102 echo " " starting tar update
92 egrep "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt 103 egrep "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt
93 tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}" 104 tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}"
94 tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}" 105 tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}"
95 else 106 else
96 echo "no diff, no update" $e 107 echo "no diff, no update" $e >> log
97 fi 108 fi
109 echo end extract: $e >> log
98 done 110 done
99 cd .. 111 cd ..
100 ' 112 '