Mercurial > hg > cc > cirrus_home
comparison bin/preExtract.sh @ 48:307e0c44925a
log more, work around more glitches
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 10 Apr 2020 18:22:48 +0100 |
parents | 2e5b3439a2ed |
children | 18f8bcc779e8 |
comparison
equal
deleted
inserted
replaced
47:81ff28478276 | 48:307e0c44925a |
---|---|
36 function sus () { sort "$@" | uniq -c | sort -k1nr,1 ; } | 36 function sus () { sort "$@" | uniq -c | sort -k1nr,1 ; } |
37 s={1} | 37 s={1} |
38 p={2} | 38 p={2} |
39 mkdir -p $s/logs | 39 mkdir -p $s/logs |
40 cd $s | 40 cd $s |
| 41 echo $(date) starting $s/$p > log |
41 # Sigh, should not have used this in the extraction ... | 42 # Sigh, should not have used this in the extraction ... |
42 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \ | 43 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \ |
43 cut -f 2 -d / |cut -f 1 -d _) | 44 cut -f 2 -d / |cut -f 1 -d _) |
44 for e in $(egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u) | 45 for e in $(egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u) |
45 # this could be parallel | 46 # this could be parallel |
46 do | 47 do |
| 48 echo $(date) begin extract: $e >> log |
47 lsf=lsl${e}.txt | 49 lsf=lsl${e}.txt |
48 rm -f $lsf | 50 rm -f $lsf |
49 lff=() | 51 lff=() |
50 ii=() | 52 ii=() |
51 for i in $(egrep "^$e\\s$s\\b" ../in.txt|cut -f 4) | 53 for i in $(egrep "^$e\\s$s\\b" ../in.txt|cut -f 4) |
52 # this could be parallel | 54 # this could be parallel |
53 do | 55 do |
54 id=${p#CC-MAIN-*}-00$i | 56 id=${p#CC-MAIN-*}-00$i |
55 echo "$id" 1>&2 | 57 echo " " "$id" >> log |
56 lf=logs/${jobid}_${i}_log | 58 lf=logs/${jobid}_${i}_log |
57 lff+=("${lf}") # accumulate list of log files | 59 lff+=("${lf}") # accumulate list of log files |
58 #echo starting ${id} $(date) > $lf | 60 if [ -s $lf ] |
59 #unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf | 61 then |
60 #echo finished ${id} $(date) >> $lf | 62 echo " " $lf empty, skipping >> log |
| 63 else |
| 64 echo " " extracting from $id >> log |
| 65 echo starting ${id} $(date) > $lf |
| 66 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf |
| 67 echo finished ${id} $(date) >> $lf |
| 68 fi |
61 ls -l ${id}_* | tr -s " " "\t" |cut -f 5,9 >> $lsf | 69 ls -l ${id}_* | tr -s " " "\t" |cut -f 5,9 >> $lsf |
62 ii+=("*-00${i}_*") | 70 ii+=("*-00${i}_*") |
63 echo "$i" "${ii[@]}" ${#ii[@]} 1>&2 | 71 echo " " "$i" "${ii[@]}" ${#ii[@]} >> log |
64 done | 72 done |
| 73 echo " " extractions done |
65 # now compare ls vs. tar | 74 # now compare ls vs. tar |
66 echo "${ii[@]}" ${#ii[@]} 1>&2 | 75 echo " " "${ii[@]}" ${#ii[@]} >> log |
67 echo lff "${lff[@]}" 1>&2 | 76 echo " " lff "${lff[@]}" >> log |
68 tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \ | 77 tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \ |
69 tr -s " " "\t" |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \ | 78 tr -s " " "\t" |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \ |
70 - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt | 79 - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt |
71 if [ -s ${e}_diff.txt ] | 80 if [ -s ${e}_diff.txt ] |
72 then | 81 then |
| 82 echo " " checking... |
73 cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt | 83 cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt |
74 ni=${#ii[@]} | 84 ni=${#ii[@]} |
75 if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ] | 85 if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ] |
76 then | 86 then |
77 if [[ "$(tr -s '\n\t ' ' ' < ${e}_check.txt)" =~ \ | 87 if [[ "$(tr -s '\n\t ' ' ' < ${e}_check.txt)" =~ \ |
78 ^' '[0-9]*' > 1 < 1 --- 1 '[0-9]*c[0-9]*' 1 '[0-9]*'a'[0-9,]*' '$ ]] | 88 ^' '[0-9]*' > 1 < 1 --- 1 '[0-9]*c[0-9]*' 1 '[0-9]*'a'[0-9,]*' '$ ]] |
79 then | 89 then |
80 : | 90 : |
81 else | 91 else |
82 echo "extra lines in ${e}_check.txt" 1>&2 | 92 echo " " "extra lines in ${e}_check.txt" >> log |
83 cd .. | 93 cd .. |
84 continue | 94 continue |
85 fi | 95 fi |
86 elif [ $(fgrep -c a ${e}_check.txt) -ne $ni ] | 96 elif [ $(fgrep -c a ${e}_check.txt) -ne $ni ] |
87 then | 97 then |
88 echo "non-addition lines in ${e}_check.txt" 1>&2 | 98 echo " " "non-addition lines in ${e}_check.txt" >> log |
89 cd .. | 99 cd .. |
90 continue | 100 continue |
91 fi | 101 fi |
| 102 echo " " starting tar update |
92 egrep "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt | 103 egrep "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt |
93 tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}" | 104 tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}" |
94 tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}" | 105 tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}" |
95 else | 106 else |
96 echo "no diff, no update" $e | 107 echo "no diff, no update" $e >> log |
97 fi | 108 fi |
| 109 echo end extract: $e >> log |
98 done | 110 done |
99 cd .. | 111 cd .. |
100 ' | 112 ' |