comparison bin/reExtract.sh @ 43:4b574613200c

Added computation of the files that need to be added to the tar file; the additions themselves are not yet performed
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sat, 04 Apr 2020 15:31:58 +0100
parents 8ff97ea0ba2c
children abc1b05996c9
comparison
equal deleted inserted replaced
42:8ff97ea0ba2c 43:4b574613200c
1 #!/usr/bin/bash 1 #!/usr/bin/bash
2 # Usage: reExtract.sh 20..-.. < files... 2 # Usage: reExtract.sh 20..-.. < files...
3 # Input is list of paths to relative path of warc files 3 # Input is list of paths to relative path of warc files
4 # under /beegfs/common_crawl/CC-MAIN-$1 4 # under /beegfs/common_crawl/CC-MAIN-$1
5
6 function sus ()
7 {
8 sort "$@" | uniq -c | sort -k1nr,1
9 }
5 10
6 function edex () { 11 function edex () {
7 echo "$1" | \ 12 echo "$1" | \
8 cut -f 5 -d - | cut -f 1 -d .|sed 's/^00//' | \ 13 cut -f 5 -d - | cut -f 1 -d .|sed 's/^00//' | \
9 echo $(($(fgrep -n -f - ~/by11n.txt | cut -f 1 -d :) - 1)) 14 echo $(($(fgrep -n -f - ~/by11n.txt | cut -f 1 -d :) - 1))
10 } 15 }
16
17 function join_by () {
18 # Courtesy of https://stackoverflow.com/a/17841619/2595465
19 local d=$1
20 shift
21 echo -n "$1"
22 shift
23 printf "%s" "${@/#/$d}"
24 }
11 25
12 h=/beegfs/common_crawl/CC-MAIN-$1 26 h=/beegfs/common_crawl/CC-MAIN-$1
13 27
14 mkdir -p /dev/shm/rex 28 mkdir -p /dev/shm/rex
15 cd /dev/shm/rex 29 cd /dev/shm/rex
28 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \ 42 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
29 cut -f 2 -d / |cut -f 1 -d _) 43 cut -f 2 -d / |cut -f 1 -d _)
30 egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u | \ 44 egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u | \
31 while read e # this could be parallel 45 while read e # this could be parallel
32 do 46 do
33 egrep "^$e\b$s\b" ../in.txt|cut -f 4 | \ 47 lsf=lsl${e}.txt
48 rm -f $lsf
49 set -f
50 ii=($(egrep "^$e\\s$s\\b" ../in.txt|cut -f 4 | \
34 while read i # this could be parallel 51 while read i # this could be parallel
35 do 52 do
53 set +f
36 id=${p#CC-MAIN-*}-00$i 54 id=${p#CC-MAIN-*}-00$i
37 lf=logs/${jobid}_${i}_log 55 lf=logs/${jobid}_${i}_log
38 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf 56 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
39 done 57 ls -l ${id}_* | tr -s ' ' '\011' |cut -f 5,9 >> $lsf
40 # now compare (logs? ls vs. tar -tvf ?) 58 set -f
59 echo "*-00${i}_*"
60 done))
61 # now compare ls vs. tar
62 tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \
63 tr -s ' ' '\011' |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \
64 - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt
65 cut -f 1 -d ' ' ${e}_diff.txt | sus > ${e}_check.txt
66 ni=${#ii[@]}
67 if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
68 then
69 echo "extra lines in ${e}_check.txt" 1>&2
70 break
71 fi
72 if [ $(fgrep -c a ${e}_check.txt) -ne $ni ]
73 then
74 echo "non-addition lines in ${e}_check.txt" 1>&2
75 break
76 fi
77 egrep '^> ' ${e}_diff.txt | cut -f 2 > ${e}_new.txt
78 set +f
41 done 79 done
42 cd .. 80 cd ..
43 done 81 done