Mercurial > hg > cc > cirrus_home
comparison bin/reExtract.sh @ 43:4b574613200c
Added computation of the additions required for the tar file, but the additions themselves are not yet actually made
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sat, 04 Apr 2020 15:31:58 +0100 |
parents | 8ff97ea0ba2c |
children | abc1b05996c9 |
comparison
equal
deleted
inserted
replaced
42:8ff97ea0ba2c | 43:4b574613200c |
---|---|
1 #!/usr/bin/bash | 1 #!/usr/bin/bash |
2 # Usage: reExtract.sh 20..-.. < files... | 2 # Usage: reExtract.sh 20..-.. < files... |
3 # Input is list of paths to relative path of warc files | 3 # Input is list of paths to relative path of warc files |
4 # under /beegfs/common_crawl/CC-MAIN-$1 | 4 # under /beegfs/common_crawl/CC-MAIN-$1 |
5 | |
6 function sus () | |
7 { | |
8 sort "$@" | uniq -c | sort -k1nr,1 | |
9 } | |
5 | 10 |
6 function edex () { | 11 function edex () { |
7 echo "$1" | \ | 12 echo "$1" | \ |
8 cut -f 5 -d - | cut -f 1 -d .|sed 's/^00//' | \ | 13 cut -f 5 -d - | cut -f 1 -d .|sed 's/^00//' | \ |
9 echo $(($(fgrep -n -f - ~/by11n.txt | cut -f 1 -d :) - 1)) | 14 echo $(($(fgrep -n -f - ~/by11n.txt | cut -f 1 -d :) - 1)) |
10 } | 15 } |
16 | |
17 function join_by () { | |
18 # Courtesy of https://stackoverflow.com/a/17841619/2595465 | |
19 local d=$1 | |
20 shift | |
21 echo -n "$1" | |
22 shift | |
23 printf "%s" "${@/#/$d}" | |
24 } | |
11 | 25 |
12 h=/beegfs/common_crawl/CC-MAIN-$1 | 26 h=/beegfs/common_crawl/CC-MAIN-$1 |
13 | 27 |
14 mkdir -p /dev/shm/rex | 28 mkdir -p /dev/shm/rex |
15 cd /dev/shm/rex | 29 cd /dev/shm/rex |
28 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \ | 42 jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \ |
29 cut -f 2 -d / |cut -f 1 -d _) | 43 cut -f 2 -d / |cut -f 1 -d _) |
30 egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u | \ | 44 egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u | \ |
31 while read e # this could be parallel | 45 while read e # this could be parallel |
32 do | 46 do |
33 egrep "^$e\b$s\b" ../in.txt|cut -f 4 | \ | 47 lsf=lsl${e}.txt |
48 rm -f $lsf | |
49 set -f | |
50 ii=($(egrep "^$e\\s$s\\b" ../in.txt|cut -f 4 | \ | |
34 while read i # this could be parallel | 51 while read i # this could be parallel |
35 do | 52 do |
53 set +f | |
36 id=${p#CC-MAIN-*}-00$i | 54 id=${p#CC-MAIN-*}-00$i |
37 lf=logs/${jobid}_${i}_log | 55 lf=logs/${jobid}_${i}_log |
38 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf | 56 unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf |
39 done | 57 ls -l ${id}_* | tr -s ' ' '\011' |cut -f 5,9 >> $lsf |
40 # now compare (logs? ls vs. tar -tvf ?) | 58 set -f |
59 echo "*-00${i}_*" | |
60 done)) | |
61 # now compare ls vs. tar | |
62 tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \ | |
63 tr -s ' ' '\011' |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \ | |
64 - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt | |
65 cut -f 1 -d ' ' ${e}_diff.txt | sus > ${e}_check.txt | |
66 ni=${#ii[@]} | |
67 if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ] | |
68 then | |
69 echo "extra lines in ${e}_check.txt" 1>&2 | |
70 break | |
71 fi | |
72 if [ $(fgrep -c a ${e}_check.txt) -ne $ni ] | |
73 then | |
74 echo "non-addition lines in ${e}_check.txt" 1>&2 | |
75 break | |
76 fi | |
77 egrep '^> ' ${e}_diff.txt | cut -f 2 > ${e}_new.txt | |
78 set +f | |
41 done | 79 done |
42 cd .. | 80 cd .. |
43 done | 81 done |