Mercurial > hg > cc > cirrus_home
changeset 43:4b574613200c
added computation of required additions to tar file, but not actually added
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sat, 04 Apr 2020 15:31:58 +0100 |
parents | 8ff97ea0ba2c |
children | abc1b05996c9 |
files | bin/reExtract.sh |
diffstat | 1 files changed, 41 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/reExtract.sh Fri Apr 03 19:04:06 2020 +0100
+++ b/bin/reExtract.sh Sat Apr 04 15:31:58 2020 +0100
@@ -3,12 +3,26 @@
 # Input is list of paths to relative path of warc files
 # under /beegfs/common_crawl/CC-MAIN-$1
 
+function sus ()
+{
+  sort "$@" | uniq -c | sort -k1nr,1
+}
+
 function edex () {
   echo "$1" | \
     cut -f 5 -d - | cut -f 1 -d .|sed 's/^00//' | \
     echo $(($(fgrep -n -f - ~/by11n.txt | cut -f 1 -d :) - 1))
 }
 
+function join_by () {
+  # Courtesy of https://stackoverflow.com/a/17841619/2595465
+  local d=$1
+  shift
+  echo -n "$1"
+  shift
+  printf "%s" "${@/#/$d}"
+}
+
 h=/beegfs/common_crawl/CC-MAIN-$1
 
 mkdir -p /dev/shm/rex
@@ -30,14 +44,38 @@
   egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u | \
   while read e # this could be parallel
   do
-    egrep "^$e\b$s\b" ../in.txt|cut -f 4 | \
+    lsf=lsl${e}.txt
+    rm -f $lsf
+    set -f
+    ii=($(egrep "^$e\\s$s\\b" ../in.txt|cut -f 4 | \
       while read i # this could be parallel
       do
+        set +f
         id=${p#CC-MAIN-*}-00$i
         lf=logs/${jobid}_${i}_log
         unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
-      done
-    # now compare (logs? ls vs. tar -tvf ?)
+        ls -l ${id}_* | tr -s ' ' '\011' |cut -f 5,9 >> $lsf
+        set -f
+        echo "*-00${i}_*"
+      done))
+    # now compare ls vs. tar
+    tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \
+      tr -s ' ' '\011' |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \
+      - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt
+    cut -f 1 -d ' ' ${e}_diff.txt | sus > ${e}_check.txt
+    ni=${#ii[@]}
+    if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
+    then
+      echo "extra lines in ${e}_check.txt" 1>&2
+      break
+    fi
+    if [ $(fgrep -c a ${e}_check.txt) -ne $ni ]
+    then
+      echo "non-addition lines in ${e}_check.txt" 1>&2
+      break
+    fi
+    egrep '^> ' ${e}_diff.txt | cut -f 2 > ${e}_new.txt
+    set +f
   done
   cd ..
 done