Mercurial > hg > cc > cirrus_home
changeset 45:bd0010ac88ce
parallelised version of reExtract.sh
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 08 Apr 2020 11:27:33 +0100 |
parents | abc1b05996c9 |
children | 2e5b3439a2ed |
files | bin/preExtract.sh |
diffstat | 1 files changed, 90 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/bash
# bin/preExtract.sh -- parallelised version of reExtract.sh
#
# Usage: preExtract.sh 20..-.. < files...
#   Input is a list of relative paths of warc files
#   under /beegfs/common_crawl/CC-MAIN-$1
#
# Outline:
#   1. Each input line is rewritten by sed (first "-00" becomes a space,
#      ".warc.gz" is removed) and read as three fields: s p i.
#      A tab-separated index  <edex(i)> <s> <p> <i>  is written to
#      /dev/shm/rex/in.txt.
#   2. For every distinct (s, p) pair one GNU parallel job (16 at a time)
#      re-runs the PDF extraction for the listed warc files, compares the
#      resulting files against the listing of extract_<e>.tar and, when only
#      additions are found, replaces the log files in the tar and appends
#      the new files.

# Sort, count duplicates, most frequent first.
sus() {
  sort "$@" | uniq -c | sort -k1nr,1
}

# Print (line number of $1 in ~/by11n.txt) - 1.
# The pattern is quoted so an empty argument cannot turn the file name into
# the pattern and make grep swallow the caller's stdin.
# NOTE(review): assumes $1 matches exactly one line of ~/by11n.txt; several
# matches make the arithmetic fail -- confirm against the file's format.
edex() {
  local lineno
  lineno=$(grep -F -n -- "$1" ~/by11n.txt | cut -f 1 -d :)
  echo $((lineno - 1))
}

# Join arguments $2... using $1 as the separator.
join_by() {
  # Courtesy of https://stackoverflow.com/a/17841619/2595465
  local d=$1
  shift
  printf '%s' "$1"
  shift
  printf '%s' "${@/#/$d}"
}

# Without the crawl id every path below (including the tar files that are
# modified in place) would be wrong, so refuse to run.
if [[ $# -lt 1 || -z $1 ]]; then
  echo "Usage: ${0##*/} 20..-.. < files..." >&2
  exit 2
fi

h=/beegfs/common_crawl/CC-MAIN-$1

mkdir -p /dev/shm/rex
cd /dev/shm/rex || {
  echo "cannot cd to /dev/shm/rex" >&2
  exit 1
}

# Build the index: <tar index> <s> <p> <i>, tab separated.
sed 's/-00/ /;s/\.warc\.gz//' |
  while read -r s p i; do
    printf '%s\t%s\t%s\t%s\n' "$(edex "$i")" "$s" "$p" "$i"
  done > in.txt

# Feed each distinct (s, p) pair to parallel as two consecutive lines (-N 2).
# Everything between the single quotes is the job body run by parallel; it
# must not contain a single quote.
cut -f 2,3 in.txt | sort -u | tr '\011' '\012' |
  parallel --will-cite -j 16 -N 2 h="$h"'
  function sus () { sort "$@" | uniq -c | sort -k1nr,1 ; }
  s={1}
  p={2}
  mkdir -p "$s/logs"
  cd "$s" || exit 1
  # Sigh, should not have used this in the extraction ...
  # The job id is recovered from the name of the first log in extract_0.tar
  jobid=$(tar -tf "${h}/${s}/extract_0.tar" logs/ | head -1 | \
            cut -f 2 -d / | cut -f 1 -d _)
  for e in $(grep -E "\b$s\b" ../in.txt | cut -f 1 | sort -u)
  # this could be parallel
  do
    lsf=lsl${e}.txt
    rm -f "$lsf"
    lff=()   # log files written for this tar index
    ii=()    # tar member patterns, one per warc file
    for i in $(grep -E "^$e\\s$s\\b" ../in.txt | cut -f 4)
    # this could be parallel
    do
      id=${p#CC-MAIN-*}-00$i
      echo "$id" 1>&2
      lf=logs/${jobid}_${i}_log
      lff+=("${lf}") # accumulate list of log files
      echo starting ${id} $(date) > "$lf"
      unpigz -dp 1 -c "${h}/${s}/CC-MAIN-${id}.warc.gz" | "$HOME/lib/valhalla/bin/warc.sh" "${id}" application/pdf 2>> "$lf"
      echo finished ${id} $(date) >> "$lf"
      # size and name of every file produced for this warc
      ls -l ${id}_* | tr -s " " "\t" | cut -f 5,9 >> "$lsf"
      ii+=("*-00${i}_*")
      echo "$i" "${ii[@]}" ${#ii[@]} 1>&2
    done
    # now compare ls vs. tar
    echo "${ii[@]}" ${#ii[@]} 1>&2
    echo lff "${lff[@]}" 1>&2
    # NOTE(review): GNU tar treats member arguments as literal names unless
    # --wildcards is given; confirm these patterns match as intended.
    tar -tvf "${h}/${s}/extract_${e}.tar" "${ii[@]}" | \
      tr -s " " "\t" | cut -f 3,6 | sort -k2.1,2.36 -k2.38,2n | diff -bw \
      - <(sort -k2.1,2.36 -k2.38,2n "$lsf") > "${e}_diff.txt"
    # Summarise the diff: first field is a hunk header or a ">" marker
    cut -f 1 -d " " "${e}_diff.txt" | sus > "${e}_check.txt"
    ni=${#ii[@]}
    # Expect one distinct hunk header per warc file plus one line for ">"
    if [ "$(wc -l < "${e}_check.txt")" -ne $((ni + 1)) ]
    then
      echo "extra lines in ${e}_check.txt" 1>&2
      break
    fi
    # ... and every hunk must be an addition
    if [ "$(grep -F -c a "${e}_check.txt")" -ne "$ni" ]
    then
      echo "non-addition lines in ${e}_check.txt" 1>&2
      break
    fi
    grep -E "^> " "${e}_diff.txt" | cut -f 2 > "${e}_new.txt"
    # Replace the old logs in the tar, then add the new files and logs
    tar --delete -f "${h}/${s}/extract_${e}.tar" "${lff[@]}"
    tar --append -f "${h}/${s}/extract_${e}.tar" --files-from="${e}_new.txt" "${lff[@]}"
  done
  # single exit path back to the parent directory, also after a break
  cd ..
'