Mercurial > hg > cc > cirrus_home
view bin/preExtract.sh @ 64:0520ee00e35b
misc
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 24 Apr 2020 19:57:16 +0100 |
parents | 8154560f1e3d |
children |
line wrap: on
line source
#!/usr/bin/bash
# Usage: reExtract.sh 20..-.. < files...
#  Input is list of paths to relative path of warc files
#   under /beegfs/common_crawl/CC-MAIN-$1
#
# Outline (all scratch work happens under /dev/shm/rex):
#  1. Each stdin line is split at its first "-00", has ".warc.gz" removed and
#     is read as three whitespace-separated fields s, p, i.  The fields are
#     written to in.txt as four tab-separated columns: $(edex i), s, p, i.
#  2. GNU parallel runs one job per distinct (s, p) pair, 16 at a time.
#     Each job works in /dev/shm/rex/$s and writes progress to ./log there.
#  3. A job loops over every first-column value e that in.txt associates
#     with s, and for each e over every i listed for (e, s):
#       - if logs/<jobid>_<i>_log is non-empty the extraction is skipped,
#         otherwise the warc file is piped through warc.sh (application/pdf);
#       - size and name of the resulting <id>_* files go to lsl<e>.txt.
#  4. lsl<e>.txt is diffed against the matching listing of extract_<e>.tar.
#     If they differ (and nogood.py, when consulted, does not succeed) the
#     old warc.sh logs are deleted from the tar and the files on "> " diff
#     lines plus the current logs are appended.
#
# NOTE(review): the inner loop selects rows of in.txt by e and s only, while
#   id is built from the p of the current job -- confirm that is intended
#   when one s has several distinct p values.
# NOTE(review): jobs sharing the same s also share (and truncate) the same
#   log file -- confirm.
# NOTE(review): the tar -t member arguments are glob patterns; recent GNU tar
#   only treats them as patterns when --wildcards is in effect (e.g. via
#   TAR_OPTIONS) -- confirm on the target system.
# NOTE(review): this file is checked in as bin/preExtract.sh; the name in the
#   usage line above may be stale.

# sus [FILE...]: count distinct lines of the input, most frequent first.
function sus () {
    sort "$@" | uniq -c | sort -k1nr,1
}

# edex KEY: print (line number of the line in ~/by11n.txt containing the
# fixed string KEY) minus 1.
function edex () {
    echo $(($(grep -F -n -- "$1" ~/by11n.txt | cut -f 1 -d :) - 1))
}

# join_by SEP ITEM...: print the ITEMs joined by SEP, no trailing newline.
function join_by () {
    # Courtesy of https://stackoverflow.com/a/17841619/2595465
    local d=$1
    shift
    echo -n "$1"
    shift
    printf "%s" "${@/#/$d}"
}

# Without the crawl id every path below would point at .../CC-MAIN- itself.
if [ $# -lt 1 ] || [ -z "$1" ]
then
  echo "Usage: ${0##*/} 20..-.. < files..." >&2
  exit 2
fi

h=/beegfs/common_crawl/CC-MAIN-$1

mkdir -p /dev/shm/rex
# in.txt and all per-segment directories are created relative to this.
cd /dev/shm/rex || exit 1

# Build in.txt: one row per input line, columns = edex(i), s, p, i.
sed 's/-00/ /;s/\.warc\.gz//' | \
  while read -r s p i
  do
    printf "%s\t%s\t%s\t%s\n" "$(edex "$i")" "$s" "$p" "$i"
  done > in.txt

# Feed the distinct (s, p) pairs to parallel, one value per line, two per job.
# The job body is a single-quoted string run by the shell parallel spawns,
# with the value of h prepended as an assignment.
cut -f 2,3 in.txt | sort -u | tr '\011' '\012' | \
  parallel --will-cite -j 16 -N 2 h="$h"'
function sus () {
    sort "$@" | uniq -c | sort -k1nr,1 ;
}
s={1}
p={2}
mkdir -p "/dev/shm/rex/$s/logs"
# Stop this job if the per-segment directory is unavailable, so that
# nothing below writes or deletes in the wrong place.
cd "/dev/shm/rex/$s" || exit 1
echo $(date) starting $s/$p > log
# Sigh, should not have used this in the extraction ...
jobid=$(tar -tf "${h}/${s}/extract_0.tar" logs/ | head -1 | \
         cut -f 2 -d / | cut -f 1 -d _)
for e in $(grep -E "\b$s\b" ../in.txt | cut -f 1 | sort -u) # this could be parallel
do
  echo $(date) begin extract: $e >> log
  lsf=lsl${e}.txt
  rm -f "$lsf"
  lff=()
  ii=()
  for i in $(grep -E "^$e\\s$s\\b" ../in.txt | cut -f 4) # this could be parallel
  do
    id=${p#CC-MAIN-*}-00$i
    echo " " "$id" >> log
    lf=logs/${jobid}_${i}_log
    lff+=("${lf}") # accumulate list of warc.sh log files
    if [ -s "$lf" ]
    then
      echo " " $lf not empty, skipping extraction >> log
    else
      echo " " extracting from $id >> log
      echo starting ${id} $(date) > "$lf"
      unpigz -dp 1 -c "${h}/${s}/CC-MAIN-${id}.warc.gz" | \
        "$HOME/lib/valhalla/bin/warc.sh" "${id}" application/pdf 2>> "$lf"
      echo finished ${id} $(date) >> "$lf"
    fi
    # size and name of every file extracted for this warc
    ls -l "${id}"_* | tr -s " " "\t" | cut -f 5,9 >> "$lsf"
    ii+=("*-00${i}_*")
    echo " " "$i" "${ii[@]}" ${#ii[@]} >> log
  done
  echo " " extractions done >> log
  # now compare ls vs. tar
  echo " " "${ii[@]}" ${#ii[@]} >> log
  echo " " lff "${lff[@]}" >> log
  tar -tvf "${h}/${s}/extract_${e}.tar" "${ii[@]}" | \
    tr -s " " "\t" | cut -f 3,6 | sort -k2.1,2.36 -k2.38,2n | \
    diff -bw - <(sort -k2.1,2.36 -k2.38,2n "$lsf") > "${e}_diff.txt"
  if [ -s "${e}_diff.txt" ]
  then
    echo " " checking... >> log
    cut -f 1 -d " " "${e}_diff.txt" | sus > "${e}_check.txt"
    ni=${#ii[@]}
    if [ "$(wc -l < "${e}_check.txt")" -ne $((ni + 1)) ]
    then
      # unexpected diff shape: let nogood.py decide whether to skip this e
      if grep -E -n "^[1-9]" "${e}_diff.txt" | \
         "$HOME/bin/nogood.py" "$e" "$ni" 2>> log
      then
        continue
      fi
    fi
    echo " " starting tar update >> log
    grep -E "^> " "${e}_diff.txt" | cut -f 2 > "${e}_new.txt"
    tar --delete -f "${h}/${s}/extract_${e}.tar" "${lff[@]}"
    tar --append -f "${h}/${s}/extract_${e}.tar" --files-from="${e}_new.txt" "${lff[@]}"
  else
    echo "no diff, no update" $e >> log
  fi
  echo end extract: $e >> log
done
'