view bin/preExtract.sh @ 138:9ea12f7b304b

just barely working
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 23 Jul 2021 16:23:46 +0000
parents 8154560f1e3d
children
line wrap: on
line source

#!/usr/bin/bash
# Usage: reExtract.sh 20..-.. < files...
# Input (on stdin) is a list of paths of warc files, relative to
#  /beegfs/common_crawl/CC-MAIN-$1

# sus: "sort | uniq -c | sort" in one step.
# Arguments: passed straight through to the first sort (files and/or
#            options); with no file arguments, stdin is read.
# Output:    one line per distinct input line, prefixed by its count,
#            ordered by that count, largest first.
sus() {
    sort "$@" \
        | uniq -c \
        | sort -k1nr,1
}

# edex: print the zero-based index of the line of ~/by11n.txt that
# contains $1 as a fixed string.
# Arguments: $1 - the string to look for (matched literally, anywhere
#                 in a line)
# Output:    the matching line number minus one.  If several lines
#            match, the first one is used; if none matches (or the file
#            cannot be read), -1 is printed.
function edex () {
    local hit
    # -F replaces the deprecated fgrep; -e and the quotes keep arguments
    # with blanks or a leading "-" from being misread; -m 1 guarantees a
    # single number reaches the arithmetic expansion below.
    hit=$(grep -F -n -m 1 -e "$1" ~/by11n.txt)
    # hit is "<lineno>:<text>" or empty; an empty value leaves "- 1".
    echo $(( ${hit%%:*} - 1 ))
}

# join_by: print the remaining arguments joined by a delimiter.
# Based on https://stackoverflow.com/a/17841619/2595465
# Arguments: $1   - delimiter (any string, may be empty or multi-character)
#            $2.. - elements to join
# Output:    the joined string on stdout, without a trailing newline;
#            nothing at all when no elements are given.
function join_by () {
    local d=${1-}
    local item
    # Nothing to join: keep quiet.
    (( $# > 1 )) || return 0
    shift
    # printf rather than "echo -n", so that an element such as "-n" or
    # "-e" is printed instead of being taken as an echo option.
    printf '%s' "$1"
    shift
    # Explicit loop rather than ${@/#/$d}: since bash 5.2 an "&" in the
    # replacement text of a pattern substitution stands for the matched
    # text, which would corrupt delimiters containing "&".
    for item in "$@"
    do
        printf '%s%s' "$d" "$item"
    done
}

# Directory of the crawl named by the first command-line argument.
h=/beegfs/common_crawl/CC-MAIN-$1

# Scratch area.  in.txt is written here by a relative path and read
# back as /dev/shm/rex/<s>/../in.txt by the parallel jobs, so give up
# rather than carry on in whatever directory we were started from.
mkdir -p /dev/shm/rex || exit 1
cd /dev/shm/rex || exit 1

# Build in.txt from the path list on stdin.
# sed replaces the first "-00" of each line with a blank and removes the
# literal ".warc.gz" (dots escaped so they no longer match any
# character); the line is then split on whitespace into s, p and i.
# Each output line has four tab-separated fields:
#   <edex of i> <s> <p> <i>
sed 's/-00/ /;s/\.warc\.gz//' | \
 while read -r s p i
 do
    # Every value is quoted so that an empty one still occupies its own
    # column.  edex gets /dev/null as stdin so that, whatever it is
    # handed, it cannot consume the rest of the path list.
    printf "%s\t%s\t%s\t%s\n" "$(edex "$i" < /dev/null)" "$s" "$p" "$i"
 done > in.txt
# Run one re-extraction job per distinct (field 2, field 3) pair of in.txt.
#
# The pairs are made unique, then tr turns the tab between the two
# fields into a newline so that every value is on its own line.
# parallel takes them two at a time (-N 2) as {1} and {2}, running up
# to 16 jobs at once (-j 16).  The job text is h="$h" (expanded here,
# by this shell) immediately followed by a single-quoted script, so
# everything else in it is expanded only by the shell that runs the job.
# NOTE: because the job is one quoted string, anything added inside it,
# comments included, becomes part of what parallel substitutes and runs.
#
# What each job does, with s={1} and p={2}:
#   * defines its own copy of sus (presumably because functions of this
#     script are not visible to the job shell -- confirm);
#   * creates /dev/shm/rex/$s/logs, changes to /dev/shm/rex/$s and
#     starts a fresh file "log" there;
#   * sets jobid from the first logs/ member listed in
#     $h/$s/extract_0.tar: the part of the name after "logs/" up to the
#     first "_";
#   * for every distinct first-column value e of the in.txt lines that
#     match \b$s\b:
#       - for every i (fourth column) of the lines that start with e
#         followed by s:
#           . id is p with a leading "CC-MAIN-" removed, plus "-00$i";
#           . lf is logs/${jobid}_${i}_log; if lf is non-empty the
#             extraction is skipped, otherwise
#             $h/$s/CC-MAIN-$id.warc.gz is decompressed with unpigz into
#             $HOME/lib/valhalla/bin/warc.sh $id application/pdf, whose
#             stderr is appended to lf;
#           . fields 5 and 9 of "ls -l ${id}_*" (presumably size and
#             name) are appended to lsl$e.txt;
#           . the pattern "*-00${i}_*" is added to ii and lf to lff;
#       - the verbose listing of extract_$e.tar restricted to the ii
#         patterns is cut to fields 3 and 6 (presumably size and member
#         name), sorted, and diffed against the sorted lsl$e.txt; the
#         result goes to ${e}_diff.txt;
#       - if that diff is empty, "no diff, no update" is logged;
#       - otherwise the first blank-separated field of each diff line is
#         counted with sus into ${e}_check.txt.  When the number of
#         lines there is not (number of ii entries + 1), the diff lines
#         that start with a digit 1-9 are piped, with line numbers, to
#         $HOME/bin/nogood.py $e $ni; if that exits successfully the
#         loop moves on to the next e without touching the tar file (and
#         without logging "end extract");
#       - in every other case the second field of the diff lines that
#         start with "> " (entries present only on the lsl$e.txt side)
#         is written to ${e}_new.txt, the lff log files are deleted
#         from extract_$e.tar, and the files named in ${e}_new.txt plus
#         the lff log files are appended to it.
#
# NOTE(review): the ii entries are glob patterns handed to "tar -tvf" as
#   member names; GNU tar documents that member names are matched
#   literally unless --wildcards is in effect -- confirm this selects
#   the intended members.
# NOTE(review): the sort keys (-k2.1,2.36 -k2.38,2n) address fixed
#   character positions inside the name field, so they assume a
#   particular name layout -- confirm against the extracted file names.
cut -f 2,3 in.txt| sort -u | tr '\011' '\012' |\
 parallel --will-cite -j 16 -N 2 h="$h"'
    function sus () { sort "$@" | uniq -c | sort -k1nr,1 ; }
    s={1}
    p={2}
    mkdir -p /dev/shm/rex/$s/logs
    cd /dev/shm/rex/$s
    echo $(date) starting $s/$p > log
    # Sigh, should not have used this in the extraction ...
    jobid=$(tar -tf ${h}/${s}/extract_0.tar logs/ | head -1 | \
	     cut -f 2 -d / |cut -f 1 -d _)
    for e in $(egrep "\b$s\b" ../in.txt |cut -f 1 | sort -u)
     # this could be parallel
     do
        echo $(date) begin extract: $e >> log
	lsf=lsl${e}.txt
	rm -f $lsf
	lff=()
	ii=()
	for i in $(egrep "^$e\\s$s\\b" ../in.txt|cut -f 4)
	 # this could be parallel
	 do
	    id=${p#CC-MAIN-*}-00$i
	    echo " " "$id" >> log
	    lf=logs/${jobid}_${i}_log
	    lff+=("${lf}") # accumulate list of warc.sh log files
            if [ -s $lf ]
            then
              echo "  " $lf not empty, skipping extraction >> log
            else
              echo "  " extracting from $id >> log
	      echo starting ${id} $(date) > $lf
	      unpigz -dp 1 -c ${h}/${s}/CC-MAIN-${id}.warc.gz | $HOME/lib/valhalla/bin/warc.sh ${id} application/pdf 2>> $lf
              echo finished ${id} $(date) >> $lf
            fi
	    ls -l ${id}_* | tr -s " " "\t" |cut -f 5,9 >> $lsf
	    ii+=("*-00${i}_*")
	    echo "  " "$i" "${ii[@]}" ${#ii[@]} >> log
	 done
        echo " " extractions done >> log
	# now compare ls vs. tar
	echo " " "${ii[@]}"  ${#ii[@]} >> log
	echo " " lff "${lff[@]}" >> log
	tar -tvf ${h}/${s}/extract_${e}.tar "${ii[@]}" | \
	       tr -s " " "\t" |cut -f 3,6 |sort -k2.1,2.36 -k2.38,2n | diff -bw \
	    - <(sort -k2.1,2.36 -k2.38,2n $lsf) > ${e}_diff.txt
        if [ -s ${e}_diff.txt ]
        then
          echo " " checking... >> log
	  cut -f 1 -d " " ${e}_diff.txt | sus > ${e}_check.txt
	  ni=${#ii[@]}
	  if [ $(wc -l < ${e}_check.txt) -ne $((ni + 1)) ]
	  then
            if egrep -n "^[1-9]" ${e}_diff.txt | \
                  $HOME/bin/nogood.py $e $ni 2>> log
            then
              continue
            fi
	  fi
          echo " " starting tar update >> log
	  egrep "^> " ${e}_diff.txt | cut -f 2 > ${e}_new.txt
	  tar --delete -f ${h}/${s}/extract_${e}.tar "${lff[@]}"
	  tar --append -f ${h}/${s}/extract_${e}.tar --files-from=${e}_new.txt "${lff[@]}"
        else
          echo "no diff, no update" $e >> log
        fi
        echo end extract: $e >> log
     done   
'