view bin/reExtract.sh @ 42:8ff97ea0ba2c

refactored, not tested
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 03 Apr 2020 19:04:06 +0100
parents 64227ff87e4e
children 4b574613200c
line wrap: on
line source

#!/usr/bin/bash
# Usage: reExtract.sh 20..-.. < files...
# Input is a list of paths to warc files, each relative to
#  /beegfs/common_crawl/CC-MAIN-$1

function edex () {
    # Print (line number - 1) of the line in ~/by11n.txt that contains the
    # key derived from $1.
    #   $1 - a string such as a warc file name; the key is its 5th
    #        '-'-separated field (the whole string if it has no '-'),
    #        truncated at the first '.', with one leading "00" removed.
    # The lookup is a fixed-string substring match.  Prints -1 when no line
    # matches; if several lines match, the arithmetic expansion fails.
    local key hit
    key=$(echo "$1" | cut -d - -f 5 | cut -d . -f 1 | sed 's/^00//')
    hit=$(grep -F -n -e "$key" ~/by11n.txt | cut -d : -f 1)
    echo $(($hit - 1))
}

# Main script body.
#  1. Read warc paths from stdin and index them in /dev/shm/rex/in.txt.
#  2. For every first path component (segment directory) seen, re-run
#     warc.sh on each listed warc file, appending its stderr to a per-file
#     log under /dev/shm/rex/<segment>/logs.

if [ $# -lt 1 ]
then
    echo "Usage: reExtract.sh 20..-.. < files..." >&2
    exit 2
fi

# Root of the crawl named by the first argument
h=/beegfs/common_crawl/CC-MAIN-$1

mkdir -p /dev/shm/rex
cd /dev/shm/rex || exit 1

# Split each input line into
#   s = everything before the first '/'
#   p = file name up to (not including) the first "-00"
#   i = what follows that "-00", minus the .warc.gz suffix
# and write one TAB-separated row per file:  edex-index  s  p  i
sed 's/\// /;s/-00/ /;s/\.warc\.gz$//' | \
 while read -r s p i
 do
    # edex takes the 5th '-'-separated field of its argument and strips a
    # leading "00", so hand it the reconstructed file name rather than the
    # already-shortened serial (which would turn e.g. 005 into 5).
    # NOTE(review): assumes edex was written for full warc names -- confirm
    printf '%s\t%s\t%s\t%s\n' "$(edex "$p-00$i.warc.gz")" "$s" "$p" "$i"
 done > in.txt

# One pass per distinct s; p and i are taken from each row below, so every
# file is processed once and under its own name.
cut -f 2 in.txt | sort -u | \
 while read -r s
 do
    mkdir -p "$s/logs"
    cd "$s" || continue
    # Sigh, shouldn't have used this in the extraction ...
    # Recover the job id embedded in the log names of the original run:
    # first listed entry under logs/, second path component, up to first '_'
    jobid=$(tar -tf "${h}/${s}/extract_0.tar" logs/ | head -n 1 | \
             cut -f 2 -d / | cut -f 1 -d _)
    # Exact, string-wise column matches (the appended "" forces string
    # comparison in awk) instead of regex matches on the whole row
    awk -F '\t' -v s="$s" '($2 "") == (s "") { print $1 }' ../in.txt | sort -u | \
     while read -r e # this could be parallel
     do
        awk -F '\t' -v e="$e" -v s="$s" \
            '($1 "") == (e "") && ($2 "") == (s "") { print $3 "\t" $4 }' ../in.txt | \
         while IFS=$'\t' read -r p i # this could be parallel
         do
            id=${p#CC-MAIN-}-00$i
            lf=logs/${jobid}_${i}_log
            unpigz -dp 1 -c "${h}/${s}/CC-MAIN-${id}.warc.gz" | \
                "$HOME/lib/valhalla/bin/warc.sh" "${id}" application/pdf 2>> "$lf"
         done
        # now compare (logs? ls vs. tar -tvf ?)
     done
    cd ..
 done