Mercurial > hg > cc > cirrus_home
view bin/reExtract.sh @ 41:64227ff87e4e
done through re-extraction, fixing tars still to come
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 03 Apr 2020 17:35:17 +0100 |
parents | 0d1f6c971d5e |
children | 8ff97ea0ba2c |
line wrap: on
line source
#!/usr/bin/bash
# Usage: reExtract.sh 20..-.. < files...
#
# Runs warc.sh (application/pdf) over a chosen set of WARC files of one crawl.
#
# Argument:
#   $1  crawl suffix; files are read from /beegfs/common_crawl/CC-MAIN-$1
# Standard input:
#   One WARC path per line, relative to the crawl directory.  Each line is
#   split at its first "/" into <segment> and the rest; the rest is split at
#   its first "-00" into <prefix> and <index>, and ".warc.gz" is dropped.
#   The file opened for a line is
#     <segment>/CC-MAIN-<prefix>-00<index>.warc.gz
# Working files (all under /dev/shm/rex):
#   in.txt                 tab-separated: edex index, segment, prefix, index
#   <segment>/logs/<jobid>_<index>_log
#                          stderr of warc.sh, appended
# Standard output:
#   Whatever warc.sh writes to its stdout.

if (( $# < 1 )) || [[ -z $1 ]]; then
  echo "Usage: ${0##*/} 20..-.. < files..." >&2
  exit 2
fi

# edex NAME
# Reduces NAME to a lookup key (5th "-"-separated field, text before the
# first ".", one leading "00" removed; a NAME without "-" or "." passes
# through those steps unchanged) and prints the zero-based number of the
# line of ~/by11n.txt that contains the key as a fixed string.
# Prints -1 when no line matches.  Prints nothing and returns 1 when more
# than one line matches, since no single index can be given.
edex() {
  local key lineno
  key=$(printf '%s\n' "$1" | cut -f 5 -d - | cut -f 1 -d . | sed 's/^00//')
  # grep -F replaces the obsolescent fgrep; matching is still by substring.
  lineno=$(grep -F -n -e "$key" ~/by11n.txt | cut -f 1 -d :)
  if [[ -n $lineno && ! $lineno =~ ^[0-9]+$ ]]; then
    echo "edex: '$key' matches more than one line of ~/by11n.txt" >&2
    return 1
  fi
  echo $((lineno - 1))
}

h=/beegfs/common_crawl/CC-MAIN-$1
work=/dev/shm/rex

mkdir -p "$work"
# Everything below uses paths relative to $work, so a failed cd is fatal.
cd "$work" || exit 1

# Phase 1: turn the input paths into the in.txt table.
sed 's/\// /;s/-00/ /;s/\.warc\.gz//' |
  while read -r s p i; do
    if [[ -z $s || -z $p || -z $i ]]; then
      echo "reExtract: skipping malformed input line: '$s $p $i'" >&2
      continue
    fi
    # Quoted so that an empty edex result leaves an empty first column
    # instead of shifting the other columns left.
    printf '%s\t%s\t%s\t%s\n' "$(edex "$i")" "$s" "$p" "$i"
  done > in.txt

# Phase 2: one pass per segment, processing every row of that segment with
# the prefix and index recorded on that row.
cut -f 2 in.txt | sort -u |
  while read -r s; do
    mkdir -p "$s/logs"
    # Subshell: the cd must not leak into the next segment's iteration.
    (
      cd "$s" || exit 1
      # Sigh, shouldn't have used this in the extraction ...
      # The job id is taken from the name of the first entry under logs/ in
      # the segment's extract_0.tar: second path component, text before the
      # first "_".
      jobid=$(tar -tf "${h}/${s}/extract_0.tar" logs/ | head -1 |
        cut -f 2 -d / | cut -f 1 -d _)
      if [[ -z $jobid ]]; then
        echo "reExtract: no job id found for segment $s" >&2
      fi
      # Exact string match on the segment column only; the appended ""
      # forces awk to compare as strings even when the names look numeric.
      awk -F '\t' -v seg="$s" '$2 "" == seg "" { print $3 "\t" $4 }' \
        "$work/in.txt" |
        while read -r p i; do
          id=${p}-00$i
          lf=logs/${jobid}_${i}_log
          unpigz -dp 1 -c "${h}/${s}/CC-MAIN-${id}.warc.gz" |
            "$HOME/lib/valhalla/bin/warc.sh" "${id}" application/pdf 2>> "$lf"
        done
    )
  done