Mercurial > hg > cc > cirrus_home
diff bin/reExtract.sh @ 39:822cfbf134d3
towards re-running extraction in part
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 02 Apr 2020 19:14:23 +0100 |
parents | |
children | 0d1f6c971d5e |
line wrap: on
line diff
#!/usr/bin/bash
# Usage: reExtract.sh 20..-.. < files...
# Input (stdin) is a list of paths of warc files, relative to
#  /beegfs/common_crawl/CC-MAIN-$1
#
# Each input path is split at its first "/" and at its first "-00", and
#  ".warc.gz" is removed.  The three resulting pieces are called below
#   s  (the text before the first "/"),
#   p  (the middle part) and
#   i  (what is left after "-00").
# One line per input path is written to /dev/shm/rex/in.txt, with the
#  tab-separated columns:  edex(i)  s  p  i
# Then, for every distinct s, each distinct extract_<edex>.tar found under
#  /beegfs/common_crawl/CC-MAIN-$1/s is unpacked into /dev/shm/rex/s

# Print the 0-based line position in ~/by11n.txt of the line that contains
#  the file number derived from $1.
# $1 is either a bare number, or a "-"-separated name whose 5th field starts
#  with the number followed by "."; a leading "00" is dropped either way
#  (cut passes a line without any "-" through unchanged).
# Prints -1 when no line of ~/by11n.txt matches.
# NOTE(review): the match is a fixed-string *substring* match, so it is only
#  unambiguous if no entry of ~/by11n.txt contains another -- confirm against
#  the actual contents of that file.
function edex () {
  local key line
  key=$(printf '%s\n' "$1" | cut -f 5 -d - | cut -f 1 -d . | sed 's/^00//')
  line=$(grep -F -n -- "$key" ~/by11n.txt | cut -f 1 -d :)
  echo $((line - 1))
}

if [ $# -lt 1 ]
then
  echo "Usage: reExtract.sh 20..-.. < files..." >&2
  exit 2
fi

h=/beegfs/common_crawl/CC-MAIN-$1

mkdir -p /dev/shm/rex
cd /dev/shm/rex || exit 1

# The while loop is the last stage of a pipeline, so it runs in a subshell:
#  s, p and i are NOT visible after it, only in.txt is
sed 's/\// /;s/-00/ /;s/\.warc\.gz//' | \
  while read -r s p i
  do
    printf '%s\t%s\t%s\t%s\n' "$(edex "$i")" "$s" "$p" "$i"
  done > in.txt

# One pass per distinct s (column 2 of in.txt)
while IFS= read -r s
do
  mkdir -p -- "$s"
  # Column 1 (the edex value) of the rows whose column 2 is exactly $s.
  # Appending "" forces a string comparison in awk: s values may look like
  #  decimal numbers and must not be compared numerically.
  while IFS= read -r e
  do
    # -C unpacks into $s without changing this script's own directory, so
    #  in.txt stays reachable and the next s is not nested inside this one
    tar -C "$s" -xf "$h/$s/extract_$e.tar"
  done < <(awk -F '\t' -v seg="$s" '($2 "") == (seg "") { print $1 }' in.txt | sort -u)
done < <(cut -f 2 in.txt | sort -u)