comparison bin/reExtract.sh @ 39:822cfbf134d3

towards re-running extraction in part
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 02 Apr 2020 19:14:23 +0100
parents
children 0d1f6c971d5e
comparison
equal deleted inserted replaced
38:03abcdce54a0 39:822cfbf134d3
1 #!/usr/bin/bash
2 # Usage: reExtract.sh 20..-.. < files...
3 # Input is a list of relative paths to warc files, each path taken
4 # relative to /beegfs/common_crawl/CC-MAIN-$1
5
6 function edex () {
7 echo "$1" | \
8 cut -f 5 -d - | cut -f 1 -d .|sed 's/^00//' | \
9 echo $(($(fgrep -n -f - ~/by11n.txt | cut -f 1 -d :) - 1))
10 }
11
# Root directory of the crawl selected by the first argument; abort with
# the usage line when the argument is missing instead of building the
# path "CC-MAIN-".
h=/beegfs/common_crawl/CC-MAIN-${1:?Usage: reExtract.sh 20..-.. < files...}

# Work in a scratch directory. Stop if it cannot be entered, so that
# in.txt is never written into an unrelated directory.
mkdir -p /dev/shm/rex
cd /dev/shm/rex || exit 1

# Split each input path into three fields at its first "/" and first
# "-00", dropping the ".warc.gz" suffix, then write one tab-separated
# record per path to in.txt: the edex result followed by the three fields.
sed 's/\// /;s/-00/ /;s/\.warc\.gz//' | \
  while read -r s p i
  do
    # Quoted so that an empty value cannot shift the remaining columns.
    printf "%s\t%s\t%s\t%s\n" "$(edex "$i")" "$s" "$p" "$i"
  done > in.txt
22 for s in $(cut -f 2 in.txt| sort -u)
23 do
24 mkdir -p $s
25 cd $s
26 for e in $(egrep "\b$p\b" in.txt |cut -f 1 | sort -u)
27 do
28 tar -xf $h/$s/extract_$e.tar
29
30