diff bin/reExtract.sh @ 39:822cfbf134d3

towards re-running extraction in part
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 02 Apr 2020 19:14:23 +0100
parents
children 0d1f6c971d5e
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/reExtract.sh	Thu Apr 02 19:14:23 2020 +0100
@@ -0,0 +1,30 @@
+#!/usr/bin/bash
+# Usage: reExtract.sh 20..-.. < files...
+# Input is a list of paths to warc files, each relative to
+#  /beegfs/common_crawl/CC-MAIN-$1
+
+function edex () {  # edex NAME: derive a key from NAME, then print (line number of the matching line in ~/by11n.txt) - 1
+    echo "$1" | \
+	cut -f 5 -d - | cut -f 1 -d .|sed 's/^00//' | \
+	echo $(($(fgrep -n -f - ~/by11n.txt | cut -f 1 -d :) - 1))  # the inner $(...) reads this stage's stdin, so "fgrep -f -" takes the key as its pattern; several matching lines would break the arithmetic
+    }  # key = 5th "-"-separated field of NAME, cut at the first ".", minus a leading "00"; cut passes input without the delimiter through whole
+
+h=/beegfs/common_crawl/CC-MAIN-$1  # crawl root selected by the first argument
+
+mkdir -p /dev/shm/rex  # working directory in /dev/shm
+cd /dev/shm/rex  # NOTE(review): result of cd is not checked before in.txt is written below
+
+sed 's/\// /;s/-00/ /;s/.warc.gz//' | \
+ while read s p i  # sed turns the first "/" and the first "-00" into spaces and drops the first match of .warc.gz (dots are unescaped); read splits the result into s, p, i
+ do
+    printf "%s\t%s\t%s\t%s\n" $(edex $i) $s $p $i  # tab-separated row: edex output, s, p, i; unquoted, so an empty edex result shifts the columns
+ done > in.txt  # rows land in in.txt in the current directory
+for s in $(cut -f 2 in.txt| sort -u)  # every distinct column-2 value of in.txt (the s field written above)
+do
+    mkdir -p $s
+    cd $s  # NOTE(review): no cd back is visible, so the in.txt below and any later $s resolve inside this directory
+    for e in $(egrep "\b$p\b" in.txt |cut -f 1 | sort -u)  # NOTE(review): p is only set by read inside the pipeline loop above, normally a subshell; was $s meant here? confirm
+    do
+	tar -xf $h/$s/extract_$e.tar  # NOTE(review): neither loop has a closing done in the visible lines
+	
+