changeset 16:47ef882acbec

use awk to do a join between links and 1132dates
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 26 Feb 2020 21:50:25 +0000
parents a96fb2c26c80
children b976a7449d41
files bin/hawk.sh
diffstat 1 files changed, 16 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/hawk.sh	Wed Feb 26 21:50:25 2020 +0000
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Usage: hawk.sh tarFileNumber
+# Joins the member list of links/N.tar with 1132dates/date_N.tsv, writing 1132dates/join_N.tsv.
+pdfs=/beegfs/common_crawl/CC-MAIN-2019-35/pdfs
+links=$pdfs/links
+dates=$pdfs/1132dates
+# Abort with a usage message when the tar file number is missing or empty.
+tfn="${1:?Usage: hawk.sh tarFileNumber}"
+# tfn is handed to awk with -v rather than spliced into the program text.
+awk -v tfn="$tfn" '
+  # First input, the tar listing: remember every member name.
+  NR == FNR { ll[$1] = 1; next }
+  # Second input, the dates file: keep rows whose links_N_KEY name is a member.
+  { f = "links_" tfn "_" $1 }
+  f in ll { print $0, "\t", f }  # print joins with OFS, so the tab has a space on each side
+' <(tar -tf "$links/$tfn.tar") "$dates/date_$tfn.tsv" > "$dates/join_$tfn.tsv"