#!/bin/bash
# Usage: hawk.sh tarFileNumber
#
# Join the member listing of links/<tarFileNumber>.tar against
# 1132dates/date_<tarFileNumber>.tsv and write the result to
# 1132dates/join_<tarFileNumber>.tsv (all under the pdfs directory below).
#
# For each line of the date file, the key "links_<tarFileNumber>_<field 1>"
# is looked up among the tar member names; on a match the original line is
# written out followed by that key.
#
# Exit status: 0 on success, 2 on a usage error, non-zero if an input is
# missing/unreadable or if tar or awk fails.
#
# Provenance: Mercurial changeset 47ef882acbecb4658dfe363faebc96d460803128
# (parent a96fb2c26c8033ce48a23f24971a915afb97cdf1) by Henry S. Thompson,
# "use awk to do a join between links and 1132dates".

set -euo pipefail

# Print a diagnostic to stderr and abort.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

if (( $# < 1 )) || [[ -z "$1" ]]; then
  printf 'Usage: %s tarFileNumber\n' "${0##*/}" >&2
  exit 2
fi

readonly pdfs=/beegfs/common_crawl/CC-MAIN-2019-35/pdfs
readonly links="${pdfs}/links"
readonly dates="${pdfs}/1132dates"
readonly tfn=$1

readonly links_tar="${links}/${tfn}.tar"
readonly dates_tsv="${dates}/date_${tfn}.tsv"
readonly join_tsv="${dates}/join_${tfn}.tsv"

# Fail early instead of silently producing an empty join file.
[[ -r "${links_tar}" ]] || die "cannot read tar file: ${links_tar}"
[[ -r "${dates_tsv}" ]] || die "cannot read date file: ${dates_tsv}"

# tar feeds awk through a pipe (read by awk as "-") rather than a process
# substitution, so that a tar failure is reported via pipefail.
# tfn is handed to awk with -v instead of being spliced into the program
# text, so the argument can never alter the awk program itself.
tar -tf "${links_tar}" \
  | awk -v tfn="${tfn}" '
      # First input (the tar listing): remember every member name.
      NR == FNR {
        ll[$1] = 1
        next
      }
      # Second input (the date file): emit lines whose derived key is a
      # known tar member.
      {
        f = "links_" tfn "_" $1
        if (f in ll) {
          # NOTE(review): the comma-separated print inserts OFS (a space)
          # on each side of the tab. Kept unchanged because consumers of
          # join_*.tsv may depend on it -- confirm before tightening to
          # print $0 "\t" f
          print $0, "\t", f
        }
      }
    ' - "${dates_tsv}" > "${join_tsv}"