comparison bin/hawk.sh @ 16:47ef882acbec

use awk to do a join between links and 1132dates
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 26 Feb 2020 21:50:25 +0000
parents
children
comparison
equal deleted inserted replaced
15:a96fb2c26c80 16:47ef882acbec
#!/bin/bash
# Usage: hawk.sh tarFileNumber
#
# Join the member listing of $links/<tarFileNumber>.tar with the rows of
# $dates/date_<tarFileNumber>.tsv.
#
# For every row of the date file, the name "links_<tarFileNumber>_<field 1>"
# is built; if a member with that name is listed in the tar archive, the row
# is written to $dates/join_<tarFileNumber>.tsv followed by that member name.
# Rows without a matching tar member are dropped.
#
# Exit status: 2 on missing argument, 1 on unreadable input, otherwise the
# exit status of awk.
set -euo pipefail

pdfs=/beegfs/common_crawl/CC-MAIN-2019-35/pdfs
links=$pdfs/links
dates=$pdfs/1132dates

if [[ $# -lt 1 || -z $1 ]]; then
  printf 'Usage: %s tarFileNumber\n' "${0##*/}" >&2
  exit 2
fi
tfn=$1

tar_file=$links/$tfn.tar
date_file=$dates/date_$tfn.tsv
join_file=$dates/join_$tfn.tsv

# Fail loudly before touching the output: a tar failure inside the process
# substitution below would not be noticed and would just yield an empty join.
for input in "$tar_file" "$date_file"; do
  if [[ ! -r $input ]]; then
    printf '%s: cannot read %s\n' "${0##*/}" "$input" >&2
    exit 1
  fi
done

# tfn is handed to awk with -v instead of being spliced into the program
# text, so the argument is never parsed as awk source.
# (No apostrophes inside the awk comments: the program is single-quoted.)
awk -v tfn="$tfn" '
  # First input (the tar listing): remember every member name.
  # If the listing is empty, date rows fall through this rule as well and
  # nothing is printed, so the join is simply empty.
  NR == FNR {
    ll[$1] = 1
    next
  }
  # Second input (the date file): keep rows whose derived member name
  # was seen in the tar listing.
  {
    f = "links_" tfn "_" $1
    if (f in ll) {
      # NOTE(review): with the default OFS this emits space, TAB, space
      # between the row and the member name. Kept as-is so existing
      # consumers of join_*.tsv see identical bytes -- confirm whether a
      # bare TAB was intended.
      print $0, "\t", f
    }
  }
' <(tar -tf "$tar_file") "$date_file" > "$join_file"