view bin/hawk.sh @ 129:b51d65ed6c89

improve error handling
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 09 Jul 2021 13:45:43 +0000
parents 47ef882acbec
children
line wrap: on
line source

#!/bin/bash
# Usage: hawk.sh tarFileNumber
#
# For tar file number N, writes ${dates}/join_N.tsv containing every line of
# ${dates}/date_N.tsv whose first field F corresponds to a member named
# links_N_F in ${links}/N.tar; the matching member name is appended to the
# line.
#
# Exit status: 2 on a usage error, 1 if an input cannot be read or listed,
# otherwise the exit status of awk.
set -u

pdfs=/beegfs/common_crawl/CC-MAIN-2019-35/pdfs
links=$pdfs/links
dates=$pdfs/1132dates

if [[ -z "${1:-}" ]]; then
  printf 'Usage: %s tarFileNumber\n' "${0##*/}" >&2
  exit 2
fi
tfn=$1

tar_file=$links/$tfn.tar
date_file=$dates/date_$tfn.tsv
join_file=$dates/join_$tfn.tsv

# Check the date file before the output redirection below truncates any
# existing join file.
if [[ ! -r "$date_file" ]]; then
  printf '%s: cannot read %s\n' "${0##*/}" "$date_file" >&2
  exit 1
fi

# The member list goes to a temporary file rather than a process
# substitution so that a tar failure is detected here instead of being
# silently treated as an empty archive.
members=$(mktemp) || {
  printf '%s: mktemp failed\n' "${0##*/}" >&2
  exit 1
}
trap 'rm -f -- "$members"' EXIT

if ! tar -tf "$tar_file" > "$members"; then
  printf '%s: cannot list members of %s\n' "${0##*/}" "$tar_file" >&2
  exit 1
fi

# tfn is handed to awk with -v instead of being spliced into the program text.
# FILENAME == ARGV[1] identifies the member list even when that list is empty;
# the NR == FNR test would then mistake the date file for the member list.
awk -v tfn="$tfn" '
  FILENAME == ARGV[1] {
    # First input: remember the first field of each tar member name.
    ll[$1] = 1
    next
  }
  {
    # Second input: keep lines whose derived member name was listed.
    f = "links_" tfn "_" $1
    if (f in ll) {
      # The commas insert OFS (a space by default) on both sides of the
      # tab; this is kept unchanged so the output format stays the same.
      print $0, "\t", f
    }
  }
' "$members" "$date_file" > "$join_file"