annotate bin/hawk.sh @ 16:47ef882acbec

use awk to do a join between links and 1132dates
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 26 Feb 2020 21:50:25 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
16
47ef882acbec use awk to do a join between links and 1132dates
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
#!/bin/bash
# Usage: hawk.sh tarFileNumber
#
# Joins the member listing of $links/<tarFileNumber>.tar against
# $dates/date_<tarFileNumber>.tsv and writes the matching rows to
# $dates/join_<tarFileNumber>.tsv.
#
# A row of the date file matches when "links_<tarFileNumber>_<field 1>"
# equals the first whitespace-delimited field of some line of `tar -tf`.
# Each matching row is emitted followed by that constructed member name.
set -euo pipefail

if [[ $# -lt 1 || -z "${1:-}" ]]; then
  printf 'Usage: %s tarFileNumber\n' "${0##*/}" >&2
  exit 2
fi

pdfs=/beegfs/common_crawl/CC-MAIN-2019-35/pdfs
links=$pdfs/links
dates=$pdfs/1132dates
tfn=$1

# tfn is handed to awk with -v instead of being spliced into the program
# text, so its value can never be parsed as awk code.
#
# NOTE(review): `print $0, "\t", f` joins its three operands with OFS (a
# single space by default), so the separator actually written is
# space-TAB-space. Kept as-is because consumers of join_*.tsv are not
# visible from here -- confirm before tightening it to a bare tab.
awk -v tfn="$tfn" '
  # First input (the tar listing): remember every member name.
  NR == FNR { ll[$1] = 1; next }

  # Second input (the date file): keep rows whose derived member name
  # was seen in the listing.
  {
    f = "links_" tfn "_" $1
    if (f in ll) {
      print $0, "\t", f
    }
  }
' <(tar -tf "$links/$tfn.tar") "${dates}/date_$tfn.tsv" \
  > "${dates}/join_$tfn.tsv"