Mercurial > hg > cc > cirrus_home
annotate bin/hawk.sh @ 16:47ef882acbec
use awk to do a join between links and 1132dates
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 26 Feb 2020 21:50:25 +0000 |
parents | |
children |
rev | line source |
---|---|
16
47ef882acbec
use awk to do a join between links and 1132dates
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
#!/bin/bash
# Usage: hawk.sh tarFileNumber
#
# Joins the member listing of $links/<tarFileNumber>.tar against
# $dates/date_<tarFileNumber>.tsv and writes the matching rows to
# $dates/join_<tarFileNumber>.tsv.
#
# For every line of the date file, the candidate member name
# "links_<tarFileNumber>_<first field>" is built; if that name appears in
# the tar listing, the original line is emitted with the member name
# appended as an extra tab-separated column.
pdfs=/beegfs/common_crawl/CC-MAIN-2019-35/pdfs
links=$pdfs/links
dates=$pdfs/1132dates

if [ -z "${1:-}" ]; then
  echo "Usage: $0 tarFileNumber" >&2
  exit 2
fi
tfn=$1

tarfile="$links/$tfn.tar"
datefile="${dates}/date_$tfn.tsv"

# Check the inputs before the output redirection truncates any earlier result.
for input in "$tarfile" "$datefile"; do
  if [ ! -r "$input" ]; then
    echo "$0: cannot read $input" >&2
    exit 1
  fi
done

# tfn is handed to awk with -v instead of being spliced into the program
# text, so the argument is never parsed as awk source.
awk -v tfn="$tfn" '
  # First input (the tar listing): remember each member name.
  NR == FNR { ll[$1] = 1; next }

  # Second input (the date file): keep rows whose derived name is a member.
  {
    f = "links_" tfn "_" $1
    if (f in ll) {
      # Concatenate rather than using commas: commas would insert OFS
      # (a space) on both sides of the tab and corrupt the TSV fields.
      print $0 "\t" f
    }
  }
' <(tar -tf "$tarfile") "$datefile" > "${dates}/join_$tfn.tsv"