annotate bin/unfold.sh @ 18:6662a353379a

fix a mis-folded link file
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 27 Feb 2020 17:18:02 +0000
parents
children a82c325e8b32
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
6662a353379a fix a mis-folded link file
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
#!/bin/bash
# Usage: unfold.sh links_...
#
# Repair a mis-folded link file: a record is a line whose first field is
# "annot" or "scrape" followed by a second field (presumably a URL -- confirm
# against the link-file format).  Any other line is taken to be the tail of
# the preceding record that was wrongly split off onto its own line; its
# first field is glued back onto the end of that record's second field.
# The repaired records are written to stdout, one per line, as
#   <first field> TAB <second field>

#######################################
# Re-join folded records of one link file.
# Arguments: $1 - path of the link file to repair
# Outputs:   repaired records on stdout, tab-separated
# Returns:   exit status of awk
#######################################
unfold_links() {
  awk '
    # Write the record collected so far.  printf is used instead of
    # print kind, "\t", url because the commas in print insert OFS
    # (a space) on both sides of the tab.
    function emit() { printf "%s\t%s\n", kind, url }

    # The very first line, and every later annot/scrape line, opens a
    # new record; flush the previous one first (there is none on line 1).
    NR == 1 || $1 == "annot" || $1 == "scrape" {
      if (NR > 1) emit()
      kind = $1
      url = $2
      next
    }

    # Anything else is a folded continuation: append its first field,
    # with no separator, to the second field of the open record.
    { url = url $1 }

    # Flush the last record, but print nothing for an empty input.
    END { if (NR > 0) emit() }
  ' "$1"
}

#######################################
# Entry point: validate the command line and repair the named file.
# Arguments: $1 - link file; further arguments are ignored
# Outputs:   repaired records on stdout, usage message on stderr
# Returns:   2 when no file is given, otherwise the status of unfold_links
#######################################
main() {
  if (( $# < 1 )); then
    printf 'Usage: %s links_...\n' "${0##*/}" >&2
    return 2
  fi
  unfold_links "$1"
}

main "$@"