diff bin/unfold.sh @ 18:6662a353379a

fix a mis-folded link file
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 27 Feb 2020 17:18:02 +0000
parents
children a82c325e8b32
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/unfold.sh	Thu Feb 27 17:18:02 2020 +0000
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Usage: unfold.sh links_...
+#
+# Rejoin the records of a link file whose lines were folded.
+# A record starts on the first input line and on every line whose first
+# field is annot or scrape. Any other line is a continuation: its first
+# field is appended, with nothing in between, to the second field of the
+# current record. Only the first two fields of a record start line are kept.
+# Records go to standard output, one per line.
+# With no file argument the awk program reads standard input.
+set -euo pipefail
+
+# The expansion below yields no word at all when the argument is missing or
+# empty, which keeps the stdin fallback, and exactly one word otherwise, so
+# a file name containing spaces is passed through intact.
+awk '
+  # Write out the record gathered so far.
+  # NOTE(review): the comma form of print inserts OFS on both sides of the
+  # tab, so fields are separated by space, tab, space. Left unchanged here,
+  # confirm what consumers expect before switching to a bare tab.
+  function emit() { print o, "\t", u }
+
+  # A new record begins on line 1 and on every annot or scrape line.
+  NR == 1 || $1 == "annot" || $1 == "scrape" {
+    if (NR > 1) emit()   # nothing gathered yet on line 1
+    o = $1
+    u = $2
+    next
+  }
+
+  # Continuation line: glue its first field onto the gathered second field.
+  { u = u $1 }
+
+  # Flush the final record, printing nothing at all for empty input.
+  END { if (NR > 0) emit() }
+' ${1:+"$1"}