view bin/unfold.sh @ 188:0c5422df3a67

runs, but no cdx yet, because no value.content I presume
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 23 Sep 2024 19:18:36 +0100
parents a82c325e8b32
children
line wrap: on
line source

#!/bin/bash
# Usage: unfold.sh links_...
#
# Re-joins ("unfolds") link records whose value was wrapped over several
# lines.  A line whose first field is "annot" or "scrape" starts a new
# record (first field = tag, second field = start of the value); the first
# field of every other line is appended, with no separator, to the value of
# the record currently being collected.  The very first input line always
# starts a record, whatever its first field is.
#
# Output: one "<tag><TAB><value>" line per record on stdout.
# Input:  the file named by $1, or stdin when no argument is given.

# Data locations; not used by the unfolding step below.
pdfs=/beegfs/common_crawl/CC-MAIN-2019-35/pdfs
links=$pdfs/links
f=$1
# NOTE: the shell IFS has no influence on awk, which splits fields on runs
# of blanks (its default FS) in the program below.
IFS=$'\t'

#######################################
# Unfold the records of one input.
# Arguments: $1 - path of the file to read; empty or absent means stdin
# Outputs:   unfolded, tab-separated records on stdout
# Returns:   exit status of awk
#######################################
unfold() {
  local input=${1-}
  awk '
    # The first line seeds the pending record unconditionally.
    NR == 1 { tag = $1; url = $2; next }

    # A tagged line closes the pending record and opens a new one.
    $1 == "annot" || $1 == "scrape" {
      # Concatenate with an explicit tab: comma-separated print arguments
      # would be joined with OFS and put a space on each side of the tab.
      print tag "\t" url
      tag = $1
      url = $2
      next
    }

    # Any other line is a folded continuation of the pending value.
    { url = url $1 }

    # Flush the last record, but only if there was any input at all.
    END { if (NR > 0) print tag "\t" url }
  ' ${input:+"$input"}   # quoted operand; omitted entirely when empty
}

unfold "$f"