view bin/doHdr.sh @ 19:c858a4d4bd4f

copied from valhalla/bin
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 16 Mar 2020 15:57:23 +0000
parents a96fb2c26c80
children
line wrap: on
line source

#!/usr/bin/bash
# Build a TSV of selected header fields for one tar file of crawled PDFs.
#
# Usage: doHdr.sh HN JN TFN
#   HN   label used in the scratch path (/dev/shm/xHN) and in log lines
#        (presumably the host name -- confirm with the job launcher)
#   JN   job identifier; only used in log lines
#   TFN  base name of the tar file, i.e.
#        /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/TFN.tar
#
# The *.hdr members of the tar file are unpacked into /dev/shm/xHN/TFN,
# ordered by their '_'-separated name (field 1 as text, field 2 numerically)
# and numbered from 1.  For each one a line
#     <number> TAB <X-HST-Target-URI value> TAB <Last-Modified value>
# is appended to hdrs_TFN.tsv, which is then moved to
# /beegfs/common_crawl/CC-MAIN-2019-35/pdfs/hdrs before the scratch
# directory is emptied and removed.
#
# Progress messages go to stderr; the final "finished" line goes to stdout.
# Exit status: 2 on bad arguments, 1 if the scratch directory cannot be
# entered or the result cannot be moved, otherwise that of the final echo.

# An empty TFN would make the scratch directory /dev/shm/xHN itself, and the
# clean-up below would then delete files belonging to other jobs.
if (( $# < 3 )) || [[ -z $3 ]]; then
  printf 'usage: %s HN JN TFN\n' "${0##*/}" >&2
  exit 2
fi

hn=$1
jn=$2
tfn=$3

crawl_dir=/beegfs/common_crawl/CC-MAIN-2019-35/pdfs
work_dir=/dev/shm/x$hn/$tfn
tsv=hdrs_$tfn.tsv

# Print a timestamped message on stdout (callers redirect as needed).
log() {
  # $(date) is left unquoted on purpose: word splitting squeezes the padding
  # that date puts in front of single-digit days, keeping the log format
  # identical to what this script has always produced.
  # shellcheck disable=SC2046
  echo $(date) "$@"
}

# Log a message tagged with HN on stderr and abort.
die() {
  log "$hn" "$@" >&2
  exit 1
}

mkdir -p "$work_dir" || die "cannot create $work_dir for job $jn"
# Everything below, including the rm, relies on being inside the scratch dir.
cd "$work_dir" || die "cannot cd to $work_dir for job $jn"

# NOTE(review): GNU tar only treats member arguments as patterns when
# --wildcards is in effect; the invocation is kept as it was -- confirm how
# the tar on the compute nodes is configured.
if ! tar -xf "$crawl_dir/$tfn.tar" '*.hdr'; then
  # Best effort, as before: carry on and emit whatever was extracted.
  log "$hn" "$(pwd)" "tar reported errors for ${tfn}.tar, job $jn; continuing" >&2
fi
log "$hn" "$(pwd)" untarred "${tfn}.tar" for job "$jn" >&2

# Collect the header files with a glob rather than by parsing ls output.
shopt -s nullglob
hdr_files=(*.hdr)
shopt -u nullglob

n=0
while IFS= read -r f; do
  n=$((n + 1))
  # cut keeps everything after the first space, i.e. the header's value.
  uri=$(grep -E -a '^X-HST-Target-URI: ' -- "$f" | cut -f 2- -d ' ')
  modified=$(grep -E -i -a '^Last-Modified: ' -- "$f" | cut -f 2- -d ' ')
  # Tabs and newlines inside a value (e.g. several matching lines) become
  # spaces so a value can never add columns or rows to the TSV.
  uri=${uri//[$'\t\n']/ }
  modified=${modified//[$'\t\n']/ }
  printf '%s\t%s\t%s\n' "$n" "$uri" "$modified"
done < <(
  if (( ${#hdr_files[@]} > 0 )); then
    printf '%s\n' "${hdr_files[@]}" | sort --field-separator=_ -k1,1 -k2n,2
  fi
) >> "$tsv"

shopt -s nullglob
results=(hdrs_*.tsv)
shopt -u nullglob
log "$hn" moving "${#results[@]}" results from job "$jn" for "$tfn" in "$(pwd)" >&2

# Never clean up unless the result is safely on shared storage.
mv -- "$tsv" "$crawl_dir/hdrs" ||
  die "cannot move $tsv to $crawl_dir/hdrs for job $jn; leaving $(pwd) in place"

rm -- ./*
cd .. || die "cannot leave $work_dir for job $jn"
log "$(pwd)" rmdir "$tfn" >&2
rmdir -- "$tfn"
log "$hn" finished job "$jn" for "$tfn"