Mercurial > hg > cc > cirrus_home
diff bin/doHdr.sh @ 14:9a1de2c4ffe3
modelled on plinks
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 26 Feb 2020 15:47:20 +0000 |
parents | |
children | a96fb2c26c80 |
line wrap: on
line diff
#!/usr/bin/bash
# doHdr.sh -- tabulate selected header fields for one tarball of PDF records.
#
# Usage: doHdr.sh HOSTNAME JOBNUM TARNAME
#   HOSTNAME  tag for the scratch directory (/dev/shm/xHOSTNAME) and log lines
#   JOBNUM    job identifier; appears in log lines only
#   TARNAME   tarball basename (without .tar) under $pdf_dir
#
# Steps:
#   1. Extract the *.hdr members of TARNAME.tar into /dev/shm/xHOSTNAME/TARNAME.
#   2. Append one row per .hdr file to hdrs_TARNAME.tsv, TAB-separated:
#        sequence number, X-HST-Target-URI value, Last-Modified value.
#      Files are ordered by the part of the name before the first '_', then
#      numerically by the part after it.
#   3. Move the TSV into $pdf_dir/hdrs/ and remove the scratch directory.
#
# Progress messages go to stderr; the final "finished" line goes to stdout.
# Exit status: 0 on completion, 2 on bad usage, 1 if the scratch directory is
# unusable or the result file cannot be moved (scratch data is then kept).

hn=${1:-}
jn=${2:-}
tfn=${3:-}

# Without all three arguments the paths below collapse (e.g. /dev/shm/x/) and
# the cleanup step would delete files in the wrong place.
if [[ -z "$hn" || -z "$jn" || -z "$tfn" ]]; then
  printf 'Usage: %s hostname jobnum tarname\n' "${0##*/}" >&2
  exit 2
fi

readonly pdf_dir=/beegfs/common_crawl/CC-MAIN-2019-35/pdfs
readonly work_root="/dev/shm/x${hn}"
readonly work_dir="${work_root}/${tfn}"
readonly out="hdrs_${tfn}.tsv"

# Timestamped diagnostic on stderr.
log() { printf '%s\n' "$(date) $*" >&2; }

# Log an error and abort.
die() {
  log "$hn ERROR: $*"
  exit 1
}

# Print field 2 (space-delimited) of every line of file $1 that matches the
# grep arguments in $2..., joining multiple matches with single spaces.
# grep -a forces text mode so matching does not stop on binary content.
# NOTE(review): cut keeps only the first space-separated token of the value;
# confirm that is what is wanted for Last-Modified.
header_value() {
  local file=$1
  shift
  grep -a "$@" -- "$file" | cut -f 2 -d ' ' | paste -s -d ' ' -
}

mkdir -p -- "$work_dir" || die "cannot create $work_dir"
# Later steps run 'rm' on the current directory, so a failed cd must be fatal.
cd -- "$work_dir" || die "cannot cd to $work_dir"

# NOTE(review): GNU tar matches '*.hdr' as a pattern on extraction only when
# --wildcards is in effect; confirm the cluster's tar behaves as intended.
# A failed extraction is logged but not fatal: an empty TSV is still delivered.
tar -xf "${pdf_dir}/${tfn}.tar" '*.hdr' \
  || log "$hn WARNING: tar reported an error extracting ${tfn}.tar"
log "$hn $PWD untarred ${tfn}.tar for job $jn"

# From here on, globs that match nothing expand to nothing.
shopt -s nullglob

hdr_files=(*.hdr)
{
  if ((${#hdr_files[@]} > 0)); then
    printf '%s\n' "${hdr_files[@]}" \
      | sort --field-separator=_ -k1,1 -k2n,2
  fi
} | {
  n=0
  while IFS= read -r f; do
    n=$((n + 1))
    uri=$(header_value "$f" '^X-HST-Target-URI: ')
    last_mod=$(header_value "$f" -i '^Last-Modified: ')
    # printf, not echo: bash's echo leaves "\t" as two literal characters.
    printf '%s\t%s\t%s\n' "$n" "$uri" "$last_mod"
  done
} >>"$out"

results=(hdrs_*.tsv)
log "$hn moving ${#results[@]} results from job $jn for $tfn in $PWD"
# Trailing slash: fail rather than rename the file to 'hdrs' if the directory
# is missing. On failure keep the scratch data so the results are not lost.
mv -- "$out" "${pdf_dir}/hdrs/" \
  || die "cannot move $out to ${pdf_dir}/hdrs/; leaving $PWD in place"

# Normally nothing matches here because the TSV has just been moved; list any
# stray result files before they are deleted.
stale=(hdrs_*.tsv)
stale_listing=
if ((${#stale[@]} > 0)); then
  stale_listing=$(ls -lt -- "${stale[@]}" | paste -s -d ' ' -)
fi
log "$PWD rm${stale_listing:+ $stale_listing}"
# -f so an already-empty directory is not an error.
rm -f -- ./*

cd -- "$work_root" || die "cannot cd to $work_root"
log "$PWD rmdir ${tfn}"
rmdir -- "$tfn"

printf '%s\n' "$(date) $hn finished job ${jn} for ${tfn}"