annotate bin/doHdr.sh @ 138:9ea12f7b304b

just barely working
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 23 Jul 2021 16:23:46 +0000
parents a96fb2c26c80
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
#!/usr/bin/bash
# doHdr.sh -- pull the *.hdr members out of one tar file of the
# CC-MAIN-2019-35 PDF collection and summarise them, one line per header
# file, into hdrs_<tfn>.tsv, which is then moved to the shared hdrs directory.
#
# Usage: doHdr.sh hn jn tfn
#   hn   label used to name the scratch area (/dev/shm/x<hn>) and in log
#        lines (presumably the host name -- confirm against the caller)
#   jn   job identifier; appears only in log messages
#   tfn  basename, without the .tar suffix, of the archive to process
#
# Each output line is: <sequence number> <X-HST-Target-URI value> <Last-Modified value>
# separated by single spaces.  Header files are numbered in the order given by
# sorting their names on the first '_'-separated field, then numerically on
# the second.
#
# Progress messages go to stderr; the final "finished" line goes to stdout.
#
# Provenance (from the hg annotate view this was recovered from): every line
# came from changeset 14:9a1de2c4ffe3 "modelled on plinks", except the
# per-file extraction loop, from changeset 15:a96fb2c26c80
# "works after minor tweaks".

hn=$1
jn=$2
tfn=$3

# An empty tfn would make the clean-up below run in the shared x<hn>
# directory, so refuse to start without all three arguments.
if [ "$#" -lt 3 ] || [ -z "$tfn" ]; then
  echo "usage: $0 hn jn tfn" 1>&2
  exit 2
fi

pdfs=/beegfs/common_crawl/CC-MAIN-2019-35/pdfs
work=/dev/shm/x$hn/$tfn
out=hdrs_$tfn.tsv

# Write a timestamped message to stderr.
log() {
  echo "$(date) $*" 1>&2
}

# Everything after this point (in particular the rm) assumes we are inside
# the scratch directory, so a failure here must stop the script.
mkdir -p "$work" || exit 1
cd "$work" || exit 1

# NOTE(review): GNU tar normally needs --wildcards for a member pattern such
# as '*.hdr' when extracting; the invocation is kept exactly as it was --
# confirm which tar (or TAR_OPTIONS) is in use on the cluster.
# A tar failure is reported but, as before, does not stop the job.
tar -xf "$pdfs/$tfn.tar" '*.hdr' ||
  log "$hn WARNING: tar exited with status $? on ${tfn}.tar for job $jn"
log "$hn $(pwd) untarred ${tfn}.tar for job $jn"

# Collect the header files with a glob rather than ls, so a large archive
# cannot overflow the argument-length limit of an external command.
shopt -s nullglob
hdr_files=(*.hdr)
shopt -u nullglob

# The redirection is on the whole 'if' so that the output file is created
# (possibly empty) even when no header files were extracted.
if [ "${#hdr_files[@]}" -gt 0 ]; then
  printf '%s\n' "${hdr_files[@]}" |
    sort --field-separator=_ -k1,1 -k2n,2 |
    cat -n |
    while read -r n f; do
      # Multiple matching header lines are joined with single spaces.
      # The values are quoted when printed so that '?' or '*' in a URI is
      # never treated as a glob pattern.
      uri=$(grep -Ea '^X-HST-Target-URI: ' "$f" | cut -f 2- -d ' ' | paste -s -d ' ' -)
      lastmod=$(grep -Eia '^Last-Modified: ' "$f" | cut -f 2- -d ' ' | paste -s -d ' ' -)
      printf '%s %s %s\n' "$n" "$uri" "$lastmod"
    done
fi >> "$out"

log "$hn moving $(( $(ls hdrs_*.tsv | wc -l) )) results from job $jn for $tfn in $(pwd)"

# The trailing slash makes mv fail if the hdrs directory is missing, instead
# of renaming the result to a plain file called "hdrs".  On failure the
# scratch files are left in place so the result is not lost.
if ! mv "$out" "$pdfs/hdrs/"; then
  log "$hn ERROR: could not move $out for job $jn; leaving $(pwd) in place"
  exit 1
fi

# Remove the extracted header files, then the now-empty scratch directory.
rm -f -- *
cd .. || exit 1
log "$(pwd) rmdir ${tfn}"
rmdir "$tfn"
echo "$(date) $hn finished job ${jn} for ${tfn}"