Mercurial > hg > cc > cirrus_home
annotate bin/extract.sh @ 25:a82c325e8b32
(none)
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 18 Mar 2020 11:08:47 +0000 |
parents | c858a4d4bd4f |
children | 7a2bc060230d |
rev | line source |
---|---|
#!/bin/bash
# Usage: extract.sh ccid segid
#
# Reads ids from standard input, one per line.  For each id, decompresses
#   /beegfs/common-crawl/CC-MAIN-${ccid}/${segid}/CC-MAIN-${id}.warc.gz
# and pipes the result into $HOME/lib/valhalla/bin/warc.sh, which is invoked
# with the arguments "${id} application/pdf".
#
# Environment:
#   SHARED  the script works inside $SHARED/data/<hostname>
#
# Logging: start/finish timestamps for every id, plus the stderr of warc.sh,
# are appended to logs/<pid>_log inside that working directory.  The stdout
# of warc.sh and the stderr of unpigz are not redirected.

if (( $# < 2 )); then
  echo "Usage: ${0##*/} ccid segid" >&2
  exit 2
fi

ccid=$1
segid=$2
me=$$ # PID of this shell; keeps the log file name distinct per invocation

# Stop if the per-host directory is unavailable, so that logs/ is never
# created in whatever directory the caller happened to start from.
cd "$SHARED/data/$(hostname)" || exit 1
mkdir -p logs || exit 1
log="logs/${me}_log"

# -r: take each id literally (no backslash processing).
while read -r id; do
  echo "starting ${id} $(date)" >> "$log"
  # unpigz flags: -d decompress, -p 1 single process, -c write to stdout.
  # The 2>> applies to warc.sh only (last command of the pipeline).
  unpigz -dp 1 -c "/beegfs/common-crawl/CC-MAIN-${ccid}/${segid}/CC-MAIN-${id}.warc.gz" \
    | "$HOME/lib/valhalla/bin/warc.sh" "${id}" application/pdf 2>> "$log"
  echo "finished ${id} $(date)" >> "$log"
done