annotate bin/extract.sh @ 26:a9a9f6f1832e
finally hacked something that works
author |
Henry S. Thompson <ht@inf.ed.ac.uk> |
date |
Wed, 18 Mar 2020 13:42:47 +0000 |
parents |
a82c325e8b32 |
children |
7a2bc060230d |
rev |
line source |
#!/bin/bash
# Usage: extract.sh ccid segid < id-list
#
# Reads ids from standard input, one per line.  For each id, decompresses
#   /beegfs/common-crawl/CC-MAIN-<ccid>/<segid>/CC-MAIN-<id>.warc.gz
# with a single unpigz thread and pipes the result into
#   $HOME/lib/valhalla/bin/warc.sh <id> application/pdf
# warc.sh's stdout is left on this script's stdout; its stderr is appended
# to the per-process log.  unpigz's stderr is not redirected.
#
# Environment:
#   SHARED  root of the shared area; the script works in
#           $SHARED/data/<hostname> and logs to logs/<pid>_log there.
#
# Exit status: 2 on bad usage, 1 if the working directory or the logs
# directory cannot be set up.  A failure while processing a single id does
# not stop the loop.

if (( $# < 2 )); then
  printf 'Usage: %s ccid segid\n' "${0##*/}" >&2
  exit 2
fi

ccid=$1
segid=$2
me=$$   # this shell's PID, used to give each running instance its own log

# Abort with a message if SHARED is unset or empty; otherwise the cd below
# would silently target /data/<hostname>.
: "${SHARED:?SHARED must be set}"

workdir="${SHARED}/data/$(hostname)"
if ! cd "${workdir}"; then
  printf '%s: cannot cd to %s\n' "${0##*/}" "${workdir}" >&2
  exit 1
fi
mkdir -p logs || exit 1

log="logs/${me}_log"

# Default IFS is kept so surrounding whitespace on each id is still trimmed;
# -r stops read from interpreting backslashes.
while read -r id; do
  # A blank line would otherwise run the pipeline on CC-MAIN-.warc.gz.
  [[ -n "${id}" ]] || continue

  echo "starting ${id} $(date)" >> "${log}"
  unpigz -dp 1 -c \
    "/beegfs/common-crawl/CC-MAIN-${ccid}/${segid}/CC-MAIN-${id}.warc.gz" \
    | "${HOME}/lib/valhalla/bin/warc.sh" "${id}" application/pdf 2>> "${log}"
  echo "finished ${id} $(date)" >> "${log}"
done