Mercurial > hg > cc > valhalla
annotate bin/extract.sh @ 0:fdd3f8a16fd4 default tip
shared scripts on valhalla cluster
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Sat, 14 Mar 2020 11:00:58 +0000 |
parents | |
children |
rev | line source |
---|---|
0
fdd3f8a16fd4
shared scripts on valhalla cluster
Henry Thompson <ht@markup.co.uk>
parents:
diff
changeset
|
#!/bin/bash
# Stream Common Crawl CC-MAIN-2019-35 WARC files through warc.sh.
#
# Reads WARC ids from standard input, one per line.  For each id the
# archive /data/common_crawl/CC-MAIN-2019-35/CC-MAIN-<id>.warc.gz is
# decompressed with unpigz and piped into $SHARED/bin/warc.sh, which is
# invoked as:  warc.sh <id> application/pdf
#
# The script works inside $SHARED/data/<hostname> and appends a
# "starting"/"finished" line per id, plus warc.sh's stderr, to
# logs/<pid>_log in that directory.

me=$$                    # PID of this shell; used to name the log file
SHARED=/home/shared/ht

# Abort if the per-host working directory (or its logs/ subdirectory) is
# unavailable; otherwise logs and anything warc.sh writes relative to the
# current directory would land wherever the script happened to be started.
cd "$SHARED/data/$(hostname)" || exit 1
mkdir -p logs || exit 1
log=logs/${me}_log

# -r keeps backslashes in the input literal.  IFS is left at its default so
# leading/trailing whitespace is still trimmed from each id.
while read -r id
do
  # Skip blank lines: an empty id cannot name a WARC file, and would
  # otherwise be handed to warc.sh as a bogus argument.
  [ -n "$id" ] || continue

  # $(date) is intentionally left unquoted so the timestamp is written with
  # single spaces between fields, exactly as existing logs have it.
  echo starting "$id" $(date) >> "$log"

  # -p 1 restricts unpigz to one thread.  Note that the 2>> redirection
  # applies to warc.sh only; unpigz's own diagnostics go to this script's
  # stderr.
  unpigz -dp 1 -c "/data/common_crawl/CC-MAIN-2019-35/CC-MAIN-${id}.warc.gz" |
    "$SHARED/bin/warc.sh" "$id" application/pdf 2>> "$log"

  echo finished "$id" $(date) >> "$log"
done