changeset 64:0520ee00e35b

misc
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 24 Apr 2020 19:57:16 +0100
parents d39fd9c7f1be
children e71aeb3355ff
files bin/splitTars.sh by11s.txt
diffstat 2 files changed, 69 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/splitTars.sh	Fri Apr 24 19:57:16 2020 +0100
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Repack each listed segment's extracts.tar as one extract_N.tar per by11s.txt row.
+for segid in $(cat /beegfs/common_crawl/CC-MAIN-2019-35/bigtar.txt); do
+  echo $segid starting
+  ccid=2019-35
+  h=/beegfs/common_crawl/CC-MAIN-${ccid}
+  mkdir -p /dev/shm/rex/${segid}
+  cd /dev/shm/rex/${segid} || exit 1  # never unpack or glob in the wrong directory
+  tar -xf ${h}/${segid}/extracts.tar
+  # pfx: '-'-separated fields 3,4 of the name of the segment's *-00000.warc.gz file
+  pfx=$(ls ${h}/${segid}/CC-MAIN-*-00000.warc.gz | cut -f 6 -d / | cut -f 3,4 -d -)
+  while read -r i j; do  # each row: first and last number (inclusive) for one archive
+    ((n=i/11))  # archive index; the patterns below stay unquoted so the shell globs them
+    tar -cf ${h}/${segid}/extract_${n}.tar \
+      $(for ((k=i; k<=j; k++)); do printf '%s-%05d_* logs/*_%03d_log\n' "$pfx" $k $k; done)
+  done < ../by11s.txt &&
+  echo $(date) $segid done
+done
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/by11s.txt	Fri Apr 24 19:57:16 2020 +0100
@@ -0,0 +1,51 @@
+0 10
+11 21
+22 32
+33 43
+44 54
+55 65
+66 76
+77 87
+88 98
+99 109
+110 120
+121 131
+132 142
+143 153
+154 164
+165 175
+176 186
+187 197
+198 208
+209 219
+220 230
+231 241
+242 252
+253 263
+264 274
+275 285
+286 296
+297 307
+308 318
+319 329
+330 340
+341 351
+352 362
+363 373
+374 384
+385 395
+396 406
+407 417
+418 428
+429 439
+440 450
+451 461
+462 472
+473 483
+484 494
+495 505
+506 516
+517 527
+528 538
+539 549
+550 559