# HG changeset patch
# User Henry S. Thompson
# Date 1587754636 -3600
# Node ID 0520ee00e35b61472971f1fe4aa00be31982c1bb
# Parent  d39fd9c7f1be7847354516b1ade82994e6b0f6d1
misc

diff -r d39fd9c7f1be -r 0520ee00e35b bin/splitTars.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/splitTars.sh	Fri Apr 24 19:57:16 2020 +0100
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Split each big segment's extracts.tar into smaller archives, one per
+# group of file indices listed in by11s.txt.
+#
+# For every segment id in bigtar.txt, extracts.tar is unpacked into a
+# scratch directory under /dev/shm/rex and one extract_<n>.tar per
+# by11s.txt line (n = first index / 11) is written next to it.
+# by11s.txt holds "first last" index pairs and is read from /dev/shm/rex.
+# NOTE(review): the scratch directories are never removed here -- confirm
+# that something else cleans up /dev/shm/rex.
+ccid=2019-35
+h=/beegfs/common_crawl/CC-MAIN-${ccid}
+scratch=/dev/shm/rex
+for segid in $(cat "${h}/bigtar.txt")
+do
+  echo "$segid" starting
+  mkdir -p "${scratch}/${segid}"
+  # Skip the segment rather than unpack or glob in the wrong directory.
+  cd "${scratch}/${segid}" ||
+    { echo "$segid: cannot cd to scratch directory" >&2; continue; }
+  tar -xf "${h}/${segid}/extracts.tar" ||
+    { echo "$segid: cannot unpack extracts.tar" >&2; continue; }
+  # Member-name prefix: dash-separated fields 3 and 4 of the name of the
+  # segment's -00000 WARC file.
+  warcs=( "${h}/${segid}"/CC-MAIN-*-00000.warc.gz )
+  [ -e "${warcs[0]}" ] ||
+    { echo "$segid: no CC-MAIN-*-00000.warc.gz found" >&2; continue; }
+  pfx=$(basename "${warcs[0]}" | cut -f 3,4 -d -)
+  while read -r i j
+  do
+    [[ -n $i && -n $j ]] || continue
+    n=$((i/11))
+    # Build the member patterns for indices i..j.  The index is formatted
+    # in this shell, not in a child "bash -c" whose quoted script would
+    # have $k expanded away before printf ever saw it.
+    members=()
+    for ((k=i; k<=j; k++))
+    do
+      printf -v data '%s-%05d_*' "$pfx" "$k"
+      printf -v log 'logs/*_%03d_log' "$k"
+      # Unquoted on purpose: the patterns must glob in the scratch dir.
+      members+=( $data $log )
+    done
+    tar -cf "${h}/${segid}/extract_${n}.tar" "${members[@]}"
+  done < ../by11s.txt &&
+    echo $(date) "$segid" done
+done
diff -r d39fd9c7f1be -r 0520ee00e35b by11s.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/by11s.txt	Fri Apr 24 19:57:16 2020 +0100
@@ -0,0 +1,51 @@
+0 10
+11 21
+22 32
+33 43
+44 54
+55 65
+66 76
+77 87
+88 98
+99 109
+110 120
+121 131
+132 142
+143 153
+154 164
+165 175
+176 186
+187 197
+198 208
+209 219
+220 230
+231 241
+242 252
+253 263
+264 274
+275 285
+286 296
+297 307
+308 318
+319 329
+330 340
+341 351
+352 362
+363 373
+374 384
+385 395
+396 406
+407 417
+418 428
+429 439
+440 450
+451 461
+462 472
+473 483
+484 494
+495 505
+506 516
+517 527
+528 538
+539 549
+550 559