# HG changeset patch # User Henry S. Thompson # Date 1725285721 -3600 # Node ID 0c472ae05f71db4761bc0852d70693a73fa1d105 # Parent 64b7fb44e8dc92b7c8f50ee12973a615dddb42ba nearly finished downloading for now diff -r 64b7fb44e8dc -r 0c472ae05f71 lurid3/notes.txt --- a/lurid3/notes.txt Wed Aug 21 16:11:40 2024 +0100 +++ b/lurid3/notes.txt Mon Sep 02 15:02:01 2024 +0100 @@ -71,3 +71,53 @@ 15 15 +Fill a gap by downloading 2022-33 + + >: for s in 0; do ~/bin/getcc_multi.aws CC-MAIN-2022-33 $s 5; done > /tmp/hst/get_22-33_0.log & + 130 minutes... + >: for s in 1; do ~/bin/getcc_multi.aws CC-MAIN-2022-33 $s 10; done > /tmp/hst/get_22-33_1.log & + 59 minutes + +Another day to get to a quarter? + >: for s in {2..23}; do ~/bin/getcc_multi.aws CC-MAIN-2022-33 $s 10; done > /tmp/hst/get_22-33_2-23.log & + + +And finally 2015-35 +Fetched in just 2 chunks, 0-9 and 10-99, e.g. + >: for s in {10..99}; do ~/bin/getcc_multi.aws CC-MAIN-2015-35 $s 10; done > /tmp/hst/get_15-35_10-99.log & + +Much smaller. +Compare 2023-40, with 900 files per segment: + >: lss */orig/warc/*-0023?.* | cut -f 5 -d ' ' | stats + n = 1000 + min = 1.14775e+09 + max = 1.26702e+09 + sum = 1.20192e+12 + mean = 1.20192e+09 + sd = 2.26049e+07 + +with 2015-35, with 353 files per segment + >: lss */orig/warc/*-0023?-* | cut -f 5 -d ' ' | stats + n = 930 + min = 1.66471e+08 [bug?] 
+ max = 9.6322e+08 + sum = 8.54009e+11 + mean = 9.1829e+08 + sd = 8.48938e+07 + +The min files all come from segment 1440644060633.7, whose files are +_all_ small: + >: uz *00123-*.gz | wc -l + 12,759,931 +Compare to 1440644060103.8 + >: zcat *00123-*.gz | wc -l + 75,806,738 +Mystery + +Also faster +Compare 2022-33: + >: fgrep -h BST /tmp/hst/get_22-33_{2-23,24-49,50-99}.log | cut -f 1-7 -d ' ' | while read s; do if read e; then echo $((($(date --date="$e" +%s) - $(date --date="$s" +%s)) / 60)); fi; done | stats n min max mean sd + 98 19 256 75.1 25.2 +with 2015-35: + >: fgrep -h BST /tmp/hst/get_15-35_{0-9,10-99}.log | cut -f 1-7 -d ' ' | while read s; do if read e; then echo $((($(date --date="$e" +%s) - $(date --date="$s" +%s)) / 60)); fi; done | stats n min max mean sd + 95 15 40 32.4 2.90