# HG changeset patch # User Henry S. Thompson # Date 1708014960 0 # Node ID 443b3a6f0b418ac0d061a042a7730f14a521be7d # Parent 1728baad654048e25b83973f36e96729b5ba5d04# Parent 0ffa655efc219b7d68621a5cee8ead1c27512e33 merge diff -r 1728baad6540 -r 443b3a6f0b41 bin/getcc_multi.aws --- a/bin/getcc_multi.aws Thu Feb 15 15:10:34 2024 +0000 +++ b/bin/getcc_multi.aws Thu Feb 15 16:36:00 2024 +0000 @@ -40,9 +40,8 @@ mkdir -p $s/orig/warc fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz || :) <(fgrep -w $s warc.paths) > /tmp/hst/$s -split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_ - -seq 1 $nthreads | while read i +split -a 2 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_ +printf "%02d\n" $(seq 1 $nthreads) | while read i do cat /tmp/hst/${s}_$i | { printf "thread\t%s\t%s\t%s\n" $i $$ $(ps -o pgid= -p "$$") >> errlog_${SEG}_$i @@ -57,5 +56,5 @@ sleep 30 } done -while pgrep -a aws |grep -c s3; do sleep 60; done +while pgrep -a aws |grep -c "aws s3.*${s}"; do sleep 60; done echo $(date) end $SEG diff -r 1728baad6540 -r 443b3a6f0b41 lib/python/cc/lmh/lmh.py --- a/lib/python/cc/lmh/lmh.py Thu Feb 15 15:10:34 2024 +0000 +++ b/lib/python/cc/lmh/lmh.py Thu Feb 15 16:36:00 2024 +0000 @@ -48,7 +48,11 @@ infile_pat='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%( CCdate, segment, filetype, fileno) - infile_name=glob.glob(infile_pat)[0] + try: + infile_name=glob.glob(infile_pat)[0] + except IndexError: + print(infile_pat,CCdate,segment,filetype,fileno,file=sys.stderr) + raise (_,_,_,_,_,_,_,ff)=infile_name.split('/')