Mercurial > hg > cc > cirrus_work
changeset 213:443b3a6f0b41
merge
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 15 Feb 2024 16:36:00 +0000 |
parents | 1728baad6540 (current diff) 0ffa655efc21 (diff) |
children | 94072b090fdd |
files | |
diffstat | 2 files changed, 8 insertions(+), 5 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/getcc_multi.aws Thu Feb 15 15:10:34 2024 +0000 +++ b/bin/getcc_multi.aws Thu Feb 15 16:36:00 2024 +0000 @@ -40,9 +40,8 @@ mkdir -p $s/orig/warc fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz || :) <(fgrep -w $s warc.paths) > /tmp/hst/$s -split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_ - -seq 1 $nthreads | while read i +split -a 2 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_ +printf "%02d\n" $(seq 1 $nthreads) | while read i do cat /tmp/hst/${s}_$i | { printf "thread\t%s\t%s\t%s\n" $i $$ $(ps -o pgid= -p "$$") >> errlog_${SEG}_$i @@ -57,5 +56,5 @@ sleep 30 } done -while pgrep -a aws |grep -c s3; do sleep 60; done +while pgrep -a aws |grep -c "aws s3.*${s}"; do sleep 60; done echo $(date) end $SEG
--- a/lib/python/cc/lmh/lmh.py Thu Feb 15 15:10:34 2024 +0000 +++ b/lib/python/cc/lmh/lmh.py Thu Feb 15 16:36:00 2024 +0000 @@ -48,7 +48,11 @@ infile_pat='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%( CCdate, segment, filetype, fileno) - infile_name=glob.glob(infile_pat)[0] + try: + infile_name=glob.glob(infile_pat)[0] + except IndexError: + print(infile_pat,CCdata,segment,filetype,fileno,file=sys.stderr) + raise (_,_,_,_,_,_,_,ff)=infile_name.split('/')