changeset 213:443b3a6f0b41

merge
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 15 Feb 2024 16:36:00 +0000
parents 1728baad6540 (current diff) 0ffa655efc21 (diff)
children 94072b090fdd
files
diffstat 2 files changed, 8 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/bin/getcc_multi.aws	Thu Feb 15 15:10:34 2024 +0000
+++ b/bin/getcc_multi.aws	Thu Feb 15 16:36:00 2024 +0000
@@ -40,9 +40,8 @@
 mkdir -p $s/orig/warc
 fgrep -v -f <(cd $s/orig/warc && ls *.warc.gz || :) <(fgrep -w $s warc.paths) > /tmp/hst/$s
 
-split -a 1 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_
-
-seq 1 $nthreads | while read i
+split -a 2 --numeric-suffixes=1 -n l/$nthreads /tmp/hst/$s /tmp/hst/${s}_
+printf "%02d\n" $(seq 1 $nthreads) | while read i
 do
   cat /tmp/hst/${s}_$i | {
   printf "thread\t%s\t%s\t%s\n" $i $$ $(ps -o pgid= -p "$$") >> errlog_${SEG}_$i
@@ -57,5 +56,5 @@
   sleep 30
   }
 done
-while pgrep -a aws |grep -c s3; do sleep 60; done
+while pgrep -a aws |grep -c "aws s3.*${s}"; do sleep 60; done
 echo $(date) end $SEG
--- a/lib/python/cc/lmh/lmh.py	Thu Feb 15 15:10:34 2024 +0000
+++ b/lib/python/cc/lmh/lmh.py	Thu Feb 15 16:36:00 2024 +0000
@@ -48,7 +48,11 @@
 
   infile_pat='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
     CCdate, segment, filetype, fileno)
-  infile_name=glob.glob(infile_pat)[0]
+  try:
+    infile_name=glob.glob(infile_pat)[0]
+  except IndexError:
+    print(infile_pat,CCdate,segment,filetype,fileno,file=sys.stderr)
+    raise
 
   (_,_,_,_,_,_,_,ff)=infile_name.split('/')