# HG changeset patch # User Henry S. Thompson # Date 1588786132 -3600 # Node ID bfff01c139ea624a469f02d87580bdbd0abdc211 # Parent 6cf3dc7ff0227d7e59b9b25bc7893a3ac10f961f bare framework working diff -r 6cf3dc7ff022 -r bfff01c139ea bin/track.py --- a/bin/track.py Wed May 06 14:25:44 2020 +0100 +++ b/bin/track.py Wed May 06 18:28:52 2020 +0100 @@ -5,6 +5,7 @@ import re,sys,glob,gzip,json CDX=re.compile("(.*)\)(.*) (\{.*\})$") +FAIL=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)") cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1] seg=sys.argv[2] try: @@ -16,6 +17,9 @@ ddfp="%s/%s/cdx/crawldiagnostics/CC*.gz"%(cc,seg) wwff=glob.glob(wwfp) ddff=glob.glob(ddfp) + wtf=open("%s/%s/warc/warc/truncated.txt"%(cc,seg)) + buf=open("%s/%s/bu.txt"%(cc,seg)) + log=open("%s/%s/hadoop.log"%(cc,seg)) assert len(wwff)!=0,wwfp assert len(ddff)!=0,ddfp except: @@ -39,10 +43,40 @@ except: print(resFileName,n,c,file=sys.stderr) raise - print (n,len(res)) + #print (n,len(res),file=sys.stderr) return res fetches=readCDX(wwff) diags=readCDX(ddff) +trunc={} +for l in wtf: + if l.startswith('WARC-'): + (k,rest)=l.split(' ',1) + if k=='WARC-Target-URI:': + uri=rest.rstrip() + elif k=='WARC-Truncated:': + trunc[uri]=rest.rstrip() +bu=list(map(str.rstrip,buf.readlines())) +fails={} +for l in log: + r=FAIL.match(l) + if r: + (u,m2)=r.groups() + fails[u]=m2 +print("""For %s/%s: + %4s requested + %4s retrieved + %4s truncated + %4s diagnosed + %4s failed"""%(cc,seg,len(bu),len(fetches),len(trunc),len(diags),len(fails)), + file=sys.stderr) + +for u in bu: + sig=0 + sig+=8 if u in fetches else 0 + sig+=4 if u in diags else 0 + sig+=2 if u in fails else 0 + sig+=1 if u in trunc else 0 + print(format(sig,'04b'))