Mercurial > hg > cc > cirrus_home
changeset 77:bfff01c139ea
bare framework working
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 06 May 2020 18:28:52 +0100 |
parents | 6cf3dc7ff022 |
children | 846b38f8b204 |
files | bin/track.py |
diffstat | 1 files changed, 35 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/bin/track.py	Wed May 06 14:25:44 2020 +0100
+++ b/bin/track.py	Wed May 06 18:28:52 2020 +0100
@@ -5,6 +5,7 @@
 import re,sys,glob,gzip,json
 
 CDX=re.compile("(.*)\)(.*) (\{.*\})$")
+FAIL=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)")
 cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
 seg=sys.argv[2]
 try:
@@ -16,6 +17,9 @@
   ddfp="%s/%s/cdx/crawldiagnostics/CC*.gz"%(cc,seg)
   wwff=glob.glob(wwfp)
   ddff=glob.glob(ddfp)
+  wtf=open("%s/%s/warc/warc/truncated.txt"%(cc,seg))
+  buf=open("%s/%s/bu.txt"%(cc,seg))
+  log=open("%s/%s/hadoop.log"%(cc,seg))
   assert len(wwff)!=0,wwfp
   assert len(ddff)!=0,ddfp
 except:
@@ -39,10 +43,40 @@
     except:
       print(resFileName,n,c,file=sys.stderr)
       raise
-    print (n,len(res))
+    #print (n,len(res),file=sys.stderr)
   return res
 
 fetches=readCDX(wwff)
 diags=readCDX(ddff)
 
+trunc={}
+for l in wtf:
+  if l.startswith('WARC-'):
+    (k,rest)=l.split(' ',1)
+    if k=='WARC-Target-URI:':
+      uri=rest.rstrip()
+    elif k=='WARC-Truncated:':
+      trunc[uri]=rest.rstrip()
+bu=list(map(str.rstrip,buf.readlines()))
+fails={}
+for l in log:
+  r=FAIL.match(l)
+  if r:
+    (u,m2)=r.groups()
+    fails[u]=m2
+print("""For %s/%s:
+ %4s requested
+ %4s retrieved
+ %4s truncated
+ %4s diagnosed
+ %4s failed"""%(cc,seg,len(bu),len(fetches),len(trunc),len(diags),len(fails)),
+      file=sys.stderr)
+
+for u in bu:
+  sig=0
+  sig+=8 if u in fetches else 0
+  sig+=4 if u in diags else 0
+  sig+=2 if u in fails else 0
+  sig+=1 if u in trunc else 0
+  print(format(sig,'04b'))
 
```