view bin/track.py @ 78:846b38f8b204
refactor, change summary print (problem?)
author    Henry S. Thompson <ht@inf.ed.ac.uk>
date      Thu, 07 May 2020 11:33:24 +0100
parents   bfff01c139ea
children  4b8e4e3d60eb
line source
#!/lustre/sw/miniconda3/bin/python3
'''Track a list of URIs through nutch results'''
# Usage: track.py year-nn segmentid [file]

import re,sys,glob,gzip,json,urllib.parse

# CDXJ index line: SURT host key, ')', path and capture timestamp, JSON properties
CDX=re.compile(r"(.*)\)(.*) (\{.*\})$")
# Nutch fetcher log line for a failed fetch: captures the URI and the failure message
FAIL=re.compile(r"....................... INFO fetcher\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)")

cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
seg=sys.argv[2]

try:
  if len(sys.argv)==4:
    uuf=open(sys.argv[3])
  else:
    uuf=sys.stdin
  wwfp="%s/%s/cdx/warc/CC*.gz"%(cc,seg)
  ddfp="%s/%s/cdx/crawldiagnostics/CC*.gz"%(cc,seg)
  wwff=glob.glob(wwfp)
  ddff=glob.glob(ddfp)
  wtf=open("%s/%s/warc/warc/truncated.txt"%(cc,seg))
  buf=open("%s/%s/bu.txt"%(cc,seg))
  log=open("%s/%s/hadoop.log"%(cc,seg))
  assert len(wwff)!=0,wwfp
  assert len(ddff)!=0,ddfp
except:
  print("Usage: track.py year-nn segmentid [file]",file=sys.stderr)
  raise

BU={}
U={}

class URI:
  uu={}                        # registry of every URI seen, keyed by its string
  def __init__(self,s):
    self.s=s
    self.d_props=None          # from cdx/crawldiagnostics
    self.w_props=None          # from cdx/warc
    self.trunc=self.fail=None
    self._host=self._path=None
    self.seed=False
    self.uu[s]=self

  @property
  def host(self):
    if self._host is None:
      (_,self._host,self._path,_,_,_)=urllib.parse.urlparse(self.s)
    return self._host

  @property
  def path(self):
    if self._path is None:
      (_,self._host,self._path,_,_,_)=urllib.parse.urlparse(self.s)
    return self._path

  @classmethod
  def get(cls,s):
    # Return the existing URI for s if there is one, otherwise create
    # (and register) a new one
    u=cls.uu.get(s)
    return u if u is not None else cls(s)

  def __repr__(self):
    return "<U%s>[%s/%s]"%(self.typeString(),self.host,self.path)

  def typeString(self):
    # s: seed; r: retrieved (warc cdx), d: diagnosed (crawldiagnostics cdx),
    # f: failed fetch; plus the first letter of any WARC-Truncated reason
    return "%s%s%s"%('s' if self.seed else '',
                     'r' if self.w_props is not None else (
                       'd' if self.d_props is not None else (
                         'f' if self.fail is not None else '')),
                     '' if self.trunc is None else self.trunc[0])

def readCDX(files,where):
  '''Read gzipped CDXJ index files, storing each record's JSON properties
     on the corresponding URI object under the attribute named by `where`'''
  c=None
  res={}
  # Ref. https://github.com/ikreymer/webarchive-indexing
  for resFileName in files:
    with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf:
      n=0
      try:
        for c in rf:
          r=CDX.match(c)
          (host,path,props)=r.groups()
          d=json.loads(props)
          uri=d["url"]
          u=URI.get(uri)
          u.__dict__[where]=d
          u._host=host
          u._path=path
          res[uri]=u
          n+=1
      except:
        print(resFileName,n,c,file=sys.stderr)
        raise
  #print (n,len(res),file=sys.stderr)
  return res

# Seed URIs, one per line
seeds=0
for l in buf.readlines():
  seeds+=1
  u=URI(l.rstrip())
  u.seed=True
print('post-seed',len(URI.uu),file=sys.stderr)

fetches=readCDX(wwff,'w_props')
print('post-fetch',len(URI.uu),file=sys.stderr)
diags=readCDX(ddff,'d_props')
print('post-diag',len(URI.uu),file=sys.stderr)

# Truncation records from the WARC headers collected in truncated.txt
truncs=0
for l in wtf:
  if l.startswith('WARC-'):
    (k,rest)=l.split(' ',1)
    if k=='WARC-Target-URI:':
      uri=URI.uu[rest.rstrip()] # better be there...
    elif k=='WARC-Truncated:':
      truncs+=1
      uri.trunc=rest.rstrip()

# Fetch failures from the hadoop log
fails=0
for l in log:
  r=FAIL.match(l)
  if r:
    fails+=1
    (u,m2)=r.groups()
    URI.get(u).fail=m2

print(len(URI.uu),file=sys.stderr)
print("""For %s/%s:
 %4s requested
 %4s retrieved
 %4s diagnosed
 %4s failed
 %4s truncated
"""%(cc,seg,seeds,len(fetches),len(diags),fails,truncs),
      file=sys.stderr)

for u in URI.uu.values():
  print(u.typeString())
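
For reference, the CDX regexp above matches CDXJ-style index lines (cf. the webarchive-indexing link in readCDX): a SURT host key, a ')' separator, the path and capture timestamp, and a trailing JSON block. A minimal sketch of that parsing on a made-up record; the URL and field values here are hypothetical, not taken from any real index file:

import re, json

CDX = re.compile(r"(.*)\)(.*) (\{.*\})$")

# Hypothetical CDXJ line, for illustration only
sample = 'com,example)/index.html 20200507113324 {"url": "http://example.com/index.html", "status": "200"}'

host, path, props = CDX.match(sample).groups()
d = json.loads(props)

print(host)      # com,example  (SURT-form host key, before the ')')
print(path)      # /index.html 20200507113324  (path plus capture timestamp)
print(d["url"])  # http://example.com/index.html

This is the per-record work readCDX does: the JSON block supplies the original URL used to look up or create the URI object, and the whole property dict is attached as either w_props or d_props.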