Mercurial > hg > cc > cirrus_home
changeset 78:846b38f8b204
refactor, change summary print (problem?)
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 07 May 2020 11:33:24 +0100 |
parents | bfff01c139ea |
children | 4b8e4e3d60eb |
files | .Xauthority .hgignore bin/track.py |
diffstat | 3 files changed, 77 insertions(+), 22 deletions(-) [+] |
line wrap: on
line diff
--- a/.hgignore Wed May 06 18:28:52 2020 +0100 +++ b/.hgignore Thu May 07 11:33:24 2020 +0100 @@ -53,3 +53,4 @@ openjdk-8u252-b09 nutch-cc src/hadoop +.Xauthority
--- a/bin/track.py Wed May 06 18:28:52 2020 +0100 +++ b/bin/track.py Thu May 07 11:33:24 2020 +0100 @@ -2,7 +2,7 @@ '''Track a list of URIs through nutch results''' # Usage: track.py year-nn segmentid [file] -import re,sys,glob,gzip,json +import re,sys,glob,gzip,json,urllib.parse CDX=re.compile("(.*)\)(.*) (\{.*\})$") FAIL=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)") @@ -26,9 +26,49 @@ print("Usage: track.py year-nn segmentid [file]",file=sys.stderr) raise -def readCDX(files): +BU={} +U={} + +class URI: + uu={} + def __init__(self,s): + self.s=s + self.d_props=None # from cdx/crawldiagnostics + self.w_props=None # from cdx/warc + self.trunc=self.fail=None + self._host=self._path=None + self.seed=False + self.uu[s]=self + + @property + def host(self): + if self._host is None: + (_,self._host,self._path,_,_,_)=urllib.urlparse.parse(self.s) + return self._host + + @property + def path(self): + if self._path is None: + (_,self._host,self._path,_,_,_)=urllib.parse(self.s) + return self._path + + @classmethod + def get(cls,s): + return cls.uu.get(s,cls(s)) + + def __repr__(self): + return "<U%s>[%s/%s]"%(self.typeString(),self.host,self.path) + + def typeString(self): + return "%s%s%s"%('s' if u.seed else '', + 'r' if self.w_props is not None else ( + 'd' if self.w_props is not None else ( + 'f' if self.fail is not None else '')), + '' if self.trunc is None else self.trunc[0]) + +def readCDX(files,where): + c=None res={} - c=None # Ref. https://github.com/ikreymer/webarchive-indexing for resFileName in files: with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf: @@ -36,9 +76,14 @@ try: for c in rf: r=CDX.match(c) - (dom,path,props)=r.groups() + (host,path,props)=r.groups() d=json.loads(props) - res[d["url"]]=d + uri=d["url"] + u=URI.get(uri) + u.__dict__[where]=d + u._host=host + u._path=path + res[uri]=u n+=1 except: print(resFileName,n,c,file=sys.stderr) @@ -46,37 +91,46 @@ #print (n,len(res),file=sys.stderr) return res -fetches=readCDX(wwff) -diags=readCDX(ddff) -trunc={} +seeds=0 +for l in buf.readlines(): + seeds+=1 + u=URI(l.rstrip()) + u.seed=True + +print('post-seed',len(URI.uu),file=sys.stderr) +fetches=readCDX(wwff,'w_props') +print('post-fetch',len(URI.uu),file=sys.stderr) +diags=readCDX(ddff,'d_props') +print('post-diag',len(URI.uu),file=sys.stderr) + +truncs=0 for l in wtf: if l.startswith('WARC-'): (k,rest)=l.split(' ',1) if k=='WARC-Target-URI:': - uri=rest.rstrip() + uri=URI.uu[rest.rstrip()] # better be there... elif k=='WARC-Truncated:': - trunc[uri]=rest.rstrip() -bu=list(map(str.rstrip,buf.readlines())) + truncs+=1 + uri.trunc=rest.rstrip() -fails={} +fails=0 for l in log: r=FAIL.match(l) if r: + fails+=1 (u,m2)=r.groups() - fails[u]=m2 + URI.get(u).fail=m2 + +print(len(URI.uu),file=sys.stderr) print("""For %s/%s: %4s requested %4s retrieved + %4s diagnosed + %4s failed %4s truncated - %4s diagnosed - %4s failed"""%(cc,seg,len(bu),len(fetches),len(trunc),len(diags),len(fails)), +"""%(cc,seg,seeds,len(fetches),len(diags),fails,truncs), file=sys.stderr) -for u in bu: - sig=0 - sig+=8 if u in fetches else 0 - sig+=4 if u in diags else 0 - sig+=2 if u in fails else 0 - sig+=1 if u in trunc else 0 - print(format(sig,'04b')) +for u in URI.uu.values(): + print(u.typeString())