Mercurial > hg > cc > cirrus_home
view bin/track.py @ 79:4b8e4e3d60eb
track redirects, need to use full crawldiagnostics.warc.gz for "location:" and "Uri:"
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 07 May 2020 18:47:24 +0100 |
parents | 846b38f8b204 |
children | f494df0d34aa |
line wrap: on
line source
#!/lustre/sw/miniconda3/bin/python3
'''Track a list of URIs through nutch results

Reads, for one segment of one crawl:
  * bu.txt                       -- the seed URIs, one per line
  * cdx/warc/CC*.gz              -- CDX index lines for fetched pages
  * cdx/crawldiagnostics/CC*.gz  -- CDX index lines for diagnostic records
  * warc/warc/truncated.txt      -- WARC-Target-URI / WARC-Truncated headers
  * hadoop.log                   -- fetcher log, scanned for failed fetches

A summary goes to stderr; if stdout is not a terminal, one type string
per known URI (see URI.typeString) is written to stdout.
'''
# Usage: track.py year-nn segmentid [file]

import re
import sys
import glob
import gzip
import json
import urllib.parse

# One CDX index line: <reversed domain>)<path> <segment id> <JSON properties>
# Raw strings keep the regex escapes from being read as (invalid) string escapes.
CDX = re.compile(r"(.*)\)(.*) (.*) (\{.*\})$")
# A failed-fetch line in hadoop.log; the leading dots skip a fixed-width prefix.
# NOTE(review): whitespace inside this pattern is reproduced as it appears in
# this view of the file -- confirm against real log lines.
FAIL = re.compile(r"....................... INFO fetcher\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)")

try:
    # Argument access lives inside the guard so that a missing argument
    # also produces the usage line before the IndexError propagates.
    cc = "/beegfs/common_crawl/CC-MAIN-%s" % sys.argv[1]
    seg = sys.argv[2]
    if len(sys.argv) == 4:
        uuf = open(sys.argv[3])
    else:
        uuf = sys.stdin
    # NOTE(review): uuf is opened here but not read anywhere in this file view.
    wwfp = "%s/%s/cdx/warc/CC*.gz" % (cc, seg)
    ddfp = "%s/%s/cdx/crawldiagnostics/CC*.gz" % (cc, seg)
    wwff = glob.glob(wwfp)
    ddff = glob.glob(ddfp)
    wtf = open("%s/%s/warc/warc/truncated.txt" % (cc, seg))
    buf = open("%s/%s/bu.txt" % (cc, seg))
    log = open("%s/%s/hadoop.log" % (cc, seg))
    assert len(wwff) != 0, wwfp
    assert len(ddff) != 0, ddfp
except Exception:
    print("Usage: track.py year-nn segmentid [file]", file=sys.stderr)
    raise

# NOTE(review): BU and U are not referenced anywhere else in this file view.
BU = {}
U = {}


class URI:
    '''A URI string plus whatever the crawl artefacts say about it.

    Every instance registers itself in the class-level dict ``uu`` under
    its URI string; use ``URI.get`` to look up or create.
    '''
    uu = {}             # all instances, keyed by URI string
    depth = 0           # default; a redirect target gets its source's depth + 1
    status = 'unknown'  # default until a CDX record supplies one

    def __init__(self, s, seed=False):
        '''Create and register the URI ``s``; ``seed`` marks entries from bu.txt.'''
        self.s = s
        self.d_props = None  # from cdx/crawldiagnostics
        self.w_props = None  # from cdx/warc
        self.trunc = self.fail = None
        self._host = self._path = self._scheme = None
        self.seed = seed
        self.uu[s] = self

    def _parse(self):
        '''Split self.s once and cache scheme, host and path.'''
        (self._scheme, self._host, self._path,
         _, _, _) = urllib.parse.urlparse(self.s)

    @property
    def host(self):
        '''Network-location part of the URI (parsed lazily).'''
        if self._host is None:
            self._parse()
        return self._host

    @property
    def path(self):
        '''Path part of the URI (parsed lazily).'''
        if self._path is None:
            self._parse()
        return self._path

    @property
    def scheme(self):
        '''Scheme part of the URI (parsed lazily).'''
        if self._scheme is None:
            self._parse()
        return self._scheme

    @classmethod
    def get(cls, s):
        '''Return the registered URI for ``s``, creating it if necessary.'''
        try:
            return cls.uu[s]
        except KeyError:
            return cls(s)

    def __repr__(self):
        '''Abbreviated form: type string, status digit (omitted for 2xx),
        scheme://host plus up to 20 characters of path, and, when the URI
        is longer than that, '...' followed by up to its last 10 characters.'''
        prefix = "%s://%s%s" % (self.scheme, self.host, self.path[:20])
        remainder = len(self.s) - len(prefix)
        # remainder == 0 must yield no suffix: s[-0:] would be the WHOLE string.
        if remainder <= 0:
            suffix = ''
        else:
            suffix = '...' + self.s[-min(remainder, 10):]
        lead = self.status[0]
        return "<U%s>%s[%s%s]" % (self.typeString(),
                                  '' if lead == '2' else lead,
                                  prefix, suffix)

    def typeString(self):
        '''Compact classification of this URI.

        Concatenates: 's' if a seed; then 'r' if a cdx/warc record was seen,
        else 'd' if a crawldiagnostics record was seen, else 'f' if a fetch
        failure was logged; then the redirect depth if non-zero; then the
        first character of the truncation reason, if any.
        '''
        if self.w_props is not None:
            kind = 'r'
        elif self.d_props is not None:
            kind = 'd'
        elif self.fail is not None:
            kind = 'f'
        else:
            kind = ''
        return "%s%s%s%s" % ('s' if self.seed else '',
                             kind,
                             '' if self.depth == 0 else self.depth,
                             '' if self.trunc is None else self.trunc[0])


def readCDX(files, where, status=None):
    '''Load gzipped CDX index files into URI objects.

    files  -- iterable of paths to gzip-compressed CDX files
    where  -- attribute name ('w_props' or 'd_props' in this script) under
              which each line's parsed JSON properties are stored
    status -- if not None, assigned to the ``status`` of every URI read

    Returns a dict mapping URI string to URI object for every line read.
    On any error the file name, the count of lines processed so far in
    that file and the offending line are printed to stderr, and the
    error is re-raised.
    '''
    c = None
    res = {}
    # Ref. https://github.com/ikreymer/webarchive-indexing
    for resFileName in files:
        with gzip.open(resFileName, mode='rt', encoding='utf-8') as rf:
            n = 0
            try:
                for c in rf:
                    r = CDX.match(c)
                    (rdom, path, segid, props) = r.groups()
                    d = json.loads(props)
                    uri = d["url"]
                    u = URI.get(uri)
                    setattr(u, where, d)
                    u.rdomstr = rdom  # domain, reverse order, comma separated
                    u.lcpath = path   # path, lower-cased, maybe %decoded?
                    u.seg = segid     # partial warc identifier?
                    if status is not None:
                        u.status = status
                    res[uri] = u
                    n += 1
            except BaseException:
                # Report where we were for anything at all (interrupts
                # included), then let the error propagate unchanged.
                print(resFileName, n, c, file=sys.stderr)
                raise
        #print (n,len(res),file=sys.stderr)
    return res


# Seeds: one URI per line of bu.txt, each registered as a seed.
seeds = 0
for line in buf:
    seeds += 1
    URI(line.rstrip(), True)

print('post-seed', seeds, file=sys.stderr)
fetches = readCDX(wwff, 'w_props', "200")
print('post-fetch', len(URI.uu), file=sys.stderr)
diags = readCDX(ddff, 'd_props')
print('post-diag', len(URI.uu), file=sys.stderr)

BORKED = "borked"

# Take each diagnosed URI's status from its record and, for 3xx, link it
# to its redirect target (one level deeper than the source).
for u in diags.values():
    u.status = u.d_props["status"]
    if u.status[0] == '3':
        try:
            loc = u.d_props["redirect"]
        except KeyError:
            u.reloc = BORKED  # something went wrong somewhere...
        else:
            r = URI.get(loc)
            r.depth = u.depth + 1
            r.source = u
            u.reloc = r

# truncated.txt: a WARC-Target-URI line names the URI that a following
# WARC-Truncated line applies to.
truncs = 0
for line in wtf:
    if line.startswith('WARC-'):
        (k, rest) = line.split(' ', 1)
        if k == 'WARC-Target-URI:':
            uri = URI.uu[rest.rstrip()]  # better be there...
        elif k == 'WARC-Truncated:':
            truncs += 1
            uri.trunc = rest.rstrip()

# hadoop.log: record the failure reason against each URI whose fetch failed.
fails = 0
for line in log:
    r = FAIL.match(line)
    if r:
        fails += 1
        (u, m2) = r.groups()
        URI.get(u).fail = m2

print('post-fail', len(URI.uu), file=sys.stderr)

# NOTE(review): the message text is reproduced exactly as it appears in this
# view; the original may have had line breaks between the counts -- confirm.
print("""For %s/%s: %4s requested %4s retrieved %4s diagnosed %4s failed %4s truncated """
      % (cc, seg, seeds, len(fetches), len(diags), fails, truncs),
      file=sys.stderr)

if not sys.stdout.isatty():
    for u in URI.uu.values():
        print(u.typeString())