Mercurial > hg > cc > cirrus_home
changeset 79:4b8e4e3d60eb
track redirects, need to use full crawldiagnostics.warc.gz for "location:" and "Uri:"
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 07 May 2020 18:47:24 +0100 |
parents | 846b38f8b204 |
children | f494df0d34aa |
files | bin/track.py |
diffstat | 1 files changed, 62 insertions(+), 24 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/track.py Thu May 07 11:33:24 2020 +0100 +++ b/bin/track.py Thu May 07 18:47:24 2020 +0100 @@ -4,7 +4,7 @@ import re,sys,glob,gzip,json,urllib.parse -CDX=re.compile("(.*)\)(.*) (\{.*\})$") +CDX=re.compile("(.*)\)(.*) (.*) (\{.*\})$") FAIL=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)") cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1] seg=sys.argv[2] @@ -31,42 +31,63 @@ class URI: uu={} - def __init__(self,s): + depth=0 + status='unknown' + def __init__(self,s,seed=False): self.s=s self.d_props=None # from cdx/crawldiagnostics self.w_props=None # from cdx/warc self.trunc=self.fail=None - self._host=self._path=None - self.seed=False + self._host=self._path=self._scheme=None + self.seed=seed self.uu[s]=self @property def host(self): if self._host is None: - (_,self._host,self._path,_,_,_)=urllib.urlparse.parse(self.s) + (self._scheme,self._host,self._path + ,_,_,_)=urllib.parse.urlparse(self.s) return self._host @property def path(self): if self._path is None: - (_,self._host,self._path,_,_,_)=urllib.parse(self.s) + (self._scheme,self._host,self._path, + _,_,_)=urllib.parse.urlparse(self.s) return self._path + @property + def scheme(self): + if self._scheme is None: + (self._scheme,self._host,self._path, + _,_,_)=urllib.parse.urlparse(self.s) + return self._scheme + @classmethod def get(cls,s): - return cls.uu.get(s,cls(s)) + try: + return cls.uu[s] + except KeyError: + return cls(s) def __repr__(self): - return "<U%s>[%s/%s]"%(self.typeString(),self.host,self.path) + prefix="%s://%s%s"%(self.scheme,self.host,self.path[:20]) + plen=len(prefix) + suffix=('' if len(self.s)<plen else + '...'+self.s[-min(len(self.s)-plen,10):]) + return "<U%s>%s[%s%s]"%(self.typeString(), + (lambda x:'' if x=='2' else x)(self.status[0]), + prefix,suffix) def typeString(self): - return "%s%s%s"%('s' if u.seed else '', - 'r' if self.w_props is not None else ( - 'd' if self.w_props is not None else ( 
- 'f' if self.fail is not None else '')), - '' if self.trunc is None else self.trunc[0]) + return "%s%s%s%s"%('s' if self.seed else '', + 'r' if self.w_props is not None else ( + 'd' if self.d_props is not None else ( + 'f' if self.fail is not None else '')), + '' if self.depth==0 else self.depth, + '' if self.trunc is None else self.trunc[0]) -def readCDX(files,where): +def readCDX(files,where,status=None): c=None res={} # Ref. https://github.com/ikreymer/webarchive-indexing @@ -76,13 +97,16 @@ try: for c in rf: r=CDX.match(c) - (host,path,props)=r.groups() + (rdom,path,seg,props)=r.groups() d=json.loads(props) uri=d["url"] u=URI.get(uri) u.__dict__[where]=d - u._host=host - u._path=path + u.rdomstr=rdom # domain, reverse order, comma separated + u.lcpath=path # path, lower-cased, maybe %decoded? + u.seg=seg # partial warc identifier? + if status is not None: + u.status=status res[uri]=u n+=1 except: @@ -94,15 +118,28 @@ seeds=0 for l in buf.readlines(): seeds+=1 - u=URI(l.rstrip()) - u.seed=True + u=URI(l.rstrip(),True) -print('post-seed',len(URI.uu),file=sys.stderr) -fetches=readCDX(wwff,'w_props') +print('post-seed',seeds,file=sys.stderr) +fetches=readCDX(wwff,'w_props',"200") print('post-fetch',len(URI.uu),file=sys.stderr) diags=readCDX(ddff,'d_props') print('post-diag',len(URI.uu),file=sys.stderr) +BORKED="borked" + +for u in diags.values(): + u.status=u.d_props["status"] + if u.status[0]=='3': + try: + loc=u.d_props["redirect"] + r=URI.get(loc) + r.depth=u.depth+1 + r.source=u + u.reloc=r + except KeyError: + u.reloc=BORKED # something went wrong somewhere... 
+ truncs=0 for l in wtf: if l.startswith('WARC-'): @@ -121,7 +158,7 @@ (u,m2)=r.groups() URI.get(u).fail=m2 -print(len(URI.uu),file=sys.stderr) +print('post-fail',len(URI.uu),file=sys.stderr) print("""For %s/%s: %4s requested @@ -132,5 +169,6 @@ """%(cc,seg,seeds,len(fetches),len(diags),fails,truncs), file=sys.stderr) -for u in URI.uu.values(): - print(u.typeString()) +if not sys.stdout.isatty(): + for u in URI.uu.values(): + print(u.typeString())