Mercurial > hg > cc > cirrus_home
diff bin/track.py @ 80:f494df0d34aa
keep separate antecedents separate, buggy?
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 08 May 2020 19:52:36 +0100 |
parents | 4b8e4e3d60eb |
children | fcb390b3ea55 |
line wrap: on
line diff
--- a/bin/track.py Thu May 07 18:47:24 2020 +0100 +++ b/bin/track.py Fri May 08 19:52:36 2020 +0100 @@ -32,7 +32,6 @@ class URI: uu={} depth=0 - status='unknown' def __init__(self,s,seed=False): self.s=s self.d_props=None # from cdx/crawldiagnostics @@ -82,10 +81,12 @@ def typeString(self): return "%s%s%s%s"%('s' if self.seed else '', 'r' if self.w_props is not None else ( - 'd' if self.d_props is not None else ( - 'f' if self.fail is not None else '')), - '' if self.depth==0 else self.depth, - '' if self.trunc is None else self.trunc[0]) + '' if self.d_props is None else ( + ('l'+'.'.join(str(s[1]) for s in + sorted(self.sources,key=lambda x:x[1]))) + if hasattr(self,'sources') else 'd'+self.status[0])), + '' if self.trunc is None else self.trunc[0], + '' if self.fail is None else 'F') def readCDX(files,where,status=None): c=None @@ -121,25 +122,37 @@ u=URI(l.rstrip(),True) print('post-seed',seeds,file=sys.stderr) +sys.stderr.flush() fetches=readCDX(wwff,'w_props',"200") print('post-fetch',len(URI.uu),file=sys.stderr) +sys.stderr.flush() diags=readCDX(ddff,'d_props') print('post-diag',len(URI.uu),file=sys.stderr) +sys.stderr.flush() BORKED="borked" -for u in diags.values(): - u.status=u.d_props["status"] +def maybeTrack(u,source=None,depth=0): + if not hasattr(u,'status'): + u.status='unknown' if u.d_props is None else u.d_props["status"] + if source is not None: + bptr=(source,depth+1) + if hasattr(u,'sources'): + u.sources.append(bptr) + else: + u.sources=[bptr] if u.status[0]=='3': try: loc=u.d_props["redirect"] r=URI.get(loc) - r.depth=u.depth+1 - r.source=u u.reloc=r + maybeTrack(r,source=u,depth=depth+1) except KeyError: u.reloc=BORKED # something went wrong somewhere... 
+for u in diags.values(): + maybeTrack(u) + truncs=0 for l in wtf: if l.startswith('WARC-'): @@ -159,15 +172,17 @@ URI.get(u).fail=m2 print('post-fail',len(URI.uu),file=sys.stderr) +sys.stderr.flush() print("""For %s/%s: - %4s requested - %4s retrieved - %4s diagnosed - %4s failed - %4s truncated + %4s requested s + %4s retrieved r + %4s diagnosed d/l + %4s failed F + %4s truncated rd/rt """%(cc,seg,seeds,len(fetches),len(diags),fails,truncs), file=sys.stderr) +sys.stderr.flush() if not sys.stdout.isatty(): for u in URI.uu.values():