changeset 80:f494df0d34aa

keep separate antecedents separate, buggy?
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 08 May 2020 19:52:36 +0100
parents 4b8e4e3d60eb
children fcb390b3ea55
files bin/track.py
diffstat 1 files changed, 29 insertions(+), 14 deletions(-) [+]
line wrap: on
line diff
--- a/bin/track.py	Thu May 07 18:47:24 2020 +0100
+++ b/bin/track.py	Fri May 08 19:52:36 2020 +0100
@@ -32,7 +32,6 @@
 class URI:
     uu={}
     depth=0
-    status='unknown'
     def __init__(self,s,seed=False):
         self.s=s
         self.d_props=None # from cdx/crawldiagnostics
@@ -82,10 +81,12 @@
     def typeString(self):
         return "%s%s%s%s"%('s' if self.seed else '',
                            'r' if self.w_props is not None else (
-                               'd' if self.d_props is not None else (
-                                   'f' if self.fail is not None else '')),
-                           '' if self.depth==0 else self.depth,
-                           '' if self.trunc is None else self.trunc[0])
+                               '' if self.d_props is None else (
+                           ('l'+'.'.join(str(s[1]) for s in
+                                        sorted(self.sources,key=lambda x:x[1])))
+                                if hasattr(self,'sources') else 'd'+self.status[0])),
+                           '' if self.trunc is None else self.trunc[0],
+                           '' if self.fail is None else 'F')
 
 def readCDX(files,where,status=None):
     c=None
@@ -121,25 +122,37 @@
     u=URI(l.rstrip(),True)
 
 print('post-seed',seeds,file=sys.stderr)
+sys.stderr.flush()
 fetches=readCDX(wwff,'w_props',"200")
 print('post-fetch',len(URI.uu),file=sys.stderr)
+sys.stderr.flush()
 diags=readCDX(ddff,'d_props')
 print('post-diag',len(URI.uu),file=sys.stderr)
+sys.stderr.flush()
 
 BORKED="borked"
 
-for u in diags.values():
-    u.status=u.d_props["status"]
+def maybeTrack(u,source=None,depth=0):
+    if not hasattr(u,'status'):
+        u.status='unknown' if u.d_props is None else u.d_props["status"]
+    if source is not None:
+        bptr=(source,depth+1)
+        if hasattr(u,'sources'):
+            u.sources.append(bptr)
+        else:
+            u.sources=[bptr]
     if u.status[0]=='3':
         try:
             loc=u.d_props["redirect"]
             r=URI.get(loc)
-            r.depth=u.depth+1
-            r.source=u
             u.reloc=r
+            maybeTrack(r,source=u,depth=depth+1)
         except KeyError:
             u.reloc=BORKED # something went wrong somewhere...
 
+for u in diags.values():
+    maybeTrack(u)
+
 truncs=0
 for l in wtf:
     if l.startswith('WARC-'):
@@ -159,15 +172,17 @@
         URI.get(u).fail=m2
 
 print('post-fail',len(URI.uu),file=sys.stderr)
+sys.stderr.flush()
 
 print("""For %s/%s:
- %4s requested
- %4s retrieved
- %4s diagnosed
- %4s failed
- %4s truncated
+ %4s requested s
+ %4s retrieved r
+ %4s diagnosed d/l
+ %4s failed F
+ %4s truncated rd/rt
 """%(cc,seg,seeds,len(fetches),len(diags),fails,truncs),
       file=sys.stderr)
+sys.stderr.flush()
 
 if not sys.stdout.isatty():
     for u in URI.uu.values():