comparison bin/track.py @ 80:f494df0d34aa

keep separate antecedents separate, buggy?
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 08 May 2020 19:52:36 +0100
parents 4b8e4e3d60eb
children fcb390b3ea55
comparison
equal deleted inserted replaced
79:4b8e4e3d60eb 80:f494df0d34aa
30 U={} 30 U={}
31 31
32 class URI: 32 class URI:
33 uu={} 33 uu={}
34 depth=0 34 depth=0
35 status='unknown'
36 def __init__(self,s,seed=False): 35 def __init__(self,s,seed=False):
37 self.s=s 36 self.s=s
38 self.d_props=None # from cdx/crawldiagnostics 37 self.d_props=None # from cdx/crawldiagnostics
39 self.w_props=None # from cdx/warc 38 self.w_props=None # from cdx/warc
40 self.trunc=self.fail=None 39 self.trunc=self.fail=None
80 prefix,suffix) 79 prefix,suffix)
81 80
82 def typeString(self): 81 def typeString(self):
83 return "%s%s%s%s"%('s' if self.seed else '', 82 return "%s%s%s%s"%('s' if self.seed else '',
84 'r' if self.w_props is not None else ( 83 'r' if self.w_props is not None else (
85 'd' if self.d_props is not None else ( 84 '' if self.d_props is None else (
86 'f' if self.fail is not None else '')), 85 ('l'+'.'.join(str(s[1]) for s in
87 '' if self.depth==0 else self.depth, 86 sorted(self.sources,key=lambda x:x[1])))
88 '' if self.trunc is None else self.trunc[0]) 87 if hasattr(self,'sources') else 'd'+self.status[0])),
88 '' if self.trunc is None else self.trunc[0],
89 '' if self.fail is None else 'F')
89 90
90 def readCDX(files,where,status=None): 91 def readCDX(files,where,status=None):
91 c=None 92 c=None
92 res={} 93 res={}
93 # Ref. https://github.com/ikreymer/webarchive-indexing 94 # Ref. https://github.com/ikreymer/webarchive-indexing
119 for l in buf.readlines(): 120 for l in buf.readlines():
120 seeds+=1 121 seeds+=1
121 u=URI(l.rstrip(),True) 122 u=URI(l.rstrip(),True)
122 123
123 print('post-seed',seeds,file=sys.stderr) 124 print('post-seed',seeds,file=sys.stderr)
125 sys.stderr.flush()
124 fetches=readCDX(wwff,'w_props',"200") 126 fetches=readCDX(wwff,'w_props',"200")
125 print('post-fetch',len(URI.uu),file=sys.stderr) 127 print('post-fetch',len(URI.uu),file=sys.stderr)
128 sys.stderr.flush()
126 diags=readCDX(ddff,'d_props') 129 diags=readCDX(ddff,'d_props')
127 print('post-diag',len(URI.uu),file=sys.stderr) 130 print('post-diag',len(URI.uu),file=sys.stderr)
131 sys.stderr.flush()
128 132
129 BORKED="borked" 133 BORKED="borked"
130 134
131 for u in diags.values(): 135 def maybeTrack(u,source=None,depth=0):
132 u.status=u.d_props["status"] 136 if not hasattr(u,'status'):
137 u.status='unknown' if u.d_props is None else u.d_props["status"]
138 if source is not None:
139 bptr=(source,depth+1)
140 if hasattr(u,'sources'):
141 u.sources.append(bptr)
142 else:
143 u.sources=[bptr]
133 if u.status[0]=='3': 144 if u.status[0]=='3':
134 try: 145 try:
135 loc=u.d_props["redirect"] 146 loc=u.d_props["redirect"]
136 r=URI.get(loc) 147 r=URI.get(loc)
137 r.depth=u.depth+1
138 r.source=u
139 u.reloc=r 148 u.reloc=r
149 maybeTrack(r,source=u,depth=depth+1)
140 except KeyError: 150 except KeyError:
141 u.reloc=BORKED # something went wrong somewhere... 151 u.reloc=BORKED # something went wrong somewhere...
152
153 for u in diags.values():
154 maybeTrack(u)
142 155
143 truncs=0 156 truncs=0
144 for l in wtf: 157 for l in wtf:
145 if l.startswith('WARC-'): 158 if l.startswith('WARC-'):
146 (k,rest)=l.split(' ',1) 159 (k,rest)=l.split(' ',1)
157 fails+=1 170 fails+=1
158 (u,m2)=r.groups() 171 (u,m2)=r.groups()
159 URI.get(u).fail=m2 172 URI.get(u).fail=m2
160 173
161 print('post-fail',len(URI.uu),file=sys.stderr) 174 print('post-fail',len(URI.uu),file=sys.stderr)
175 sys.stderr.flush()
162 176
163 print("""For %s/%s: 177 print("""For %s/%s:
164 %4s requested 178 %4s requested s
165 %4s retrieved 179 %4s retrieved r
166 %4s diagnosed 180 %4s diagnosed d/l
167 %4s failed 181 %4s failed F
168 %4s truncated 182 %4s truncated rd/rt
169 """%(cc,seg,seeds,len(fetches),len(diags),fails,truncs), 183 """%(cc,seg,seeds,len(fetches),len(diags),fails,truncs),
170 file=sys.stderr) 184 file=sys.stderr)
185 sys.stderr.flush()
171 186
172 if not sys.stdout.isatty(): 187 if not sys.stdout.isatty():
173 for u in URI.uu.values(): 188 for u in URI.uu.values():
174 print(u.typeString()) 189 print(u.typeString())