Mercurial > hg > cc > cirrus_home
comparison bin/track.py @ 80:f494df0d34aa
keep separate antecedents separate, buggy?
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 08 May 2020 19:52:36 +0100 |
parents | 4b8e4e3d60eb |
children | fcb390b3ea55 |
comparison
equal
deleted
inserted
replaced
79:4b8e4e3d60eb | 80:f494df0d34aa |
---|---|
30 U={} | 30 U={} |
31 | 31 |
32 class URI: | 32 class URI: |
33 uu={} | 33 uu={} |
34 depth=0 | 34 depth=0 |
35 status='unknown' | |
36 def __init__(self,s,seed=False): | 35 def __init__(self,s,seed=False): |
37 self.s=s | 36 self.s=s |
38 self.d_props=None # from cdx/crawldiagnostics | 37 self.d_props=None # from cdx/crawldiagnostics |
39 self.w_props=None # from cdx/warc | 38 self.w_props=None # from cdx/warc |
40 self.trunc=self.fail=None | 39 self.trunc=self.fail=None |
80 prefix,suffix) | 79 prefix,suffix) |
81 | 80 |
82 def typeString(self): | 81 def typeString(self): |
83 return "%s%s%s%s"%('s' if self.seed else '', | 82 return "%s%s%s%s"%('s' if self.seed else '', |
84 'r' if self.w_props is not None else ( | 83 'r' if self.w_props is not None else ( |
85 'd' if self.d_props is not None else ( | 84 '' if self.d_props is None else ( |
86 'f' if self.fail is not None else '')), | 85 ('l'+'.'.join(str(s[1]) for s in |
87 '' if self.depth==0 else self.depth, | 86 sorted(self.sources,key=lambda x:x[1]))) |
88 '' if self.trunc is None else self.trunc[0]) | 87 if hasattr(self,'sources') else 'd'+self.status[0])), |
88 '' if self.trunc is None else self.trunc[0], | |
89 '' if self.fail is None else 'F') | |
89 | 90 |
90 def readCDX(files,where,status=None): | 91 def readCDX(files,where,status=None): |
91 c=None | 92 c=None |
92 res={} | 93 res={} |
93 # Ref. https://github.com/ikreymer/webarchive-indexing | 94 # Ref. https://github.com/ikreymer/webarchive-indexing |
119 for l in buf.readlines(): | 120 for l in buf.readlines(): |
120 seeds+=1 | 121 seeds+=1 |
121 u=URI(l.rstrip(),True) | 122 u=URI(l.rstrip(),True) |
122 | 123 |
123 print('post-seed',seeds,file=sys.stderr) | 124 print('post-seed',seeds,file=sys.stderr) |
125 sys.stderr.flush() | |
124 fetches=readCDX(wwff,'w_props',"200") | 126 fetches=readCDX(wwff,'w_props',"200") |
125 print('post-fetch',len(URI.uu),file=sys.stderr) | 127 print('post-fetch',len(URI.uu),file=sys.stderr) |
128 sys.stderr.flush() | |
126 diags=readCDX(ddff,'d_props') | 129 diags=readCDX(ddff,'d_props') |
127 print('post-diag',len(URI.uu),file=sys.stderr) | 130 print('post-diag',len(URI.uu),file=sys.stderr) |
131 sys.stderr.flush() | |
128 | 132 |
129 BORKED="borked" | 133 BORKED="borked" |
130 | 134 |
131 for u in diags.values(): | 135 def maybeTrack(u,source=None,depth=0): |
132 u.status=u.d_props["status"] | 136 if not hasattr(u,'status'): |
137 u.status='unknown' if u.d_props is None else u.d_props["status"] | |
138 if source is not None: | |
139 bptr=(source,depth+1) | |
140 if hasattr(u,'sources'): | |
141 u.sources.append(bptr) | |
142 else: | |
143 u.sources=[bptr] | |
133 if u.status[0]=='3': | 144 if u.status[0]=='3': |
134 try: | 145 try: |
135 loc=u.d_props["redirect"] | 146 loc=u.d_props["redirect"] |
136 r=URI.get(loc) | 147 r=URI.get(loc) |
137 r.depth=u.depth+1 | |
138 r.source=u | |
139 u.reloc=r | 148 u.reloc=r |
149 maybeTrack(r,source=u,depth=depth+1) | |
140 except KeyError: | 150 except KeyError: |
141 u.reloc=BORKED # something went wrong somewhere... | 151 u.reloc=BORKED # something went wrong somewhere... |
152 | |
153 for u in diags.values(): | |
154 maybeTrack(u) | |
142 | 155 |
143 truncs=0 | 156 truncs=0 |
144 for l in wtf: | 157 for l in wtf: |
145 if l.startswith('WARC-'): | 158 if l.startswith('WARC-'): |
146 (k,rest)=l.split(' ',1) | 159 (k,rest)=l.split(' ',1) |
157 fails+=1 | 170 fails+=1 |
158 (u,m2)=r.groups() | 171 (u,m2)=r.groups() |
159 URI.get(u).fail=m2 | 172 URI.get(u).fail=m2 |
160 | 173 |
161 print('post-fail',len(URI.uu),file=sys.stderr) | 174 print('post-fail',len(URI.uu),file=sys.stderr) |
175 sys.stderr.flush() | |
162 | 176 |
163 print("""For %s/%s: | 177 print("""For %s/%s: |
164 %4s requested | 178 %4s requested s |
165 %4s retrieved | 179 %4s retrieved r |
166 %4s diagnosed | 180 %4s diagnosed d/l |
167 %4s failed | 181 %4s failed F |
168 %4s truncated | 182 %4s truncated rd/rt |
169 """%(cc,seg,seeds,len(fetches),len(diags),fails,truncs), | 183 """%(cc,seg,seeds,len(fetches),len(diags),fails,truncs), |
170 file=sys.stderr) | 184 file=sys.stderr) |
185 sys.stderr.flush() | |
171 | 186 |
172 if not sys.stdout.isatty(): | 187 if not sys.stdout.isatty(): |
173 for u in URI.uu.values(): | 188 for u in URI.uu.values(): |
174 print(u.typeString()) | 189 print(u.typeString()) |