comparison bin/track.py @ 81:fcb390b3ea55

improved F handling/logging
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sat, 09 May 2020 16:16:28 +0100
parents f494df0d34aa
children (none)
diff -r f494df0d34aa -r fcb390b3ea55 bin/track.py
--- a/bin/track.py
+++ b/bin/track.py
@@ -3,11 +3,14 @@
 # Usage: track.py year-nn segmentid [file]
 
 import re,sys,glob,gzip,json,urllib.parse
 
 CDX=re.compile("(.*)\)(.*) (.*) (\{.*\})$")
-FAIL=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)")
+FAIL1a=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>Http code=[^,]*)")
+FAIL1b=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>[^ ]*):")
+FAIL2=re.compile("....................... INFO fetcher\\.FetcherThread - (?P<m>Crawl-Delay) for (?P<u>.*) too long")
+FAIL3=re.compile("....................... INFO fetcher\\.FetcherThread - (?P<m>Denied) by robots.txt: (?P<u>.*)$")
 cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
 seg=sys.argv[2]
 try:
   if len(sys.argv)==4:
     uuf=open(sys.argv[3])
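
The single FAIL pattern, whose two-token message capture could not distinguish failure kinds, becomes four patterns with named groups: FAIL1a for HTTP status failures, FAIL1b for Java exceptions, FAIL2 for robots.txt crawl-delay rejections, and FAIL3 for robots.txt denials. Each binds the URI to group u and the failure message to group m. A minimal sketch of what each pattern captures, using synthetic log lines shaped to the patterns (the exact Nutch log text here is an assumption; only the pattern strings come from the source):

    import re

    FAIL1a=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>Http code=[^,]*)")
    FAIL1b=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>[^ ]*):")
    FAIL2=re.compile("....................... INFO fetcher\\.FetcherThread - (?P<m>Crawl-Delay) for (?P<u>.*) too long")
    FAIL3=re.compile("....................... INFO fetcher\\.FetcherThread - (?P<m>Denied) by robots.txt: (?P<u>.*)$")

    # Synthetic lines shaped to the four patterns; the first 23 characters
    # (the dots in the patterns) stand in for the log timestamp.
    samples=[
      "2020-05-09 16:16:28,123 INFO fetcher.FetcherThread - FetcherThread 42 fetch of http://example.com/a failed with: Http code=404, message text",
      "2020-05-09 16:16:28,124 INFO fetcher.FetcherThread - FetcherThread 42 fetch of http://example.com/b failed with: java.net.SocketTimeoutException: Read timed out",
      "2020-05-09 16:16:28,125 INFO fetcher.FetcherThread - Crawl-Delay for http://example.com/c too long",
      "2020-05-09 16:16:28,126 INFO fetcher.FetcherThread - Denied by robots.txt: http://example.com/d"]
    for l in samples:
      for p in (FAIL1a,FAIL1b,FAIL2,FAIL3):
        r=p.match(l)
        if r:
          print(r.group('m'),'<-',r.group('u'))
          break

Trying FAIL1a before FAIL1b matters: an "Http code=..." message would otherwise be truncated at its first colon by FAIL1b's generic exception capture.
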
@@ -77,18 +80,20 @@
     return "<U%s>%s[%s%s]"%(self.typeString(),
                             (lambda x:'' if x=='2' else x)(self.status[0]),
                             prefix,suffix)
 
   def typeString(self):
-    return "%s%s%s%s"%('s' if self.seed else '',
+    return "%s%s%s%s%s"%('s' if self.seed else '',
                        'r' if self.w_props is not None else (
-                         '' if self.d_props is None else (
+                         ('d'+self.status[0])
+                          if self.d_props is not None else ''),
                        ('l'+'.'.join(str(s[1]) for s in
                                      sorted(self.sources,key=lambda x:x[1])))
-                        if hasattr(self,'sources') else 'd'+self.status[0])),
+                        if hasattr(self,'sources') else '',
                        '' if self.trunc is None else self.trunc[0],
-                       '' if self.fail is None else 'F')
+                       '' if self.fail is None else (
+                         'F'+(lambda x:x[10] if x[0]=='H' else x[0])(self.fail)))
 
 def readCDX(files,where,status=None):
   c=None
   res={}
   # Ref. https://github.com/ikreymer/webarchive-indexing
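
With the fail message now preserved verbatim in group m, typeString can fold it into a one-character code after the 'F': for HTTP failures ("Http code=NNN...") the character at index 10 is the first digit of the status code, otherwise the first character of the message. A small check of that lambda in isolation:

    # One-character fail code: digit at index 10 of "Http code=NNN...",
    # else the message's first character.
    code=lambda x:x[10] if x[0]=='H' else x[0]

    assert code("Http code=404")=='4'                    # type string shows 'F4'
    assert code("java.net.SocketTimeoutException")=='j'  # 'Fj' (java Exception)
    assert code("Crawl-Delay")=='C'                      # 'FC'
    assert code("Denied")=='D'                           # 'FD'
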
@@ -134,11 +139,11 @@
 
 def maybeTrack(u,source=None,depth=0):
   if not hasattr(u,'status'):
     u.status='unknown' if u.d_props is None else u.d_props["status"]
   if source is not None:
-    bptr=(source,depth+1)
+    bptr=(source,depth)
     if hasattr(u,'sources'):
       u.sources.append(bptr)
     else:
       u.sources=[bptr]
   if u.status[0]=='3':
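
A back-pointer is now recorded as (source, depth) at the depth where the redirect was encountered, rather than depth+1, and typeString renders the chain of recorded depths as the 'l' label. A minimal sketch of that rendering, assuming sources holds (source-URI, depth) pairs as built above:

    # Two hops recorded out of order; sorting by depth yields the label 'l1.2'.
    sources=[("http://example.com/hop2",2),("http://example.com/hop1",1)]
    print('l'+'.'.join(str(s[1]) for s in sorted(sources,key=lambda x:x[1])))  # l1.2
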
163 truncs+=1 168 truncs+=1
164 uri.trunc=rest.rstrip() 169 uri.trunc=rest.rstrip()
165 170
166 fails=0 171 fails=0
167 for l in log: 172 for l in log:
168 r=FAIL.match(l) 173 for p in (FAIL1a,FAIL1b,FAIL2,FAIL3):
169 if r: 174 r=p.match(l)
170 fails+=1 175 if r:
171 (u,m2)=r.groups() 176 fails+=1
172 URI.get(u).fail=m2 177 u=r.group('u')
178 m=r.group('m')
179 URI.get(u).fail=m
173 180
174 print('post-fail',len(URI.uu),file=sys.stderr) 181 print('post-fail',len(URI.uu),file=sys.stderr)
175 sys.stderr.flush() 182 sys.stderr.flush()
176 183
177 print("""For %s/%s: 184 print("""For %s/%s:
178 %4s requested s 185 %4s requested s
179 %4s retrieved r 186 %4s retrieved r
180 %4s diagnosed d/l 187 %4s diagnosed d
181 %4s failed F 188 %4s redirection-location l
182 %4s truncated rd/rt 189 %4s failed F{j for java Exception, 1-5 for Http code=, C for robot crawl delay
183 """%(cc,seg,seeds,len(fetches),len(diags),fails,truncs), 190 D for robot denied}
191 %4s truncated r{d for disconnect, t for timeout}
192 """%(cc,seg,seeds,len(fetches),len(diags),
193 sum(1 for u in URI.uu.values() if hasattr(u,'sources')),
194 fails,truncs),
184 file=sys.stderr) 195 file=sys.stderr)
185 sys.stderr.flush() 196 sys.stderr.flush()
186 197
187 if not sys.stdout.isatty(): 198 if not sys.stdout.isatty():
188 for u in URI.uu.values(): 199 for u in URI.uu.values():
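
The summary now reports redirection-location entries on their own line, counted with a generator over the URI table: an entry counts if it acquired a sources attribute, i.e. was reached via at least one redirect. A sketch of the idiom with a stand-in class (the real URI class is not shown in this hunk):

    class U: pass   # stand-in for the script's URI class

    uu={k:U() for k in ("a","b","c")}
    uu["b"].sources=[("a",1)]   # only "b" was reached by redirection
    print(sum(1 for u in uu.values() if hasattr(u,'sources')))   # 1
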