comparison bin/track.py @ 81:fcb390b3ea55
improved F handling/logging
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Sat, 09 May 2020 16:16:28 +0100 |
parents | f494df0d34aa |
children |
80:f494df0d34aa | 81:fcb390b3ea55 |
---|---|
3 # Usage: track.py year-nn segmentid [file] | 3 # Usage: track.py year-nn segmentid [file] |
4 | 4 |
5 import re,sys,glob,gzip,json,urllib.parse | 5 import re,sys,glob,gzip,json,urllib.parse |
6 | 6 |
7 CDX=re.compile("(.*)\)(.*) (.*) (\{.*\})$") | 7 CDX=re.compile("(.*)\)(.*) (.*) (\{.*\})$") |
8 FAIL=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)") | 8 FAIL1a=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>Http code=[^,]*)") |
| 9 FAIL1b=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>[^ ]*):") |
| 10 FAIL2=re.compile("....................... INFO fetcher\\.FetcherThread - (?P<m>Crawl-Delay) for (?P<u>.*) too long") |
| 11 FAIL3=re.compile("....................... INFO fetcher\\.FetcherThread - (?P<m>Denied) by robots.txt: (?P<u>.*)$") |
9 cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1] | 12 cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1] |
10 seg=sys.argv[2] | 13 seg=sys.argv[2] |
11 try: | 14 try: |
12 if len(sys.argv)==4: | 15 if len(sys.argv)==4: |
13 uuf=open(sys.argv[3]) | 16 uuf=open(sys.argv[3]) |
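The single FAIL pattern of the previous revision is split into four patterns with named groups u (URL) and m (message): FAIL1a for HTTP status failures, FAIL1b for exception-style failures (e.g. Java exception class names), FAIL2 for robots crawl-delay rejections and FAIL3 for robots.txt denials. A minimal sketch of how the four patterns classify fetcher log lines follows; the sample lines are hypothetical, constructed only to satisfy the patterns, and the 23-character timestamp prefix is an assumption of the sketch, not taken from real Nutch output.

```python
import re

# Sketch of the four failure patterns above; the timestamp width
# (23 characters, e.g. "2020-05-09 16:16:28,123") is an assumption.
PRE = "."*23 + r" INFO fetcher\.FetcherThread - "
FAIL1a = re.compile(PRE + r"FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>Http code=[^,]*)")
FAIL1b = re.compile(PRE + r"FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>[^ ]*):")
FAIL2  = re.compile(PRE + r"(?P<m>Crawl-Delay) for (?P<u>.*) too long")
FAIL3  = re.compile(PRE + r"(?P<m>Denied) by robots.txt: (?P<u>.*)$")

# Hypothetical log lines, one per failure class (not real Nutch output).
samples = [
 "2020-05-09 16:16:28,123 INFO fetcher.FetcherThread - FetcherThread 7 fetch of http://example.com/a failed with: Http code=404, moved",
 "2020-05-09 16:16:28,123 INFO fetcher.FetcherThread - FetcherThread 7 fetch of http://example.com/b failed with: java.net.SocketTimeoutException: Read timed out",
 "2020-05-09 16:16:28,123 INFO fetcher.FetcherThread - Crawl-Delay for http://example.com/c too long (3600000)",
 "2020-05-09 16:16:28,123 INFO fetcher.FetcherThread - Denied by robots.txt: http://example.com/d"]

for l in samples:
    for p in (FAIL1a, FAIL1b, FAIL2, FAIL3):
        r = p.match(l)
        if r:
            # Report which URL failed and why; the revision itself tries all
            # four patterns in turn, the break here is only for brevity.
            print(r.group('u'), '->', r.group('m'))
            break
```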
77 return "<U%s>%s[%s%s]"%(self.typeString(), | 80 return "<U%s>%s[%s%s]"%(self.typeString(), |
78 (lambda x:'' if x=='2' else x)(self.status[0]), | 81 (lambda x:'' if x=='2' else x)(self.status[0]), |
79 prefix,suffix) | 82 prefix,suffix) |
80 | 83 |
81 def typeString(self): | 84 def typeString(self): |
82 return "%s%s%s%s"%('s' if self.seed else '', | 85 return "%s%s%s%s%s"%('s' if self.seed else '', |
83 'r' if self.w_props is not None else ( | 86 'r' if self.w_props is not None else ( |
84 '' if self.d_props is None else ( | 87 ('d'+self.status[0]) |
| 88 if self.d_props is not None else ''), |
85 ('l'+'.'.join(str(s[1]) for s in | 89 ('l'+'.'.join(str(s[1]) for s in |
86 sorted(self.sources,key=lambda x:x[1]))) | 90 sorted(self.sources,key=lambda x:x[1]))) |
87 if hasattr(self,'sources') else 'd'+self.status[0])), | 91 if hasattr(self,'sources') else '', |
88 '' if self.trunc is None else self.trunc[0], | 92 '' if self.trunc is None else self.trunc[0], |
89 '' if self.fail is None else 'F') | 93 '' if self.fail is None else ( |
| 94 'F'+(lambda x:x[10] if x[0]=='H' else x[0])(self.fail))) |
90 | 95 |
91 def readCDX(files,where,status=None): | 96 def readCDX(files,where,status=None): |
92 c=None | 97 c=None |
93 res={} | 98 res={} |
94 # Ref. https://github.com/ikreymer/webarchive-indexing | 99 # Ref. https://github.com/ikreymer/webarchive-indexing |
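In typeString above, the failure marker is now 'F' followed by a single character derived from the stored failure message: for messages beginning "Http code=" the character at index 10 is the first digit of the HTTP status (since "Http code=" is ten characters long), otherwise the first character of the message is used, giving 'j' for a Java exception class, 'C' for Crawl-Delay and 'D' for Denied. A minimal sketch with hypothetical messages:

```python
# Failure-code letter as computed in typeString above, applied to
# hypothetical messages of the kind the FAIL* patterns capture.
code = lambda x: x[10] if x[0] == 'H' else x[0]

print(code("Http code=404"))                    # '4'  ("Http code=" is 10 characters)
print(code("java.net.SocketTimeoutException"))  # 'j'
print(code("Crawl-Delay"))                      # 'C'
print(code("Denied"))                           # 'D'
```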
134 | 139 |
135 def maybeTrack(u,source=None,depth=0): | 140 def maybeTrack(u,source=None,depth=0): |
136 if not hasattr(u,'status'): | 141 if not hasattr(u,'status'): |
137 u.status='unknown' if u.d_props is None else u.d_props["status"] | 142 u.status='unknown' if u.d_props is None else u.d_props["status"] |
138 if source is not None: | 143 if source is not None: |
139 bptr=(source,depth+1) | 144 bptr=(source,depth) |
140 if hasattr(u,'sources'): | 145 if hasattr(u,'sources'): |
141 u.sources.append(bptr) | 146 u.sources.append(bptr) |
142 else: | 147 else: |
143 u.sources=[bptr] | 148 u.sources=[bptr] |
144 if u.status[0]=='3': | 149 if u.status[0]=='3': |
163 truncs+=1 | 168 truncs+=1 |
164 uri.trunc=rest.rstrip() | 169 uri.trunc=rest.rstrip() |
165 | 170 |
166 fails=0 | 171 fails=0 |
167 for l in log: | 172 for l in log: |
168 r=FAIL.match(l) | 173 for p in (FAIL1a,FAIL1b,FAIL2,FAIL3): |
169 if r: | 174 r=p.match(l) |
170 fails+=1 | 175 if r: |
171 (u,m2)=r.groups() | 176 fails+=1 |
172 URI.get(u).fail=m2 | 177 u=r.group('u') |
| 178 m=r.group('m') |
| 179 URI.get(u).fail=m |
173 | 180 |
174 print('post-fail',len(URI.uu),file=sys.stderr) | 181 print('post-fail',len(URI.uu),file=sys.stderr) |
175 sys.stderr.flush() | 182 sys.stderr.flush() |
176 | 183 |
177 print("""For %s/%s: | 184 print("""For %s/%s: |
178 %4s requested s | 185 %4s requested s |
179 %4s retrieved r | 186 %4s retrieved r |
180 %4s diagnosed d/l | 187 %4s diagnosed d |
181 %4s failed F | 188 %4s redirection-location l |
182 %4s truncated rd/rt | 189 %4s failed F{j for java Exception, 1-5 for Http code=, C for robot crawl delay |
183 """%(cc,seg,seeds,len(fetches),len(diags),fails,truncs), | 190 D for robot denied} |
| 191 %4s truncated r{d for disconnect, t for timeout} |
| 192 """%(cc,seg,seeds,len(fetches),len(diags), |
| 193 sum(1 for u in URI.uu.values() if hasattr(u,'sources')), |
| 194 fails,truncs), |
184 file=sys.stderr) | 195 file=sys.stderr) |
185 sys.stderr.flush() | 196 sys.stderr.flush() |
186 | 197 |
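Read alongside typeString, the legend in the new summary can be decoded directly from a URI's type string: 's' for a seed, 'r' for retrieved or 'd' plus a status digit for diagnosed-only, 'l' plus the depths at which redirects pointing to it were recorded, a truncation letter, and 'F' plus the failure code described above. A few hypothetical type strings and their readings (illustrative combinations, not output from track.py):

```python
# Hypothetical type strings decoded under the legend above
# (illustrative only, not produced by running track.py).
examples = {
    "sF4":    "seed, never retrieved, fetch failed with an Http code=4xx",
    "rl1.2d": "retrieved, reached via redirects recorded at depths 1 and 2, truncated: disconnect",
    "sd3":    "seed, diagnosed only, with a 3xx status",
}
for ts, meaning in examples.items():
    print(ts, '-', meaning)
```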
187 if not sys.stdout.isatty(): | 198 if not sys.stdout.isatty(): |
188 for u in URI.uu.values(): | 199 for u in URI.uu.values(): |