# HG changeset patch
# User Henry S. Thompson
# Date 1589037388 -3600
# Node ID fcb390b3ea553963a1fc83e4852fabd83202acfe
# Parent  f494df0d34aa8c65f2ff913884190f226ec883f5
improved F handling/logging

diff -r f494df0d34aa -r fcb390b3ea55 bin/track.py
--- a/bin/track.py	Fri May 08 19:52:36 2020 +0100
+++ b/bin/track.py	Sat May 09 16:16:28 2020 +0100
@@ -5,7 +5,10 @@
 import re,sys,glob,gzip,json,urllib.parse
 
 CDX=re.compile("(.*)\)(.*) (.*) (\{.*\})$")
-FAIL=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)")
+FAIL1a=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>Http code=[^,]*)")
+FAIL1b=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>[^ ]*):")
+FAIL2=re.compile("....................... INFO fetcher\\.FetcherThread - (?P<m>Crawl-Delay) for (?P<u>.*) too long")
+FAIL3=re.compile("....................... INFO fetcher\\.FetcherThread - (?P<m>Denied) by robots.txt: (?P<u>.*)$")
 cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
 seg=sys.argv[2]
 try:
@@ -79,14 +82,16 @@
                   prefix,suffix)
 
   def typeString(self):
-    return "%s%s%s%s"%('s' if self.seed else '',
+    return "%s%s%s%s%s"%('s' if self.seed else '',
                        'r' if self.w_props is not None else (
-                         '' if self.d_props is None else (
+                         ('d'+self.status[0])
+                          if self.d_props is not None else ''),
                        ('l'+'.'.join(str(s[1]) for s in sorted(self.sources,key=lambda x:x[1])))
-                         if hasattr(self,'sources') else 'd'+self.status[0])),
+                        if hasattr(self,'sources') else '',
                        '' if self.trunc is None else self.trunc[0],
-                       '' if self.fail is None else 'F')
+                       '' if self.fail is None else (
+                         'F'+(lambda x:x[10] if x[0]=='H' else x[0])(self.fail)))
 
 
 def readCDX(files,where,status=None):
   c=None
@@ -136,7 +141,7 @@
     if not hasattr(u,'status'):
       u.status='unknown' if u.d_props is None else u.d_props["status"]
     if source is not None:
-      bptr=(source,depth+1)
+      bptr=(source,depth)
       if hasattr(u,'sources'):
         u.sources.append(bptr)
       else:
@@ -165,11 +170,13 @@
 fails=0
 
 for l in log:
-  r=FAIL.match(l)
-  if r:
-    fails+=1
-    (u,m2)=r.groups()
-    URI.get(u).fail=m2
+  for p in (FAIL1a,FAIL1b,FAIL2,FAIL3):
+    r=p.match(l)
+    if r:
+      fails+=1
+      u=r.group('u')
+      m=r.group('m')
+      URI.get(u).fail=m
 
 print('post-fail',len(URI.uu),file=sys.stderr)
 sys.stderr.flush()
@@ -177,10 +184,14 @@
 print("""For %s/%s:
   %4s requested s
   %4s retrieved r
-  %4s diagnosed d/l
-  %4s failed F
-  %4s truncated rd/rt
-"""%(cc,seg,seeds,len(fetches),len(diags),fails,truncs),
+  %4s diagnosed d
+  %4s redirection-location l
+  %4s failed F{j for java Exception, 1-5 for Http code=, C for robot crawl delay
+              D for robot denied}
+  %4s truncated r{d for disconnect, t for timeout}
+"""%(cc,seg,seeds,len(fetches),len(diags),
+     sum(1 for u in URI.uu.values() if hasattr(u,'sources')),
+     fails,truncs),
       file=sys.stderr)
 sys.stderr.flush()
 