changeset 81:fcb390b3ea55

improved F handling/logging
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Sat, 09 May 2020 16:16:28 +0100
parents f494df0d34aa
children 6304d5c46229
files bin/track.py
diffstat 1 files changed, 26 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/bin/track.py	Fri May 08 19:52:36 2020 +0100
+++ b/bin/track.py	Sat May 09 16:16:28 2020 +0100
@@ -5,7 +5,10 @@
 import re,sys,glob,gzip,json,urllib.parse
 
 CDX=re.compile("(.*)\)(.*) (.*) (\{.*\})$")
-FAIL=re.compile("....................... INFO  fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)")
+FAIL1a=re.compile("....................... INFO  fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>Http code=[^,]*)")
+FAIL1b=re.compile("....................... INFO  fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>[^ ]*):")
+FAIL2=re.compile("....................... INFO  fetcher\\.FetcherThread - (?P<m>Crawl-Delay) for (?P<u>.*) too long")
+FAIL3=re.compile("....................... INFO  fetcher\\.FetcherThread - (?P<m>Denied) by robots.txt: (?P<u>.*)$")
 cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
 seg=sys.argv[2]
 try:
@@ -79,14 +82,16 @@
                                  prefix,suffix)
 
     def typeString(self):
-        return "%s%s%s%s"%('s' if self.seed else '',
+        return "%s%s%s%s%s"%('s' if self.seed else '',
                            'r' if self.w_props is not None else (
-                               '' if self.d_props is None else (
+                               ('d'+self.status[0])
+                                  if self.d_props is not None else ''),
                            ('l'+'.'.join(str(s[1]) for s in
                                         sorted(self.sources,key=lambda x:x[1])))
-                                if hasattr(self,'sources') else 'd'+self.status[0])),
+                                if hasattr(self,'sources') else '',
                            '' if self.trunc is None else self.trunc[0],
-                           '' if self.fail is None else 'F')
+                           '' if self.fail is None else (
+                               'F'+(lambda x:x[10] if x[0]=='H' else x[0])(self.fail)))
 
 def readCDX(files,where,status=None):
     c=None
@@ -136,7 +141,7 @@
     if not hasattr(u,'status'):
         u.status='unknown' if u.d_props is None else u.d_props["status"]
     if source is not None:
-        bptr=(source,depth+1)
+        bptr=(source,depth)
         if hasattr(u,'sources'):
             u.sources.append(bptr)
         else:
@@ -165,11 +170,13 @@
 
 fails=0
 for l in log:
-    r=FAIL.match(l)
-    if r:
-        fails+=1
-        (u,m2)=r.groups()
-        URI.get(u).fail=m2
+    for p in (FAIL1a,FAIL1b,FAIL2,FAIL3):
+        r=p.match(l)
+        if r:
+            fails+=1
+            u=r.group('u')
+            m=r.group('m')
+            URI.get(u).fail=m
 
 print('post-fail',len(URI.uu),file=sys.stderr)
 sys.stderr.flush()
@@ -177,10 +184,14 @@
 print("""For %s/%s:
  %4s requested s
  %4s retrieved r
- %4s diagnosed d/l
- %4s failed F
- %4s truncated rd/rt
-"""%(cc,seg,seeds,len(fetches),len(diags),fails,truncs),
+ %4s diagnosed d
+ %4s redirection-location l
+ %4s failed F{j for java Exception, 1-5 for Http code=, C for robot crawl delay
+              D for robot denied}
+ %4s truncated r{d for disconnect, t for timeout}
+"""%(cc,seg,seeds,len(fetches),len(diags),
+     sum(1 for u in URI.uu.values() if hasattr(u,'sources')),
+         fails,truncs),
       file=sys.stderr)
 sys.stderr.flush()