changeset 79:4b8e4e3d60eb

track redirects, need to use full crawldiagnostics.warc.gz for "location:" and "Uri:"
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 07 May 2020 18:47:24 +0100
parents 846b38f8b204
children f494df0d34aa
files bin/track.py
diffstat 1 files changed, 62 insertions(+), 24 deletions(-) [+]
line wrap: on
line diff
--- a/bin/track.py	Thu May 07 11:33:24 2020 +0100
+++ b/bin/track.py	Thu May 07 18:47:24 2020 +0100
@@ -4,7 +4,7 @@
 
 import re,sys,glob,gzip,json,urllib.parse
 
-CDX=re.compile("(.*)\)(.*) (\{.*\})$")
+CDX=re.compile("(.*)\)(.*) (.*) (\{.*\})$")
 FAIL=re.compile("....................... INFO  fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)")
 cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
 seg=sys.argv[2]
@@ -31,42 +31,63 @@
 
 class URI:
     uu={}
-    def __init__(self,s):
+    depth=0
+    status='unknown'
+    def __init__(self,s,seed=False):
         self.s=s
         self.d_props=None # from cdx/crawldiagnostics
         self.w_props=None # from cdx/warc
         self.trunc=self.fail=None
-        self._host=self._path=None
-        self.seed=False
+        self._host=self._path=self._scheme=None
+        self.seed=seed
         self.uu[s]=self
 
     @property
     def host(self):
         if self._host is None:
-            (_,self._host,self._path,_,_,_)=urllib.urlparse.parse(self.s)
+            (self._scheme,self._host,self._path
+             ,_,_,_)=urllib.parse.urlparse(self.s)
         return self._host
 
     @property
     def path(self):
         if self._path is None:
-            (_,self._host,self._path,_,_,_)=urllib.parse(self.s)
+            (self._scheme,self._host,self._path,
+             _,_,_)=urllib.parse.urlparse(self.s)
         return self._path
 
+    @property
+    def scheme(self):
+        if self._scheme is None:
+            (self._scheme,self._host,self._path,
+             _,_,_)=urllib.parse.urlparse(self.s)
+        return self._scheme
+
     @classmethod
     def get(cls,s):
-        return cls.uu.get(s,cls(s))
+        try:
+            return cls.uu[s]
+        except KeyError:
+            return cls(s)
 
     def __repr__(self):
-        return "<U%s>[%s/%s]"%(self.typeString(),self.host,self.path)
+        prefix="%s://%s%s"%(self.scheme,self.host,self.path[:20])
+        plen=len(prefix)
+        suffix=('' if len(self.s)<plen else
+                '...'+self.s[-min(len(self.s)-plen,10):])
+        return "<U%s>%s[%s%s]"%(self.typeString(),
+                                 (lambda x:'' if x=='2' else x)(self.status[0]),
+                                 prefix,suffix)
 
     def typeString(self):
-        return "%s%s%s"%('s' if u.seed else '',
-                         'r' if self.w_props is not None else (
-                             'd' if self.w_props is not None else (
-                                 'f' if self.fail is not None else '')),
-                         '' if self.trunc is None else self.trunc[0])
+        return "%s%s%s%s"%('s' if self.seed else '',
+                           'r' if self.w_props is not None else (
+                               'd' if self.d_props is not None else (
+                                   'f' if self.fail is not None else '')),
+                           '' if self.depth==0 else self.depth,
+                           '' if self.trunc is None else self.trunc[0])
 
-def readCDX(files,where):
+def readCDX(files,where,status=None):
     c=None
     res={}
     # Ref. https://github.com/ikreymer/webarchive-indexing
@@ -76,13 +97,16 @@
             try:
                 for c in rf:
                     r=CDX.match(c)
-                    (host,path,props)=r.groups()
+                    (rdom,path,seg,props)=r.groups()
                     d=json.loads(props)
                     uri=d["url"]
                     u=URI.get(uri)
                     u.__dict__[where]=d
-                    u._host=host
-                    u._path=path
+                    u.rdomstr=rdom # domain, reverse order, comma separated
+                    u.lcpath=path # path, lower-cased, maybe %decoded?
+                    u.seg=seg # partial warc identifier?
+                    if status is not None:
+                        u.status=status
                     res[uri]=u
                     n+=1
             except:
@@ -94,15 +118,28 @@
 seeds=0
 for l in buf.readlines():
     seeds+=1
-    u=URI(l.rstrip())
-    u.seed=True
+    u=URI(l.rstrip(),True)
 
-print('post-seed',len(URI.uu),file=sys.stderr)
-fetches=readCDX(wwff,'w_props')
+print('post-seed',seeds,file=sys.stderr)
+fetches=readCDX(wwff,'w_props',"200")
 print('post-fetch',len(URI.uu),file=sys.stderr)
 diags=readCDX(ddff,'d_props')
 print('post-diag',len(URI.uu),file=sys.stderr)
 
+BORKED="borked"
+
+for u in diags.values():
+    u.status=u.d_props["status"]
+    if u.status[0]=='3':
+        try:
+            loc=u.d_props["redirect"]
+            r=URI.get(loc)
+            r.depth=u.depth+1
+            r.source=u
+            u.reloc=r
+        except KeyError:
+            u.reloc=BORKED # something went wrong somewhere...
+
 truncs=0
 for l in wtf:
     if l.startswith('WARC-'):
@@ -121,7 +158,7 @@
         (u,m2)=r.groups()
         URI.get(u).fail=m2
 
-print(len(URI.uu),file=sys.stderr)
+print('post-fail',len(URI.uu),file=sys.stderr)
 
 print("""For %s/%s:
  %4s requested
@@ -132,5 +169,6 @@
 """%(cc,seg,seeds,len(fetches),len(diags),fails,truncs),
       file=sys.stderr)
 
-for u in URI.uu.values():
-    print(u.typeString())
+if not sys.stdout.isatty():
+    for u in URI.uu.values():
+        print(u.typeString())