changeset 78:846b38f8b204

refactor, change summary print (problem?)
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 07 May 2020 11:33:24 +0100
parents bfff01c139ea
children 4b8e4e3d60eb
files .Xauthority .hgignore bin/track.py
diffstat 3 files changed, 77 insertions(+), 22 deletions(-) [+]
line wrap: on
line diff
Binary file .Xauthority has changed
--- a/.hgignore	Wed May 06 18:28:52 2020 +0100
+++ b/.hgignore	Thu May 07 11:33:24 2020 +0100
@@ -53,3 +53,4 @@
 openjdk-8u252-b09
 nutch-cc
 src/hadoop
+.Xauthority
--- a/bin/track.py	Wed May 06 18:28:52 2020 +0100
+++ b/bin/track.py	Thu May 07 11:33:24 2020 +0100
@@ -2,7 +2,7 @@
 '''Track a list of URIs through nutch results'''
 # Usage: track.py year-nn segmentid [file]
 
-import re,sys,glob,gzip,json
+import re,sys,glob,gzip,json,urllib.parse
 
 CDX=re.compile("(.*)\)(.*) (\{.*\})$")
 FAIL=re.compile("....................... INFO  fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)")
@@ -26,9 +26,49 @@
     print("Usage: track.py year-nn segmentid [file]",file=sys.stderr)
     raise
 
-def readCDX(files):
+BU={}
+U={}
+
+class URI:  # one tracked URI plus whatever the crawl outputs say about it
+    uu={}  # class-wide registry: URI string -> URI instance
+    def __init__(self,s):  # s: the URI string; the new instance (re)registers itself in uu
+        self.s=s
+        self.d_props=None # from cdx/crawldiagnostics
+        self.w_props=None # from cdx/warc
+        self.trunc=self.fail=None  # WARC-Truncated value / fetch failure text, when seen
+        self._host=self._path=None  # filled lazily by host/path, or directly by readCDX
+        self.seed=False  # True only for URIs that came from the seed list
+        self.uu[s]=self
+
+    @property
+    def host(self):  # network location of the URI; parsed from self.s on first use
+        if self._host is None:
+            (_,self._host,self._path,_,_,_)=urllib.parse.urlparse(self.s)  # (scheme,netloc,path,params,query,fragment)
+        return self._host
+
+    @property
+    def path(self):  # path component of the URI; parsed from self.s on first use
+        if self._path is None:
+            (_,self._host,self._path,_,_,_)=urllib.parse.urlparse(self.s)  # same 6-tuple as in host
+        return self._path
+
+    @classmethod
+    def get(cls,s):  # return the registered URI for s, creating it only if absent
+        return cls.uu[s] if s in cls.uu else cls(s)  # dict.get(s,cls(s)) would build (and re-register) a fresh object every call
+
+    def __repr__(self):
+        return "<U%s>[%s/%s]"%(self.typeString(),self.host,self.path)
+
+    def typeString(self):  # flags: 's' seed; then 'r' (warc props) else 'd' (diagnostics props) else 'f' (failed); then first char of trunc
+        return "%s%s%s"%('s' if self.seed else '',
+                         'r' if self.w_props is not None else (
+                             'd' if self.d_props is not None else (
+                                 'f' if self.fail is not None else '')),
+                         '' if self.trunc is None else self.trunc[:1])  # [:1] tolerates an empty value
+
+def readCDX(files,where):
+    c=None
     res={}
-    c=None
     # Ref. https://github.com/ikreymer/webarchive-indexing
     for resFileName in files:
         with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf:
@@ -36,9 +76,14 @@
             try:
                 for c in rf:
                     r=CDX.match(c)
-                    (dom,path,props)=r.groups()
+                    (host,path,props)=r.groups()
                     d=json.loads(props)
-                    res[d["url"]]=d
+                    uri=d["url"]
+                    u=URI.get(uri)
+                    u.__dict__[where]=d
+                    u._host=host
+                    u._path=path
+                    res[uri]=u
                     n+=1
             except:
                 print(resFileName,n,c,file=sys.stderr)
@@ -46,37 +91,46 @@
         #print (n,len(res),file=sys.stderr)
     return res
 
-fetches=readCDX(wwff)
-diags=readCDX(ddff)
-trunc={}
+seeds=0  # number of lines read from buf (counts lines, not distinct URIs)
+for l in buf.readlines():  # buf is opened outside this hunk; presumably one seed URI per line
+    seeds+=1
+    u=URI(l.rstrip())  # URI.__init__ registers the instance in URI.uu under this string
+    u.seed=True
+
+print('post-seed',len(URI.uu),file=sys.stderr)  # registry size after seeding
+fetches=readCDX(wwff,'w_props')  # readCDX stores each record's parsed JSON under the named attribute
+print('post-fetch',len(URI.uu),file=sys.stderr)
+diags=readCDX(ddff,'d_props')
+print('post-diag',len(URI.uu),file=sys.stderr)
+
+truncs=0  # number of WARC-Truncated headers seen
 for l in wtf:
     if l.startswith('WARC-'):
         (k,rest)=l.split(' ',1)
         if k=='WARC-Target-URI:':
-            uri=rest.rstrip()
+            uri=URI.uu[rest.rstrip()] # better be there... (KeyError if this target was never registered)
         elif k=='WARC-Truncated:':
-            trunc[uri]=rest.rstrip()
-bu=list(map(str.rstrip,buf.readlines()))
+            truncs+=1
+            uri.trunc=rest.rstrip()  # applies to the most recently seen WARC-Target-URI
 
-fails={}
+fails=0  # number of log lines matched by FAIL
 for l in log:
     r=FAIL.match(l)
     if r:
+        fails+=1
         (u,m2)=r.groups()
-        fails[u]=m2
+        URI.get(u).fail=m2  # u is the URL string from the log line; m2 is the two-word failure text captured by FAIL
+
+print(len(URI.uu),file=sys.stderr)
 
 print("""For %s/%s:
  %4s requested
  %4s retrieved
+ %4s diagnosed
+ %4s failed
  %4s truncated
- %4s diagnosed
- %4s failed"""%(cc,seg,len(bu),len(fetches),len(trunc),len(diags),len(fails)),
+"""%(cc,seg,seeds,len(fetches),len(diags),fails,truncs),
       file=sys.stderr)
 
-for u in bu:
-    sig=0
-    sig+=8 if u in fetches else 0
-    sig+=4 if u in diags else 0
-    sig+=2 if u in fails else 0
-    sig+=1 if u in trunc else 0
-    print(format(sig,'04b'))
+for u in URI.uu.values():  # every registered URI, not only the seeds
+    print(u.typeString())  # one flag string per URI on stdout