cc/cirrus_home: bin/track.py comparison

comparison bin/track.py @ 78:846b38f8b204

refactor, change summary print (problem?)

author	Henry S. Thompson <ht@inf.ed.ac.uk>
date	Thu, 07 May 2020 11:33:24 +0100
parents	bfff01c139ea
children	4b8e4e3d60eb

comparison

equal deleted inserted replaced

-:bfff01c139ea
+:846b38f8b204
 #!/lustre/sw/miniconda3/bin/python3
 '''Track a list of URIs through nutch results'''
 # Usage: track.py year-nn segmentid [file]
-import re,sys,glob,gzip,json
+import re,sys,glob,gzip,json,urllib.parse
 # CDX index line, split into three groups: the text before a ')', the text
 # after it, and a final '{...}' object preceded by a space.
 # NOTE(review): "\)" and "\{" are not valid escapes in a non-raw string and
 # newer Python versions warn about them; a raw string would give the same pattern.
 CDX=re.compile("(.*)\)(.*) (\{.*\})$")
 # Fetcher log line for a failed fetch: group 1 is the URI, group 2 the first
 # two space-separated words after "failed with: ".
 # The 23 leading wildcards are presumably a timestamp prefix - TODO confirm.
 FAIL=re.compile("....................... INFO  fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)")
 # Crawl directory, built from the first command-line argument (year-nn).
 cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
 # Segment id, the second command-line argument.
 seg=sys.argv[2]
 assert len(ddff)!=0,ddfp
 except:
 print("Usage: track.py year-nn segmentid [file]",file=sys.stderr)
 raise
-def readCDX(files):
+BU={}
+U={}
 # NOTE(review): neither BU nor U is referenced in the visible part of this
 # revision; possibly leftovers from the refactor - confirm before relying on them.
class URI:
    """One URI being tracked, plus what the crawl artefacts say about it.

    Creating an instance registers it in the class-wide table ``uu`` under
    its string, replacing any earlier instance for the same string.  Use
    ``URI.get`` when an existing entry must be reused rather than replaced.

    Attributes:
        s: the URI string.
        d_props: properties from cdx/crawldiagnostics, or None.
        w_props: properties from cdx/warc, or None.
        trunc: truncation value, or None.
        fail: failure message, or None.
        seed: flag set by callers; shown as 's' by ``typeString``.
    """
    uu={} # registry shared by all instances: URI string -> URI object

    def __init__(self,s):
        """Create the URI for string s and register it in ``uu``."""
        self.s=s
        self.d_props=None # from cdx/crawldiagnostics
        self.w_props=None # from cdx/warc
        self.trunc=self.fail=None
        self._host=self._path=None
        self.seed=False
        self.uu[s]=self

    def _parse(self):
        """Fill ``_host`` and ``_path`` by parsing the URI string."""
        parts=urllib.parse.urlparse(self.s)
        self._host=parts.netloc
        self._path=parts.path

    @property
    def host(self):
        """Host part; parsed from ``s`` on first use unless already set."""
        if self._host is None:
            self._parse()
        return self._host

    @property
    def path(self):
        """Path part; parsed from ``s`` on first use unless already set."""
        if self._path is None:
            self._parse()
        return self._path

    @classmethod
    def get(cls,s):
        """Return the registered URI for s, creating it only if absent."""
        # Look up first: passing cls(s) as a dict.get default would build
        # (and register) a fresh object every time, clobbering the entry
        # that is already there.
        found=cls.uu.get(s)
        if found is None:
            found=cls(s)
        return found

    def __repr__(self):
        return "<U%s>[%s/%s]"%(self.typeString(),self.host,self.path)

    def typeString(self):
        """Return a short status code for this URI.

        The code is 's' if ``seed`` is set, followed by one of 'r'
        (``w_props`` present), 'd' (``d_props`` present), 'f' (``fail``
        present) or nothing, followed by the first character of ``trunc``
        when that is set.
        """
        if self.w_props is not None:
            status='r'
        elif self.d_props is not None:
            status='d'
        elif self.fail is not None:
            status='f'
        else:
            status=''
        return "%s%s%s"%('s' if self.seed else '',
                         status,
                         # [:1] rather than [0] so an empty value cannot raise
                         '' if self.trunc is None else self.trunc[:1])
+def readCDX(files,where):
 # Read gzipped CDX index files and attach each entry's JSON properties
 # to the URI object for that entry.
 #   files: the paths to read, each opened with gzip in utf-8 text mode.
 #   where: name of the URI attribute the properties are stored under
 #          (the calls in this file pass 'w_props' and 'd_props').
 # Returns a dict mapping each entry's "url" value to its URI object.
 # c is bound before the loops so the error report below can always print it.
+c=None
 res={}
-c=None
 # Ref. https://github.com/ikreymer/webarchive-indexing
 for resFileName in files:
 with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf:
 n=0
 try:
 for c in rf:
 # CDX yields the text before ')', the text after it, and the trailing JSON object.
 r=CDX.match(c)
-(dom,path,props)=r.groups()
+(host,path,props)=r.groups()
 d=json.loads(props)
-res[d["url"]]=d
+uri=d["url"]
+u=URI.get(uri)
+u.__dict__[where]=d
+u._host=host
+u._path=path
+res[uri]=u
 n+=1
 # On any error, report the file, the number of entries handled so far
 # and the last line read, then re-raise.
 except:
 print(resFileName,n,c,file=sys.stderr)
 raise
 #print (n,len(res),file=sys.stderr)
 return res
-fetches=readCDX(wwff)
 # Register every line of buf as a seed URI, then attach CDX properties read
 # from the wwff and ddff file lists; the URI table size is logged after each step.
 # NOTE(review): buf, wwff, ddff, wtf and log are bound in parts of the file
 # not shown in this comparison.
+seeds=0
-diags=readCDX(ddff)
+for l in buf.readlines():
-trunc={}
+seeds+=1
+u=URI(l.rstrip())
+u.seed=True
+print('post-seed',len(URI.uu),file=sys.stderr)
+fetches=readCDX(wwff,'w_props')
+print('post-fetch',len(URI.uu),file=sys.stderr)
+diags=readCDX(ddff,'d_props')
+print('post-diag',len(URI.uu),file=sys.stderr)
+truncs=0
 # Scan WARC header lines: remember the URI object for each WARC-Target-URI
 # and record any following WARC-Truncated value on it.
 for l in wtf:
 if l.startswith('WARC-'):
 (k,rest)=l.split(' ',1)
 if k=='WARC-Target-URI:':
-uri=rest.rstrip()
+uri=URI.uu[rest.rstrip()] # better be there...
 elif k=='WARC-Truncated:':
-trunc[uri]=rest.rstrip()
+truncs+=1
-bu=list(map(str.rstrip,buf.readlines()))
+uri.trunc=rest.rstrip()
-fails={}
+fails=0
 # Count the log lines matched by FAIL and record each failure message on its URI.
 for l in log:
 r=FAIL.match(l)
 if r:
+fails+=1
 (u,m2)=r.groups()
-fails[u]=m2
+URI.get(u).fail=m2
+print(len(URI.uu),file=sys.stderr)
 # The summary counts go to stderr; the per-URI type strings after it go to stdout.
 print("""For %s/%s:
 %4s requested
 %4s retrieved
+%4s diagnosed
+%4s failed
 %4s truncated
-%4s diagnosed
+"""%(cc,seg,seeds,len(fetches),len(diags),fails,truncs),
-%4s failed"""%(cc,seg,len(bu),len(fetches),len(trunc),len(diags),len(fails)),
 file=sys.stderr)
-for u in bu:
+for u in URI.uu.values():
-sig=0
+print(u.typeString())
-sig+=8 if u in fetches else 0
-sig+=4 if u in diags else 0
-sig+=2 if u in fails else 0
-sig+=1 if u in trunc else 0
-print(format(sig,'04b'))

Mercurial > hg > cc > cirrus_home

comparison bin/track.py @ 78:846b38f8b204