view bin/track.py @ 78:846b38f8b204

refactor, change summary print (problem?)
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 07 May 2020 11:33:24 +0100
parents bfff01c139ea
children 4b8e4e3d60eb
line wrap: on
line source

#!/lustre/sw/miniconda3/bin/python3
'''Track a list of URIs through nutch results'''
# Usage: track.py year-nn segmentid [file]

import re,sys,glob,gzip,json,urllib.parse

# One line of a CDX index file: group 1 is everything before a ')', group 2
# runs from there up to the space in front of the trailing JSON object, and
# group 3 is that JSON object.  Raw strings keep the regex escapes (\), \{, \.)
# from being read as (invalid) string escapes.
# NOTE(review): group 1 is greedy, so a ')' later in the key would be taken
# as the separator -- confirm keys never contain a second ')'.
CDX=re.compile(r"(.*)\)(.*) (\{.*\})$")
# A FetcherThread failure line from hadoop.log: 23 arbitrary leading
# characters (presumably the timestamp), then group 1 is the URI whose fetch
# failed and group 2 is the first two space-separated tokens of the reason.
FAIL=re.compile(r"....................... INFO  fetcher\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)")
# Work out the crawl and segment named on the command line and open every
# input the rest of the script reads.  The argv lookups sit inside the try
# so that a missing argument produces the usage line as well as the
# traceback, exactly like an unreadable file or an empty glob does.
try:
    cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
    seg=sys.argv[2]
    # Optional third argument: a file to use in place of stdin.
    # NOTE(review): uuf is opened but never read in the visible source.
    if len(sys.argv)==4:
        uuf=open(sys.argv[3])
    else:
        uuf=sys.stdin
    wwfp="%s/%s/cdx/warc/CC*.gz"%(cc,seg)
    ddfp="%s/%s/cdx/crawldiagnostics/CC*.gz"%(cc,seg)
    wwff=glob.glob(wwfp)
    ddff=glob.glob(ddfp)
    # These three stay open for the rest of the script, which iterates
    # over each of them once.
    wtf=open("%s/%s/warc/warc/truncated.txt"%(cc,seg))
    buf=open("%s/%s/bu.txt"%(cc,seg))
    log=open("%s/%s/hadoop.log"%(cc,seg))
    # An empty glob means the segment has no index files: report the pattern
    assert len(wwff)!=0,wwfp
    assert len(ddff)!=0,ddfp
except Exception:
    print("Usage: track.py year-nn segmentid [file]",file=sys.stderr)
    raise

# NOTE(review): neither BU nor U is referenced anywhere else in the visible
# source -- presumably left over from before the refactor; confirm before
# removing.
BU={}
U={}

class URI:
    '''One tracked URI plus whatever the crawl outputs recorded about it.

    Every instance registers itself in the class-level dict uu under its
    URI string, so constructing a URI for a string that is already
    registered replaces the earlier entry.  Use URI.get to fetch the
    existing entry (creating one only if there is none).
    '''
    uu={} # registry shared by all instances: URI string -> URI

    def __init__(self,s):
        '''Create a URI for the string s and register it in uu.'''
        self.s=s
        self.d_props=None # from cdx/crawldiagnostics
        self.w_props=None # from cdx/warc
        self.trunc=self.fail=None
        # Filled in lazily by the host/path properties, or assigned
        # directly by code that already knows them
        self._host=self._path=None
        self.seed=False
        self.uu[s]=self

    def _parse(self):
        '''Split self.s with urllib.parse.urlparse and cache host and path.'''
        (_,self._host,self._path,_,_,_)=urllib.parse.urlparse(self.s)

    @property
    def host(self):
        '''The host part of the URI, parsed from self.s on first use.'''
        if self._host is None:
            self._parse()
        return self._host

    @property
    def path(self):
        '''The path part of the URI, parsed from self.s on first use.'''
        if self._path is None:
            self._parse()
        return self._path

    @classmethod
    def get(cls,s):
        '''Return the registered URI for s, creating it only if absent.

        The lookup has to come first: constructing a URI re-registers it,
        which would discard the existing entry and everything recorded
        on it.
        '''
        found=cls.uu.get(s)
        if found is None:
            found=cls(s)
        return found

    def __repr__(self):
        return "<U%s>[%s/%s]"%(self.typeString(),self.host,self.path)

    def typeString(self):
        '''Return a short code summarising what is known about this URI.

        The code is the concatenation of:
          's' if this is a seed;
          'r' if w_props is set, else 'd' if d_props is set, else 'f' if
              fail is set;
          the first character of trunc, if trunc is set.
        '''
        if self.w_props is not None:
            outcome='r'
        elif self.d_props is not None:
            outcome='d'
        elif self.fail is not None:
            outcome='f'
        else:
            outcome=''
        return "%s%s%s"%('s' if self.seed else '',
                         outcome,
                         # [:1] rather than [0] so an empty string is harmless
                         '' if self.trunc is None else self.trunc[:1])

def readCDX(files,where):
    '''Read gzipped CDX index files and record their contents on URI objects.

    files is an iterable of file names; each is opened as UTF-8 text via
    gzip.  Every line must match CDX; its JSON part is decoded, the URI
    object for its "url" value is obtained with URI.get, and the decoded
    dict is stored on that object under the attribute named by where.
    The two leading groups of the CDX match are stored as the object's
    _host and _path.

    Returns a dict mapping each "url" value seen to its URI object.

    If anything goes wrong while reading a file, the file name, the number
    of lines handled so far in that file and the most recently read line
    are printed to stderr and the exception is re-raised.
    '''
    line=None
    found={}
    # Ref. https://github.com/ikreymer/webarchive-indexing
    for fileName in files:
        with gzip.open(fileName,mode='rt',encoding='utf-8') as cdxFile:
            count=0
            try:
                for line in cdxFile:
                    host,path,props=CDX.match(line).groups()
                    info=json.loads(props)
                    key=info["url"]
                    entry=URI.get(key)
                    # Write straight into the instance dict, as where is
                    # only known at run time
                    vars(entry)[where]=info
                    entry._host,entry._path=host,path
                    found[key]=entry
                    count+=1
            except:
                print(fileName,count,line,file=sys.stderr)
                raise
    return found

# Every line of bu.txt is a requested (seed) URI: register each one and
# leave the number of lines read in seeds (0 if the file is empty).
seeds=0
for seeds,seedLine in enumerate(buf,1):
    URI(seedLine.rstrip()).seed=True

# Report the size of the registry after each stage
print('post-seed',len(URI.uu),file=sys.stderr)
# Attach the cdx/warc index entries to their URIs ...
fetches=readCDX(wwff,'w_props')
print('post-fetch',len(URI.uu),file=sys.stderr)
# ... and likewise the cdx/crawldiagnostics entries
diags=readCDX(ddff,'d_props')
print('post-diag',len(URI.uu),file=sys.stderr)

# Walk the WARC header lines in truncated.txt: a WARC-Target-URI line
# selects the URI that a following WARC-Truncated line applies to.
truncs=0
for warcLine in wtf:
    if not warcLine.startswith('WARC-'):
        continue
    field,value=warcLine.split(' ',1)
    value=value.rstrip()
    if field=='WARC-Target-URI:':
        uri=URI.uu[value] # better be there...
    elif field=='WARC-Truncated:':
        truncs+=1
        uri.trunc=value

# Pick the fetch failures out of hadoop.log and note the reason on the
# URI concerned; fails counts matching log lines.
fails=0
for logLine in log:
    failure=FAIL.match(logLine)
    if failure is None:
        continue
    fails+=1
    failedURI,reason=failure.groups()
    URI.get(failedURI).fail=reason

# Final registry size, then the per-category totals, all on stderr
print(len(URI.uu),file=sys.stderr)

totals=((seeds,"requested"),
        (len(fetches),"retrieved"),
        (len(diags),"diagnosed"),
        (fails,"failed"),
        (truncs,"truncated"))
report="\n".join(" %4s %s"%pair for pair in totals)
print("For %s/%s:\n%s\n"%(cc,seg,report),
      file=sys.stderr)

# One type code per known URI on stdout
for u in URI.uu.values():
    print(u.typeString())