view bin/track.py @ 76:6cf3dc7ff022

Starting on a tool to assemble as complete a record as our information allows with respect to a seed URI
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 06 May 2020 14:25:44 +0100
parents
children bfff01c139ea
line wrap: on
line source

#!/lustre/sw/miniconda3/bin/python3
'''Track a list of URIs through nutch results'''
# Usage: track.py year-nn segmentid [file]

import re,sys,glob,gzip,json

# Pattern for one line of a CDX index file.  Groups: (1) the text before a
# ")", (2) the text between that ")" and the final " {", (3) the trailing
# "{...}" JSON object.  Written as a raw string so that "\)" and "\{" reach
# the regex engine as-is instead of being invalid string escapes.
# NOTE(review): groups 1 and 2 are greedy, so on a line with more than one
# ")" group 1 runs to the last ")" that still lets the rest match --
# confirm that is the intended split.
CDX=re.compile(r"(.*)\)(.*) (\{.*\})$")
try:
    # The positional arguments are read inside the try so that a missing
    # argument also prints the usage line before the traceback.
    cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
    seg=sys.argv[2]
    # Optional third argument names the input file; otherwise read stdin.
    if len(sys.argv)==4:
        uuf=open(sys.argv[3])
    else:
        uuf=sys.stdin
    # Glob patterns for the segment's two sets of gzipped CDX files.
    wwfp="%s/%s/cdx/warc/CC*.gz"%(cc,seg)
    ddfp="%s/%s/cdx/crawldiagnostics/CC*.gz"%(cc,seg)
    wwff=glob.glob(wwfp)
    ddff=glob.glob(ddfp)
    # Explicit checks rather than `assert`, which is stripped by
    # `python -O`; the unmatched glob pattern is the exception message.
    if not wwff:
        raise AssertionError(wwfp)
    if not ddff:
        raise AssertionError(ddfp)
except Exception:
    # Any setup failure (missing argument, unreadable input file, no index
    # files found) gets the usage hint; the original error still propagates.
    print("Usage: track.py year-nn segmentid [file]",file=sys.stderr)
    raise

def readCDX(files):
    '''Load gzipped CDX index files into a dict keyed by URL.

    Every line of every file must match the module-level CDX pattern.  The
    trailing JSON object of the line is parsed and stored under the value
    of its "url" member; a later line with the same URL replaces the
    earlier entry.  After each file, the number of lines read from it and
    the running number of distinct URLs are printed to stdout.

    files -- iterable of paths to gzip-compressed, UTF-8 encoded CDX files

    Returns a dict mapping each URL to its parsed JSON properties dict.

    Raises ValueError for a line that does not match CDX or whose JSON is
    malformed, and KeyError if the JSON has no "url" member.  Before any
    error from reading or parsing is re-raised, the file name, the number
    of lines already processed in that file and the current line are
    written to stderr.
    '''
    res={}
    # Ref. https://github.com/ikreymer/webarchive-indexing
    for resFileName in files:
        with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf:
            n=0
            # Reset per file so the diagnostic below never reports a line
            # left over from the previous file.
            line=None
            try:
                for line in rf:
                    m=CDX.match(line)
                    if m is None:
                        # Fail with a clear message rather than an
                        # AttributeError on None.
                        raise ValueError("not a CDX line: %r"%line)
                    # Only the JSON properties (group 3) are needed here.
                    d=json.loads(m.group(3))
                    res[d["url"]]=d
                    n+=1
            except Exception:
                print(resFileName,n,line,file=sys.stderr)
                raise
        print (n,len(res))
    return res

# Build the two URL-keyed tables in order: first from the warc CDX files,
# then from the crawldiagnostics CDX files.
fetches,diags=[readCDX(cdxFiles) for cdxFiles in (wwff,ddff)]