Mercurial > hg > cc > cirrus_home
view bin/track.py @ 76:6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 06 May 2020 14:25:44 +0100 |
parents | |
children | bfff01c139ea |
line wrap: on
line source
#!/lustre/sw/miniconda3/bin/python3
"""Track a list of URIs through nutch results.

Usage: track.py year-nn segmentid [file]

When run as a script, locates the gzipped CDX index files for one segment
of a CC-MAIN-<year-nn> crawl (the ``warc`` and ``crawldiagnostics``
indexes) and loads each set into a dict keyed by URL (``fetches`` and
``diags`` respectively).

The URI list source ``uuf`` is the optional *file* argument if given,
otherwise standard input; the code here opens it but does not yet read it.
"""

import glob
import gzip
import json
import re
import sys

USAGE = "Usage: track.py year-nn segmentid [file]"

# One index line: some text up to a ')', more text, a space, then a {...}
# JSON object running to the end of the line.  Only the JSON part is used.
# Raw string so that \) and \{ reach the regex engine untouched instead of
# being (invalid) string-literal escapes.
CDX = re.compile(r"(.*)\)(.*) (\{.*\})$")


def readCDX(files):
    """Load gzipped CDX index files into one dict keyed by URL.

    files -- iterable of paths to gzip-compressed, UTF-8 encoded index
             files, one entry per line in the form matched by CDX.

    Returns a dict mapping each entry's "url" property to the decoded JSON
    properties dict for that entry.  If a URL occurs more than once, the
    last entry read wins.

    After each file, prints "<entries read from this file> <distinct URLs
    so far>" to stdout.

    On any error while reading a file, prints the file name, the number of
    entries successfully read from it and the last line read to stderr,
    then re-raises.  A line that does not match CDX raises ValueError (as
    does invalid JSON, via json.JSONDecodeError); an entry without a "url"
    property raises KeyError.
    """
    res = {}
    # Ref. https://github.com/ikreymer/webarchive-indexing
    for resFileName in files:
        n = 0
        c = None  # last line read from this file, for the diagnostic below
        with gzip.open(resFileName, mode='rt', encoding='utf-8') as rf:
            try:
                for c in rf:
                    r = CDX.match(c)
                    if r is None:
                        # Fail with a meaningful error rather than an
                        # AttributeError on None.
                        raise ValueError(
                            "malformed CDX line in %s: %r" % (resFileName, c))
                    d = json.loads(r.group(3))
                    res[d["url"]] = d
                    n += 1
            except Exception:
                print(resFileName, n, c, file=sys.stderr)
                raise
        print(n, len(res))
    return res


if __name__ == "__main__":
    # Kept at module level (not inside a function) so that cc, seg, uuf,
    # wwff, ddff, fetches and diags remain module globals when run as a
    # script.
    try:
        # Inside the try so that missing arguments also print the usage line.
        cc = "/beegfs/common_crawl/CC-MAIN-%s" % sys.argv[1]
        seg = sys.argv[2]
        if len(sys.argv) == 4:
            uuf = open(sys.argv[3])
        else:
            uuf = sys.stdin
        wwfp = "%s/%s/cdx/warc/CC*.gz" % (cc, seg)
        ddfp = "%s/%s/cdx/crawldiagnostics/CC*.gz" % (cc, seg)
        wwff = glob.glob(wwfp)
        ddff = glob.glob(ddfp)
        # Explicit checks rather than assert, which is stripped by python -O.
        if not wwff:
            raise FileNotFoundError(wwfp)
        if not ddff:
            raise FileNotFoundError(ddfp)
    except Exception:
        print(USAGE, file=sys.stderr)
        raise

    fetches = readCDX(wwff)
    diags = readCDX(ddff)