#!/lustre/sw/miniconda3/bin/python3
'''Track a list of URIs through nutch results'''
# Usage: track.py year-nn segmentid [file]
#
# Provenance (reconstructed from an hg changeset patch):
#   User: Henry S. Thompson, Date: Wed May 06 14:25:44 2020 +0100
#   Node 6cf3dc7ff0227d7e59b9b25bc7893a3ac10f961f (parent 1c5dab2e1cb3)
#   "starting on tool to assemble as complete as we have info wrt a seed URI"
#   New file: bin/track.py

import re, sys, glob, gzip, json

# A CDX index line looks like "<SURT domain>)<path> <json-properties>".
# Raw string: the original non-raw "(.*)\)..." relied on invalid escapes
# ("\)", "\{") that raise SyntaxWarning in modern Python.
CDX = re.compile(r"(.*)\)(.*) (\{.*\})$")


def readCDX(files):
    '''Read gzipped CDX index files and return {url: properties-dict}.

    Each line of each file is matched against CDX; the JSON properties
    field is parsed and stored keyed by its "url" member.  Later files
    overwrite earlier entries for the same url.

    files -- iterable of paths to gzip-compressed, UTF-8 CDX files.
    Raises on the first malformed line (no regex match, bad JSON, or a
    properties object lacking "url"), after reporting file/line-count/
    line to stderr.
    '''
    res = {}
    c = None  # current line, kept for the error report below
    # Ref. https://github.com/ikreymer/webarchive-indexing
    for resFileName in files:
        with gzip.open(resFileName, mode='rt', encoding='utf-8') as rf:
            n = 0
            try:
                for c in rf:
                    r = CDX.match(c)
                    (dom, path, props) = r.groups()
                    d = json.loads(props)
                    res[d["url"]] = d
                    n += 1
            except Exception:
                # Identify the offending file and line before re-raising.
                # (Narrowed from a bare "except:", which would also trap
                # KeyboardInterrupt/SystemExit.)
                print(resFileName, n, c, file=sys.stderr)
                raise
        # Progress: lines read from this file, cumulative distinct urls.
        # NOTE(review): original indentation was ambiguous — this may have
        # been a single print after the loop; confirm against upstream.
        print(n, len(res))
    return res


def main():
    '''Locate the segment's warc/crawldiagnostics CDX files and index them.'''
    cc = "/beegfs/common_crawl/CC-MAIN-%s" % sys.argv[1]
    seg = sys.argv[2]
    try:
        if len(sys.argv) == 4:
            uuf = open(sys.argv[3])
        else:
            uuf = sys.stdin  # URI list comes from stdin when no file given
        wwfp = "%s/%s/cdx/warc/CC*.gz" % (cc, seg)
        ddfp = "%s/%s/cdx/crawldiagnostics/CC*.gz" % (cc, seg)
        wwff = glob.glob(wwfp)
        ddff = glob.glob(ddfp)
        # Fail loudly (with the pattern that found nothing) on a bad path.
        assert len(wwff) != 0, wwfp
        assert len(ddff) != 0, ddfp
    except Exception:
        print("Usage: track.py year-nn segmentid [file]", file=sys.stderr)
        raise
    fetches = readCDX(wwff)   # successful fetches
    diags = readCDX(ddff)     # crawl-diagnostics (redirects, errors, ...)


if __name__ == "__main__":
    main()