Mercurial > hg > cc > cirrus_home
changeset 76:6cf3dc7ff022
starting on a tool to assemble as complete a record as we have information for, with respect to a seed URI
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 06 May 2020 14:25:44 +0100 |
parents | 1c5dab2e1cb3 |
children | bfff01c139ea |
files | bin/track.py |
diffstat | 1 files changed, 48 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/lustre/sw/miniconda3/bin/python3
'''Track a list of URIs through nutch results.

Usage: track.py year-nn segmentid [file]

Loads the CDX index files for one Common Crawl segment and builds
url -> properties dictionaries for both successful fetches and
crawl diagnostics.  The URI list (argv[3] or stdin) is opened here
for later use.
'''

import re,sys,glob,gzip,json

# A CDX line looks like "surt-key)path timestamp {json-props}"; capture the
# SURT domain part, the rest of the key, and the trailing JSON blob.
# Raw string: \) and \{ are not valid string escapes and warn otherwise.
CDX=re.compile(r"(.*)\)(.*) (\{.*\})$")
cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
seg=sys.argv[2]
try:
  # URI list to track: an explicit file if given, else stdin.
  # NOTE(review): uuf is opened but not yet consumed in this changeset;
  # presumably read by code added later.
  if len(sys.argv)==4:
    uuf=open(sys.argv[3])
  else:
    uuf=sys.stdin
  # Index shards for successful fetches (warc) and failures/redirects
  # (crawldiagnostics) within the requested segment.
  wwfp="%s/%s/cdx/warc/CC*.gz"%(cc,seg)
  ddfp="%s/%s/cdx/crawldiagnostics/CC*.gz"%(cc,seg)
  wwff=glob.glob(wwfp)
  ddff=glob.glob(ddfp)
  assert len(wwff)!=0,wwfp
  assert len(ddff)!=0,ddfp
except Exception:
  # Any setup failure (bad argv, unreadable file, no matching shards)
  # gets the usage line on stderr, then the original traceback.
  print("Usage: track.py year-nn segmentid [file]",file=sys.stderr)
  raise

def readCDX(files):
  '''Parse gzipped CDX index files into one {url: properties} dict.

  files: list of .gz CDX shard filenames.  Every line must match CDX;
  its JSON property blob is stored under the blob's "url" field.
  On any parse failure the offending file, line number and line are
  reported to stderr and the exception is re-raised.
  '''
  res={}
  c=None  # current line, kept for the error report
  # Ref. https://github.com/ikreymer/webarchive-indexing
  for resFileName in files:
    with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf:
      n=0
      try:
        for c in rf:
          r=CDX.match(c)
          (dom,path,props)=r.groups()
          d=json.loads(props)
          res[d["url"]]=d
          n+=1
      except Exception:
        # Say where we were before propagating the failure.
        print(resFileName,n,c,file=sys.stderr)
        raise
  # NOTE(review): n is reset per shard, so this prints only the LAST
  # file's line count next to the total url count -- confirm intent.
  print(n,len(res))
  return res

fetches=readCDX(wwff)   # successfully fetched pages
diags=readCDX(ddff)     # diagnostics (redirects, errors, ...)