comparison bin/track.py @ 76:6cf3dc7ff022

starting on a tool to assemble as complete information as we have with respect to a seed URI
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 06 May 2020 14:25:44 +0100
parents
children bfff01c139ea
#!/lustre/sw/miniconda3/bin/python3
'''Track a list of URIs through nutch results'''
# Usage: track.py year-nn segmentid [file]

import re,sys,glob,gzip,json

# A cdx index line is "SURT-key timestamp {JSON properties}"
CDX=re.compile(r"(.*)\)(.*) (\{.*\})$")
cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
seg=sys.argv[2]
try:
    # Seed URIs come from the named file, or from stdin
    if len(sys.argv)==4:
        uuf=open(sys.argv[3])
    else:
        uuf=sys.stdin
    # cdx indexes for successful fetches and for crawl diagnostics
    wwfp="%s/%s/cdx/warc/CC*.gz"%(cc,seg)
    ddfp="%s/%s/cdx/crawldiagnostics/CC*.gz"%(cc,seg)
    wwff=glob.glob(wwfp)
    ddff=glob.glob(ddfp)
    assert len(wwff)!=0,wwfp
    assert len(ddff)!=0,ddfp
except:
    print("Usage: track.py year-nn segmentid [file]",file=sys.stderr)
    raise

def readCDX(files):
    '''Read gzipped cdx index files, returning a dict of the JSON
       properties of each entry, keyed by URL'''
    res={}
    c=None
    # Ref. https://github.com/ikreymer/webarchive-indexing
    for resFileName in files:
        with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf:
            n=0
            try:
                for c in rf:
                    r=CDX.match(c)
                    (dom,path,props)=r.groups()
                    d=json.loads(props)
                    res[d["url"]]=d
                    n+=1
            except:
                # Report the file and line that broke before re-raising
                print(resFileName,n,c,file=sys.stderr)
                raise
        print(n,len(res))  # lines in this file, URLs so far
    return res

fetches=readCDX(wwff)
diags=readCDX(ddff)
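
readCDX assumes the cdx line format used by these indexes (cf. the webarchive-indexing reference above): a SURT sort key, a capture timestamp and a block of JSON properties. The following is a minimal, self-contained sketch of what the CDX pattern and json.loads extract from one such line; the sample line and its "mime" and "status" properties are illustrative only, the script itself relies only on "url":

import re,json

CDX=re.compile(r"(.*)\)(.*) (\{.*\})$")

# Illustrative cdx line (values made up for the example)
line=('org,example)/index.html 20200506142544 '
      '{"url": "https://example.org/index.html", '
      '"mime": "text/html", "status": "200"}')

dom,path,props=CDX.match(line).groups()
d=json.loads(props)
print(dom)       # org,example
print(path)      # /index.html 20200506142544
print(d["url"])  # https://example.org/index.html

Keying the result dict on the "url" property means that later lookups of seed URIs can go straight into fetches and diags.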
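
Nothing is yet done with uuf in this revision. As a hedged sketch only, not part of this changeset, the seed URIs it supplies could be checked against the two indexes along the following lines; the "status" property is an assumption about the cdx JSON, hence the .get:

# Sketch only, not in this changeset: classify each seed URI by whether
# it appears in the warc (fetched) or the crawldiagnostics index
for line in uuf:
    uri=line.strip()
    if uri in fetches:
        print(uri,"fetched",fetches[uri].get("status"))
    elif uri in diags:
        print(uri,"diagnostics",diags[uri].get("status"))
    else:
        print(uri,"not found in this segment")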