comparison bin/track.py @ 76:6cf3dc7ff022

starting on a tool to assemble as complete information as we have with respect to a seed URI
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 06 May 2020 14:25:44 +0100
parents
children bfff01c139ea
#!/lustre/sw/miniconda3/bin/python3
'''Track a list of URIs through nutch results'''
# Usage: track.py year-nn segmentid [file]

import re,sys,glob,gzip,json

# A cdx index line is "SURT-key timestamp {JSON properties}"
CDX=re.compile(r"(.*)\)(.*) (\{.*\})$")
cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
seg=sys.argv[2]
try:
    # Seed URIs come from the named file, or from stdin
    if len(sys.argv)==4:
        uuf=open(sys.argv[3])
    else:
        uuf=sys.stdin
    # cdx indexes for successful fetches and for crawl diagnostics
    wwfp="%s/%s/cdx/warc/CC*.gz"%(cc,seg)
    ddfp="%s/%s/cdx/crawldiagnostics/CC*.gz"%(cc,seg)
    wwff=glob.glob(wwfp)
    ddff=glob.glob(ddfp)
    assert len(wwff)!=0,wwfp
    assert len(ddff)!=0,ddfp
except:
    print("Usage: track.py year-nn segmentid [file]",file=sys.stderr)
    raise

def readCDX(files):
    '''Read gzipped cdx index files, returning a dict of the JSON
       properties of each entry, keyed by URL'''
    res={}
    c=None
    # Ref. https://github.com/ikreymer/webarchive-indexing
    for resFileName in files:
        with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf:
            n=0
            try:
                for c in rf:
                    r=CDX.match(c)
                    (dom,path,props)=r.groups()
                    d=json.loads(props)
                    res[d["url"]]=d
                    n+=1
            except:
                # Report the file and line that broke before re-raising
                print(resFileName,n,c,file=sys.stderr)
                raise
        print(n,len(res))  # lines in this file, URLs so far
    return res

fetches=readCDX(wwff)
diags=readCDX(ddff)
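
readCDX assumes the cdx line format used by these indexes (cf. the webarchive-indexing reference above): a SURT sort key, a capture timestamp and a block of JSON properties. The following is a minimal, self-contained sketch of what the CDX pattern and json.loads extract from one such line; the sample line and its "mime" and "status" properties are illustrative only, the script itself relies only on "url":

import re,json

CDX=re.compile(r"(.*)\)(.*) (\{.*\})$")

# Illustrative cdx line (values made up for the example)
line=('org,example)/index.html 20200506142544 '
      '{"url": "https://example.org/index.html", '
      '"mime": "text/html", "status": "200"}')

dom,path,props=CDX.match(line).groups()
d=json.loads(props)
print(dom)       # org,example
print(path)      # /index.html 20200506142544
print(d["url"])  # https://example.org/index.html

Keying the result dict on the "url" property means that later lookups of seed URIs can go straight into fetches and diags.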
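
Nothing is yet done with uuf in this revision. As a hedged sketch only, not part of this changeset, the seed URIs it supplies could be checked against the two indexes along the following lines; the "status" property is an assumption about the cdx JSON, hence the .get:

# Sketch only, not in this changeset: classify each seed URI by whether
# it appears in the warc (fetched) or the crawldiagnostics index
for line in uuf:
    uri=line.strip()
    if uri in fetches:
        print(uri,"fetched",fetches[uri].get("status"))
    elif uri in diags:
        print(uri,"diagnostics",diags[uri].get("status"))
    else:
        print(uri,"not found in this segment")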