Mercurial > hg > cc > cirrus_home
comparison bin/track.py @ 76:6cf3dc7ff022
Starting on a tool to assemble as complete a record as our information allows for a seed URI
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 06 May 2020 14:25:44 +0100 |
parents | |
children | bfff01c139ea |
comparison
equal
deleted
inserted
replaced
75:1c5dab2e1cb3 | 76:6cf3dc7ff022 |
---|---|
1 #!/lustre/sw/miniconda3/bin/python3 | |
2 '''Track a list of URIs through nutch results''' | |
3 # Usage: track.py year-nn segmentid [file] | |
4 | |
5 import re,sys,glob,gzip,json | |
6 | |
# One index line: text up to a ')', then text up to the space that precedes
# a trailing brace-delimited object, then that "{...}" object itself.
# NOTE(review): presumably the CDXJ layout "surt-host)path timestamp {json}"
# -- confirm against the index files.
# Raw string: "\)" and "\{" are invalid escapes in an ordinary literal.
CDX=re.compile(r"(.*)\)(.*) (\{.*\})$")
try:
    # Read argv inside the try so that missing arguments also produce the
    # usage line before the IndexError propagates
    cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
    seg=sys.argv[2]
    # URIs come from the optional third argument, otherwise from stdin.
    # The handle stays open for use after this block, so no 'with' here.
    if len(sys.argv)==4:
        uuf=open(sys.argv[3])
    else:
        uuf=sys.stdin
    wwfp="%s/%s/cdx/warc/CC*.gz"%(cc,seg)
    ddfp="%s/%s/cdx/crawldiagnostics/CC*.gz"%(cc,seg)
    wwff=glob.glob(wwfp)
    ddff=glob.glob(ddfp)
    # Explicit raises instead of assert statements, so the checks still run
    # under python -O; the exception type and argument are unchanged
    if not wwff:
        raise AssertionError(wwfp)
    if not ddff:
        raise AssertionError(ddfp)
except Exception:
    # Any setup failure is reported as a usage problem, then re-raised so the
    # traceback still shows the underlying cause
    print("Usage: track.py year-nn segmentid [file]",file=sys.stderr)
    raise
24 | |
def readCDX(files):
    """Load gzipped CDX index files into one dict keyed by URL.

    Every line is matched against the module-level CDX pattern and its
    trailing JSON object is parsed; the result is stored under that
    object's "url" value, so a later line for the same URL replaces an
    earlier one.  A line that does not match the pattern raises
    AttributeError.

    files -- iterable of paths to gzip-compressed, UTF-8 encoded index files

    Returns the dict of parsed JSON objects.  After each file, prints the
    number of lines read from it and the running number of distinct URLs.
    If anything goes wrong while reading a file, prints the file name, the
    number of lines completed and the line being processed to stderr, then
    re-raises.
    """
    res={}
    # Ref. https://github.com/ikreymer/webarchive-indexing
    for resFileName in files:
        with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf:
            n=0
            # Reset per file: if reading fails before the first line is
            # produced, report None rather than the previous file's last line
            c=None
            try:
                for c in rf:
                    # Only the JSON part (third group) is needed
                    props=CDX.match(c).group(3)
                    d=json.loads(props)
                    res[d["url"]]=d
                    n+=1
            except BaseException:
                # Deliberately as broad as a bare except: an interrupt also
                # reports how far we got before propagating
                print(resFileName,n,c,file=sys.stderr)
                raise
        print(n,len(res))
    return res
44 | |
# URL-keyed index entries read from the segment's cdx/warc files
fetches = readCDX(wwff)
# URL-keyed index entries read from the segment's cdx/crawldiagnostics files
diags = readCDX(ddff)
47 | |
48 |