annotate bin/track.py @ 76:6cf3dc7ff022

starting on tool to assemble as complete as we have info wrt a seed URI
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 06 May 2020 14:25:44 +0100
parents
children bfff01c139ea
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
76
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/lustre/sw/miniconda3/bin/python3
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 '''Track a list of URIs through nutch results'''
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 # Usage: track.py year-nn segmentid [file]
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 import re,sys,glob,gzip,json
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# One CDX index line, as consumed by readCDX: "<domain part>)<path part> {<JSON properties>}".
# Raw string so that "\)" and "\{" reach the regex engine as regex escapes
# instead of being (invalid) string-literal escapes.
CDX = re.compile(r"(.*)\)(.*) (\{.*\})$")

try:
    # The positional arguments are read inside the try so that a missing
    # argument also produces the usage message, not just a bare IndexError.
    cc = "/beegfs/common_crawl/CC-MAIN-%s" % sys.argv[1]
    seg = sys.argv[2]
    # Optional third argument names an input file; otherwise read stdin.
    # NOTE(review): presumably this is the list of URIs to track -- it is not
    # consumed in the part of the script shown here, so it is left open.
    if len(sys.argv) == 4:
        uuf = open(sys.argv[3])
    else:
        uuf = sys.stdin
    # Glob patterns for the two sets of gzipped CDX index files of the segment.
    wwfp = "%s/%s/cdx/warc/CC*.gz" % (cc, seg)
    ddfp = "%s/%s/cdx/crawldiagnostics/CC*.gz" % (cc, seg)
    wwff = glob.glob(wwfp)
    ddff = glob.glob(ddfp)
    # Explicit checks rather than assert, which is stripped under "python -O".
    # The unmatched pattern is carried in the exception for diagnosis.
    if not wwff:
        raise FileNotFoundError(wwfp)
    if not ddff:
        raise FileNotFoundError(ddfp)
except (IndexError, OSError):
    # Missing arguments, an unreadable input file, or no index files found:
    # show the usage line, then let the original error propagate.
    print("Usage: track.py year-nn segmentid [file]", file=sys.stderr)
    raise
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
24
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def readCDX(files):
    '''Load gzipped CDX index files into a single dict keyed by URL.

    Each line of each file must match the module-level CDX pattern; its
    trailing {...} part is parsed as JSON and stored under that object's
    "url" value.  A later line with the same "url" replaces an earlier one.

    files -- iterable of paths to gzip-compressed, UTF-8 text index files.

    Returns a dict mapping url -> parsed JSON properties (a dict).

    Raises ValueError if a line does not match CDX; errors from gzip,
    json.loads or a missing "url" key propagate unchanged.  In every such
    case the file name, the count of lines successfully processed in that
    file, and the offending line are first written to stderr.

    After each file, the number of lines read from it and the running
    number of distinct URLs are printed to stdout.
    '''
    res = {}
    # Ref. https://github.com/ikreymer/webarchive-indexing
    for resFileName in files:
        n = 0
        # Reset per file so the diagnostic below never shows a line that
        # belongs to a previous file.
        c = None
        with gzip.open(resFileName, mode='rt', encoding='utf-8') as rf:
            try:
                for c in rf:
                    r = CDX.match(c)
                    if r is None:
                        # Fail with a meaningful error instead of an
                        # AttributeError on None.
                        raise ValueError("not a CDX index line")
                    # Only the JSON properties (third group) are needed.
                    d = json.loads(r.group(3))
                    res[d["url"]] = d
                    n += 1
            except Exception:
                # Report where we were, then let the error propagate.
                print(resFileName, n, c, file=sys.stderr)
                raise
        print(n, len(res))
    return res
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
44
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Load both sets of CDX index files, in order: first the ones globbed from
# cdx/warc (-> fetches), then the ones from cdx/crawldiagnostics (-> diags).
fetches, diags = (readCDX(indexFiles) for indexFiles in (wwff, ddff))
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
47
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
48