Mercurial > hg > cc > cirrus_home
annotate bin/track.py @ 76:6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 06 May 2020 14:25:44 +0100 |
parents | |
children | bfff01c139ea |
rev | line source |
---|---|
76
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/lustre/sw/miniconda3/bin/python3 |
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 '''Track a list of URIs through nutch results''' |
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 # Usage: track.py year-nn segmentid [file] |
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 |
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 import re,sys,glob,gzip,json |
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 |
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Pattern for one line of a CDX index file.  It yields three groups: the text
# before a ')', the text between that ')' and the final space-separated field,
# and a trailing {...} blob that is parsed as JSON where the pattern is used.
# NOTE(review): presumably the first two groups are the reversed-host key and
# the path/timestamp of a CDXJ record -- confirm against the index format.
# Written as a raw string so that \) and \{ reach the regex engine unchanged
# instead of being (invalid) string-literal escapes.
CDX=re.compile(r"(.*)\)(.*) (\{.*\})$")
try:
    # The positional arguments are read inside the try so that a missing
    # argument also produces the usage message, not just an IndexError.
    cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
    seg=sys.argv[2]
    # Optional third argument names the input file; otherwise read stdin.
    if len(sys.argv)==4:
        uuf=open(sys.argv[3])
    else:
        uuf=sys.stdin
    # Glob patterns for the two sets of gzipped index files of this segment.
    wwfp="%s/%s/cdx/warc/CC*.gz"%(cc,seg)
    ddfp="%s/%s/cdx/crawldiagnostics/CC*.gz"%(cc,seg)
    wwff=glob.glob(wwfp)
    ddff=glob.glob(ddfp)
    # Explicit checks rather than assert, so they still run under python -O.
    if not wwff:
        raise FileNotFoundError(wwfp)
    if not ddff:
        raise FileNotFoundError(ddfp)
except Exception:
    # Exception (not a bare except) so that Ctrl-C or sys.exit is not
    # reported as a usage error.  The original error is still re-raised.
    print("Usage: track.py year-nn segmentid [file]",file=sys.stderr)
    raise
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
24 |
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def readCDX(files):
    '''Collect the JSON properties of every line in a set of CDX index files.

    files -- iterable of paths to gzip-compressed, UTF-8 encoded index files.
             Every line must match the module-level CDX pattern and its
             JSON blob must contain a "url" key.

    Returns a dict mapping each "url" value to the dict decoded from that
    line's JSON blob.  If a URL occurs more than once, the last one read wins.

    For each file, the number of lines read from it and the running number of
    distinct URLs are printed to stdout.

    If reading or parsing fails, the file name, the number of lines already
    processed from that file and the most recently read line (None if no line
    was read yet) are printed to stderr and the exception is re-raised.
    A line that does not match CDX raises ValueError.
    '''
    res={}
    # Ref. https://github.com/ikreymer/webarchive-indexing
    for resFileName in files:
        with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf:
            n=0
            # Reset per file so that a failure before the first line is read
            # does not report a stale line from the previous file.
            c=None
            try:
                for c in rf:
                    r=CDX.match(c)
                    if r is None:
                        # Fail with a meaningful error instead of an
                        # AttributeError on None.
                        raise ValueError("line does not match CDX pattern")
                    # Only the third group (the JSON blob) is needed here.
                    d=json.loads(r.group(3))
                    res[d["url"]]=d
                    n+=1
            except Exception:
                # Say where we were, then let the error propagate.
                print(resFileName,n,c,file=sys.stderr)
                raise
            print (n,len(res))
    return res
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
44 |
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Load both sets of index files found for this segment, each keyed by URL:
# first those matched under cdx/warc, then those under cdx/crawldiagnostics.
fetches=readCDX(files=wwff)
diags=readCDX(files=ddff)
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
47 |
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
48 |