Mercurial > hg > cc > cirrus_home
comparison bin/track.py @ 78:846b38f8b204
refactor, change summary print (problem?)
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 07 May 2020 11:33:24 +0100 |
parents | bfff01c139ea |
children | 4b8e4e3d60eb |
comparison
equal
deleted
inserted
replaced
77:bfff01c139ea | 78:846b38f8b204 |
---|---|
1 #!/lustre/sw/miniconda3/bin/python3 | 1 #!/lustre/sw/miniconda3/bin/python3 |
2 '''Track a list of URIs through nutch results''' | 2 '''Track a list of URIs through nutch results''' |
3 # Usage: track.py year-nn segmentid [file] | 3 # Usage: track.py year-nn segmentid [file] |
4 | 4 |
5 import re,sys,glob,gzip,json | 5 import re,sys,glob,gzip,json,urllib.parse |
6 | 6 |
7 CDX=re.compile("(.*)\)(.*) (\{.*\})$") | 7 CDX=re.compile("(.*)\)(.*) (\{.*\})$") |
8 FAIL=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)") | 8 FAIL=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)") |
9 cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1] | 9 cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1] |
10 seg=sys.argv[2] | 10 seg=sys.argv[2] |
24 assert len(ddff)!=0,ddfp | 24 assert len(ddff)!=0,ddfp |
25 except: | 25 except: |
26 print("Usage: track.py year-nn segmentid [file]",file=sys.stderr) | 26 print("Usage: track.py year-nn segmentid [file]",file=sys.stderr) |
27 raise | 27 raise |
28 | 28 |
29 def readCDX(files): | 29 BU={} |
30 U={} | |
31 | |
32 class URI: | |
33 uu={} | |
34 def __init__(self,s): | |
35 self.s=s | |
36 self.d_props=None # from cdx/crawldiagnostics | |
37 self.w_props=None # from cdx/warc | |
38 self.trunc=self.fail=None | |
39 self._host=self._path=None | |
40 self.seed=False | |
41 self.uu[s]=self | |
42 | |
43 @property | |
44 def host(self): | |
45 if self._host is None: | |
46 (_,self._host,self._path,_,_,_)=urllib.urlparse.parse(self.s) | |
47 return self._host | |
48 | |
49 @property | |
50 def path(self): | |
51 if self._path is None: | |
52 (_,self._host,self._path,_,_,_)=urllib.parse(self.s) | |
53 return self._path | |
54 | |
55 @classmethod | |
56 def get(cls,s): | |
57 return cls.uu.get(s,cls(s)) | |
58 | |
59 def __repr__(self): | |
60 return "<U%s>[%s/%s]"%(self.typeString(),self.host,self.path) | |
61 | |
62 def typeString(self): | |
63 return "%s%s%s"%('s' if u.seed else '', | |
64 'r' if self.w_props is not None else ( | |
65 'd' if self.w_props is not None else ( | |
66 'f' if self.fail is not None else '')), | |
67 '' if self.trunc is None else self.trunc[0]) | |
68 | |
69 def readCDX(files,where): | |
70 c=None | |
30 res={} | 71 res={} |
31 c=None | |
32 # Ref. https://github.com/ikreymer/webarchive-indexing | 72 # Ref. https://github.com/ikreymer/webarchive-indexing |
33 for resFileName in files: | 73 for resFileName in files: |
34 with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf: | 74 with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf: |
35 n=0 | 75 n=0 |
36 try: | 76 try: |
37 for c in rf: | 77 for c in rf: |
38 r=CDX.match(c) | 78 r=CDX.match(c) |
39 (dom,path,props)=r.groups() | 79 (host,path,props)=r.groups() |
40 d=json.loads(props) | 80 d=json.loads(props) |
41 res[d["url"]]=d | 81 uri=d["url"] |
82 u=URI.get(uri) | |
83 u.__dict__[where]=d | |
84 u._host=host | |
85 u._path=path | |
86 res[uri]=u | |
42 n+=1 | 87 n+=1 |
43 except: | 88 except: |
44 print(resFileName,n,c,file=sys.stderr) | 89 print(resFileName,n,c,file=sys.stderr) |
45 raise | 90 raise |
46 #print (n,len(res),file=sys.stderr) | 91 #print (n,len(res),file=sys.stderr) |
47 return res | 92 return res |
48 | 93 |
49 fetches=readCDX(wwff) | 94 seeds=0 |
50 diags=readCDX(ddff) | 95 for l in buf.readlines(): |
51 trunc={} | 96 seeds+=1 |
97 u=URI(l.rstrip()) | |
98 u.seed=True | |
99 | |
100 print('post-seed',len(URI.uu),file=sys.stderr) | |
101 fetches=readCDX(wwff,'w_props') | |
102 print('post-fetch',len(URI.uu),file=sys.stderr) | |
103 diags=readCDX(ddff,'d_props') | |
104 print('post-diag',len(URI.uu),file=sys.stderr) | |
105 | |
106 truncs=0 | |
52 for l in wtf: | 107 for l in wtf: |
53 if l.startswith('WARC-'): | 108 if l.startswith('WARC-'): |
54 (k,rest)=l.split(' ',1) | 109 (k,rest)=l.split(' ',1) |
55 if k=='WARC-Target-URI:': | 110 if k=='WARC-Target-URI:': |
56 uri=rest.rstrip() | 111 uri=URI.uu[rest.rstrip()] # better be there... |
57 elif k=='WARC-Truncated:': | 112 elif k=='WARC-Truncated:': |
58 trunc[uri]=rest.rstrip() | 113 truncs+=1 |
59 bu=list(map(str.rstrip,buf.readlines())) | 114 uri.trunc=rest.rstrip() |
60 | 115 |
61 fails={} | 116 fails=0 |
62 for l in log: | 117 for l in log: |
63 r=FAIL.match(l) | 118 r=FAIL.match(l) |
64 if r: | 119 if r: |
120 fails+=1 | |
65 (u,m2)=r.groups() | 121 (u,m2)=r.groups() |
66 fails[u]=m2 | 122 URI.get(u).fail=m2 |
123 | |
124 print(len(URI.uu),file=sys.stderr) | |
67 | 125 |
68 print("""For %s/%s: | 126 print("""For %s/%s: |
69 %4s requested | 127 %4s requested |
70 %4s retrieved | 128 %4s retrieved |
129 %4s diagnosed | |
130 %4s failed | |
71 %4s truncated | 131 %4s truncated |
72 %4s diagnosed | 132 """%(cc,seg,seeds,len(fetches),len(diags),fails,truncs), |
73 %4s failed"""%(cc,seg,len(bu),len(fetches),len(trunc),len(diags),len(fails)), | |
74 file=sys.stderr) | 133 file=sys.stderr) |
75 | 134 |
76 for u in bu: | 135 for u in URI.uu.values(): |
77 sig=0 | 136 print(u.typeString()) |
78 sig+=8 if u in fetches else 0 | |
79 sig+=4 if u in diags else 0 | |
80 sig+=2 if u in fails else 0 | |
81 sig+=1 if u in trunc else 0 | |
82 print(format(sig,'04b')) |