comparison bin/track.py @ 78:846b38f8b204

refactor, change summary print (problem?)
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 07 May 2020 11:33:24 +0100
parents bfff01c139ea
children 4b8e4e3d60eb
comparison
equal deleted inserted replaced
77:bfff01c139ea 78:846b38f8b204
1 #!/lustre/sw/miniconda3/bin/python3 1 #!/lustre/sw/miniconda3/bin/python3
2 '''Track a list of URIs through nutch results''' 2 '''Track a list of URIs through nutch results'''
3 # Usage: track.py year-nn segmentid [file] 3 # Usage: track.py year-nn segmentid [file]
4 4
5 import re,sys,glob,gzip,json 5 import re,sys,glob,gzip,json,urllib.parse
6 6
# One CDX index line: "<key-before-')'>)<rest-of-key-and-timestamp> <JSON properties>".
# Groups: text before the first-matched ')' , text between ')' and the final
# " {", and the trailing JSON object.
# Raw strings are used so the regex escapes (\) \{ \} \.) are not also read
# as (invalid) string-literal escapes.
CDX=re.compile(r"(.*)\)(.*) (\{.*\})$")
# One fetcher log line reporting a failed fetch.  The leading run of '.' skips
# a fixed-width prefix (presumably the log timestamp -- confirm against a
# real log).  Groups: the URI that failed, and the first two space-separated
# words of the failure message.
FAIL=re.compile(r"....................... INFO fetcher\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)")
9 cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1] 9 cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
10 seg=sys.argv[2] 10 seg=sys.argv[2]
24 assert len(ddff)!=0,ddfp 24 assert len(ddff)!=0,ddfp
25 except: 25 except:
26 print("Usage: track.py year-nn segmentid [file]",file=sys.stderr) 26 print("Usage: track.py year-nn segmentid [file]",file=sys.stderr)
27 raise 27 raise
28 28
29 def readCDX(files): 29 BU={}
30 U={}
31
class URI:
    '''One tracked URI plus whatever has been recorded about it.

    Every instance registers itself in the class-wide table ``uu`` under its
    URI string, so calling ``URI(s)`` for a string that is already registered
    replaces the earlier entry.  Use ``URI.get(s)`` to look up an existing
    entry and create one only when none exists.
    '''
    uu={}  # registry shared by all instances: URI string -> URI

    def __init__(self,s):
        '''Create the entry for URI string ``s`` and register it in ``uu``.'''
        self.s=s
        self.d_props=None # from cdx/crawldiagnostics
        self.w_props=None # from cdx/warc
        self.trunc=self.fail=None
        self._host=self._path=None  # filled lazily, or set directly by a caller
        self.seed=False
        self.uu[s]=self

    def _parse(self):
        '''Fill ``_host`` and ``_path`` from the URI string itself.'''
        parts=urllib.parse.urlparse(self.s)
        self._host=parts.netloc
        self._path=parts.path

    @property
    def host(self):
        '''Host part; parsed from the URI string on first use if not yet set.'''
        if self._host is None:
            self._parse()
        return self._host

    @property
    def path(self):
        '''Path part; parsed from the URI string on first use if not yet set.'''
        if self._path is None:
            self._parse()
        return self._path

    @classmethod
    def get(cls,s):
        '''Return the registered URI for ``s``, creating it only if absent.'''
        # Do not write cls.uu.get(s,cls(s)): the default is evaluated eagerly,
        # and constructing it would overwrite the registered instance.
        u=cls.uu.get(s)
        if u is None:
            u=cls(s)
        return u

    def __repr__(self):
        return "<U%s>[%s/%s]"%(self.typeString(),self.host,self.path)

    def typeString(self):
        '''Return a short code summarising what is known about this URI.

        Concatenation of: 's' if it is a seed; then at most one of 'r'
        (``w_props`` set), 'd' (``d_props`` set) or 'f' (``fail`` set), in
        that order of precedence; then the first character of ``trunc`` if
        it is set.
        '''
        if self.w_props is not None:
            outcome='r'
        elif self.d_props is not None:
            outcome='d'
        elif self.fail is not None:
            outcome='f'
        else:
            outcome=''
        # [:1] rather than [0] so an empty truncation reason cannot raise
        return "%s%s%s"%('s' if self.seed else '',
                         outcome,
                         '' if self.trunc is None else self.trunc[:1])
68
def readCDX(files,where):
    '''Load gzipped CDX index files into the URI registry.

    files -- iterable of paths to gzip-compressed, UTF-8 CDX files
    where -- name of the URI attribute that receives each record's parsed
             JSON properties

    For every line, the URI named by the record's "url" property is obtained
    via URI.get, its `where` attribute is set to the parsed properties, and
    its _host/_path are overwritten with the two key parts captured by the
    CDX pattern.  Returns a dict mapping each such url to its URI object.

    If anything goes wrong while reading a file, the file name, the number
    of lines completed in that file and the most recently read line are
    printed to stderr and the exception is re-raised.
    '''
    # Ref. https://github.com/ikreymer/webarchive-indexing
    byURI={}
    line=None  # bound out here so the error report works even before any read
    for resFileName in files:
        with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf:
            done=0
            try:
                for line in rf:
                    host,path,props=CDX.match(line).groups()
                    record=json.loads(props)
                    uri=record["url"]
                    entry=URI.get(uri)
                    vars(entry)[where]=record
                    entry._host,entry._path=host,path
                    byURI[uri]=entry
                    done+=1
            except BaseException:
                # report where we were, then let the error propagate
                print(resFileName,done,line,file=sys.stderr)
                raise
    return byURI
48 93
# Seed list: every line of buf is a requested URI.  Constructing the URI
# registers it in URI.uu.
seeds=0
for seedLine in buf.readlines():
    seeds+=1
    u=URI(seedLine.rstrip())
    u.seed=True

# CDX indexes: attach warc and crawldiagnostics properties, reporting the
# registry size after each stage.
print('post-seed',len(URI.uu),file=sys.stderr)
fetches=readCDX(wwff,'w_props')
print('post-fetch',len(URI.uu),file=sys.stderr)
diags=readCDX(ddff,'d_props')
print('post-diag',len(URI.uu),file=sys.stderr)

# WARC header lines: a WARC-Truncated header applies to the URI named by the
# most recent WARC-Target-URI header.
truncs=0
for header in wtf:
    if not header.startswith('WARC-'):
        continue
    (field,value)=header.split(' ',1)
    if field=='WARC-Target-URI:':
        # expected to be registered already; a KeyError here means it is not
        uri=URI.uu[value.rstrip()]
    elif field=='WARC-Truncated:':
        truncs+=1
        uri.trunc=value.rstrip()

# Fetcher log: record the failure message against each URI that failed.
fails=0
for logLine in log:
    failure=FAIL.match(logLine)
    if failure is None:
        continue
    fails+=1
    (failedURI,reason)=failure.groups()
    URI.get(failedURI).fail=reason

print(len(URI.uu),file=sys.stderr)
67 125
68 print("""For %s/%s: 126 print("""For %s/%s:
69 %4s requested 127 %4s requested
70 %4s retrieved 128 %4s retrieved
129 %4s diagnosed
130 %4s failed
71 %4s truncated 131 %4s truncated
72 %4s diagnosed 132 """%(cc,seg,seeds,len(fetches),len(diags),fails,truncs),
73 %4s failed"""%(cc,seg,len(bu),len(fetches),len(trunc),len(diags),len(fails)),
74 file=sys.stderr) 133 file=sys.stderr)
75 134
76 for u in bu: 135 for u in URI.uu.values():
77 sig=0 136 print(u.typeString())
78 sig+=8 if u in fetches else 0
79 sig+=4 if u in diags else 0
80 sig+=2 if u in fails else 0
81 sig+=1 if u in trunc else 0
82 print(format(sig,'04b'))