annotate bin/track.py @ 80:f494df0d34aa

keep separate antecedents separate, buggy?
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 08 May 2020 19:52:36 +0100
parents 4b8e4e3d60eb
children fcb390b3ea55
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
76
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/lustre/sw/miniconda3/bin/python3
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 '''Track a list of URIs through nutch results'''
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 # Usage: track.py year-nn segmentid [file]
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4
78
846b38f8b204 refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 77
diff changeset
5 import re,sys,glob,gzip,json,urllib.parse
76
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6
79
4b8e4e3d60eb track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 78
diff changeset
# Both patterns are raw strings: regex escapes such as \) and \{ are not valid
# escapes in an ordinary string literal and trigger warnings there.

# One line of a CDX index file.  Groups: (text before a ")", text after it up
# to a space, a space-delimited middle field, trailing JSON object).
CDX = re.compile(r"(.*)\)(.*) (.*) (\{.*\})$")

# A fetch-failure line from the log.  The leading dots skip a fixed-width
# prefix (presumably the log timestamp -- confirm against a real log).
# Groups: (URI whose fetch failed, first two words of the reason).
FAIL = re.compile(r"....................... INFO fetcher\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)")
76
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Work out where the crawl segment lives and open every input the later
# passes read.  Anything that goes wrong here -- too few arguments, a missing
# file, no index files -- prints the usage line before the error propagates.
try:
    # Reading argv inside the try means a missing argument also gets the
    # usage line instead of a bare IndexError.
    cc = "/beegfs/common_crawl/CC-MAIN-%s" % sys.argv[1]
    seg = sys.argv[2]
    if len(sys.argv) == 4:
        uuf = open(sys.argv[3])
    else:
        uuf = sys.stdin
    wwfp = "%s/%s/cdx/warc/CC*.gz" % (cc, seg)
    ddfp = "%s/%s/cdx/crawldiagnostics/CC*.gz" % (cc, seg)
    wwff = glob.glob(wwfp)
    ddff = glob.glob(ddfp)
    wtf = open("%s/%s/warc/warc/truncated.txt" % (cc, seg))  # WARC header lines
    buf = open("%s/%s/bu.txt" % (cc, seg))                   # seed URIs, one per line
    log = open("%s/%s/hadoop.log" % (cc, seg))               # fetcher log
    # Explicit checks rather than assert, which "python -O" strips.
    if not wwff:
        raise FileNotFoundError(wwfp)
    if not ddff:
        raise FileNotFoundError(ddfp)
except Exception:
    print("Usage: track.py year-nn segmentid [file]", file=sys.stderr)
    raise

BU = {}
U = {}
846b38f8b204 refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 77
diff changeset
31
846b38f8b204 refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 77
diff changeset
class URI:
    '''A URI met in any of the inputs, with what has been learned about it.

    Every instance registers itself in the class-level table ``uu`` under
    its URI string, so constructing a second instance for the same string
    replaces the first in the table.  ``URI.get`` looks an instance up and
    only creates one when the string has not been seen.

    Code elsewhere in this file attaches further attributes after
    construction (``status``, ``sources``); the methods here cope with
    either being absent.
    '''
    uu = {}     # shared registry: URI string -> URI instance
    depth = 0

    def __init__(self, s, seed=False):
        '''s is the URI string; seed marks it as one of the requested URIs.'''
        self.s = s
        self.d_props = None  # from cdx/crawldiagnostics
        self.w_props = None  # from cdx/warc
        self.trunc = self.fail = None
        self._host = self._path = self._scheme = None
        self.seed = seed
        self.uu[s] = self

    def _parse(self):
        '''Split self.s once and cache its scheme, host and path.'''
        (self._scheme, self._host, self._path,
         _, _, _) = urllib.parse.urlparse(self.s)

    @property
    def host(self):
        '''Network-location part of the URI (parsed on first use).'''
        if self._host is None:
            self._parse()
        return self._host

    @property
    def path(self):
        '''Path part of the URI (parsed on first use).'''
        if self._path is None:
            self._parse()
        return self._path

    @property
    def scheme(self):
        '''Scheme part of the URI (parsed on first use).'''
        if self._scheme is None:
            self._parse()
        return self._scheme

    @classmethod
    def get(cls, s):
        '''Return the registered URI for string s, creating it if needed.'''
        try:
            return cls.uu[s]
        except KeyError:
            return cls(s)

    def _status_class(self):
        '''First character of the status, or 'u' when no status is set.'''
        return getattr(self, 'status', 'unknown')[:1]

    def __repr__(self):
        '''Abbreviated form: type flags, status class, then a shortened URI.

        The URI is shown as scheme://host plus at most the first 20
        characters of the path; if that leaves anything out, "..." and up
        to the last 10 characters of the URI follow.  A status class of
        '2' is omitted; a URI with no status yet shows 'u'.
        '''
        prefix = "%s://%s%s" % (self.scheme, self.host, self.path[:20])
        omitted = len(self.s) - len(prefix)
        # Nothing omitted must give no suffix: a slice starting at -0 would
        # return the whole string, not the empty string.
        suffix = '...' + self.s[-min(omitted, 10):] if omitted > 0 else ''
        status = self._status_class()
        return "<U%s>%s[%s%s]" % (self.typeString(),
                                   '' if status == '2' else status,
                                   prefix, suffix)

    def typeString(self):
        '''Compact flags describing what is known about this URI.

        In order: 's' for a seed; then 'r' if a cdx/warc entry exists,
        else (when a crawldiagnostics entry exists) either 'l' plus the
        dot-joined, sorted depths of its recorded sources or 'd' plus the
        status class; then the first character of the truncation reason;
        then 'F' if a fetch failure was logged.
        '''
        if self.w_props is not None:
            kind = 'r'
        elif self.d_props is None:
            kind = ''
        elif hasattr(self, 'sources'):
            depths = sorted(bptr[1] for bptr in self.sources)
            kind = 'l' + '.'.join(str(d) for d in depths)
        else:
            kind = 'd' + self._status_class()
        return "%s%s%s%s" % ('s' if self.seed else '',
                             kind,
                             '' if self.trunc is None else self.trunc[:1],
                             '' if self.fail is None else 'F')
78
846b38f8b204 refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 77
diff changeset
90
79
4b8e4e3d60eb track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 78
diff changeset
def readCDX(files,where,status=None):
    '''Load gzip-compressed CDX index files into the URI table.

    files  -- iterable of CDX file names, each opened with gzip as UTF-8 text
    where  -- name of the URI attribute that receives each line's parsed
              JSON properties
    status -- if not None, stored as the status of every URI read

    Each line's URI (the "url" property) is looked up or created via
    URI.get and annotated with the line's fields.  Returns a dict mapping
    URI string to URI object for everything read; a URI appearing more than
    once keeps the values from its last line.

    Raises ValueError for a line that does not match the CDX pattern.  Any
    failure while processing a file is first reported on stderr as file
    name, number of lines already processed, and the current line, then
    re-raised.
    '''
    res = {}
    # Ref. https://github.com/ikreymer/webarchive-indexing
    for resFileName in files:
        with gzip.open(resFileName, mode='rt', encoding='utf-8') as rf:
            n = 0
            c = None  # reset per file so the report never shows a stale line
            try:
                for c in rf:
                    r = CDX.match(c)
                    if r is None:
                        raise ValueError("not a CDX line: %r" % c)
                    (rdom, path, seg, props) = r.groups()
                    d = json.loads(props)
                    uri = d["url"]
                    u = URI.get(uri)
                    setattr(u, where, d)
                    u.rdomstr = rdom  # domain, reverse order, comma separated
                    u.lcpath = path   # path, lower-cased, maybe %decoded?
                    u.seg = seg       # partial warc identifier?
                    if status is not None:
                        u.status = status
                    res[uri] = u
                    n += 1
            except Exception:
                print(resFileName, n, c, file=sys.stderr)
                raise
    return res
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
118
78
846b38f8b204 refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 77
diff changeset
# Register every line of the seed file as a seed URI, counting as we go.
seeds = 0
for l in buf:
    URI(l.rstrip(), True)
    seeds += 1
print('post-seed', seeds, file=sys.stderr, flush=True)

# Successful fetches come from the cdx/warc index and are all given status 200.
fetches = readCDX(wwff, 'w_props', "200")
print('post-fetch', len(URI.uu), file=sys.stderr, flush=True)

# Everything else comes from the cdx/crawldiagnostics index; status is
# filled in later from each entry's own properties.
diags = readCDX(ddff, 'd_props')
print('post-diag', len(URI.uu), file=sys.stderr, flush=True)
78
846b38f8b204 refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 77
diff changeset
132
79
4b8e4e3d60eb track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 78
diff changeset
BORKED = "borked"  # reloc marker used when following a 3xx entry hits a KeyError

def maybeTrack(u, source=None, depth=0, _chain=None):
    '''Record how u was reached and follow its redirect, if it has one.

    u      -- URI object to examine
    source -- the URI whose redirect led to u, or None for a starting point
    depth  -- nesting level of this call; a back-pointer is recorded as
              (source, depth+1)
    _chain -- internal: ids of the URIs already on the current redirect
              chain, used to stop at redirect loops

    If u has no status yet it gets d_props["status"], or 'unknown' when it
    has no crawldiagnostics entry.  When source is given, a back-pointer is
    appended to u.sources (created on first use).  For a 3xx status the
    target named by d_props["redirect"] is stored in u.reloc and tracked in
    turn; a KeyError anywhere in that step sets u.reloc to BORKED instead.
    '''
    if not hasattr(u, 'status'):
        u.status = 'unknown' if u.d_props is None else u.d_props["status"]
    if source is not None:
        bptr = (source, depth + 1)
        if hasattr(u, 'sources'):
            u.sources.append(bptr)
        else:
            u.sources = [bptr]
    if _chain is None:
        _chain = frozenset()
    if id(u) in _chain:
        # Redirect loop (A -> B -> A, or A -> A): the back-pointer above is
        # kept, but following further would recurse without end.
        return
    if u.status.startswith('3'):
        try:
            loc = u.d_props["redirect"]
            r = URI.get(loc)
            u.reloc = r
            maybeTrack(r, source=u, depth=depth + 1, _chain=_chain | {id(u)})
        except KeyError:
            u.reloc = BORKED  # something went wrong somewhere...
4b8e4e3d60eb track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 78
diff changeset
152
80
f494df0d34aa keep separate antecedants separate, buggy?
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 79
diff changeset
# Follow redirects starting from every URI that has a crawldiagnostics entry.
for u in diags.values():
    maybeTrack(u)

# Scan the truncation listing.  Only lines starting with "WARC-" matter: a
# WARC-Target-URI line selects the URI that a following WARC-Truncated line
# applies to.
truncs = 0
with wtf:  # close the listing once it has been consumed
    for l in wtf:
        if not l.startswith('WARC-'):
            continue
        # partition, unlike a two-way split, tolerates a header line with
        # no space; such a line then simply matches neither key below.
        k, _, rest = l.partition(' ')
        if k == 'WARC-Target-URI:':
            uri = URI.uu[rest.rstrip()]  # better be there...
        elif k == 'WARC-Truncated:':
            truncs += 1
            uri.trunc = rest.rstrip()
76
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
165
78
846b38f8b204 refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 77
diff changeset
# Scan the fetcher log for failure lines and attach the reason (the two
# words captured by FAIL) to the URI concerned, creating the URI if it has
# not been seen before.
fails = 0
with log:  # close the log once it has been consumed
    for l in log:
        r = FAIL.match(l)
        if r:
            fails += 1
            (u, m2) = r.groups()
            URI.get(u).fail = m2

print('post-fail', len(URI.uu), file=sys.stderr, flush=True)
76
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
176
77
bfff01c139ea bare framework working
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 76
diff changeset
# Summary of the counts gathered above, on stderr.  The letters after each
# label are the typeString flags that correspond to it.
# NOTE(review): the listing this was reviewed from does not preserve
# whitespace inside the template, so the spacing below is as shown there --
# confirm against the repository copy.
counts = (cc, seg, seeds, len(fetches), len(diags), fails, truncs)
print("""For %s/%s:
%4s requested s
%4s retrieved r
%4s diagnosed d/l
%4s failed F
%4s truncated rd/rt
""" % counts,
      file=sys.stderr, flush=True)

# Per-URI flags go to stdout only when it is redirected or piped.
if not sys.stdout.isatty():
    for u in URI.uu.values():
        print(u.typeString())