annotate bin/track.py @ 93:4d870a7ec871

support a command to receive each result, remove use of X-Crawler-Content-Length
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 15 Apr 2021 10:59:25 +0000
parents fcb390b3ea55
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
76
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/lustre/sw/miniconda3/bin/python3
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 '''Track a list of URIs through nutch results'''
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 # Usage: track.py year-nn segmentid [file]
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4
78
846b38f8b204 refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 77
diff changeset
5 import re,sys,glob,gzip,json,urllib.parse
76
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6
79
4b8e4e3d60eb track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 78
diff changeset
# One line of a cdx index file.  Groups, in order (names as used by readCDX):
#   rdom  - text before the first-group-closing ')': domain, reverse order,
#           comma separated
#   path  - text between that ')' and the next field
#   seg   - the middle, space-delimited field
#   props - the trailing JSON object
# Raw strings are used so that \) and \{ reach the regex engine as written
# instead of being (invalid) string-literal escapes.
CDX=re.compile(r"(.*)\)(.*) (.*) (\{.*\})$")

# Common start of the fetcher lines looked for in hadoop.log: 23 arbitrary
# characters (room for a 'YYYY-MM-DD hh:mm:ss,mmm' timestamp), the level and
# the logger name.
# NOTE(review): log4j commonly pads the level to 5 characters ('INFO' + two
# spaces) -- confirm the single space here against a real hadoop.log.
_LOGPFX=r"....................... INFO fetcher\.FetcherThread - "

# Each FAIL* pattern captures the URI as group 'u' and a short reason as
# group 'm'.
# fetch failed with an HTTP status: m is 'Http code=NNN'
FAIL1a=re.compile(_LOGPFX+r"FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>Http code=[^,]*)")
# fetch failed with something else: m is the space-free token before the
# colon that follows 'failed with: '
FAIL1b=re.compile(_LOGPFX+r"FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>[^ ]*):")
# crawl delay too long: m is 'Crawl-Delay'
FAIL2=re.compile(_LOGPFX+r"(?P<m>Crawl-Delay) for (?P<u>.*) too long")
# denied by robots.txt: m is 'Denied'
FAIL3=re.compile(_LOGPFX+r"(?P<m>Denied) by robots.txt: (?P<u>.*)$")
76
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Locate and open everything the rest of the script reads.  Any problem
# (missing argument, unreadable file, no index files) prints the usage
# line and then re-raises the original error.
try:
    # The argv accesses are inside the try so that a missing argument also
    # produces the usage message rather than only an IndexError.
    cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
    seg=sys.argv[2]
    # Optional third argument: a file, otherwise stdin.
    # NOTE(review): uuf is not read anywhere in the visible part of this file.
    uuf=open(sys.argv[3]) if len(sys.argv)==4 else sys.stdin
    wwfp="%s/%s/cdx/warc/CC*.gz"%(cc,seg)
    ddfp="%s/%s/cdx/crawldiagnostics/CC*.gz"%(cc,seg)
    wwff=glob.glob(wwfp)   # cdx index files for the warc records
    ddff=glob.glob(ddfp)   # cdx index files for the crawldiagnostics records
    # Explicit checks rather than assert, which is removed under python -O
    if not wwff:
        raise FileNotFoundError(wwfp)
    if not ddff:
        raise FileNotFoundError(ddfp)
    wtf=open("%s/%s/warc/warc/truncated.txt"%(cc,seg))  # WARC-Target-URI:/WARC-Truncated: lines
    buf=open("%s/%s/bu.txt"%(cc,seg))                   # one seed URI per line
    log=open("%s/%s/hadoop.log"%(cc,seg))               # scanned for fetcher failures
except Exception:
    print("Usage: track.py year-nn segmentid [file]",file=sys.stderr)
    raise

# NOTE(review): neither table is referenced in the visible part of this file.
BU={}
U={}
846b38f8b204 refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 77
diff changeset
34
846b38f8b204 refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 77
diff changeset
class URI:
    '''A URI met anywhere in the crawl data, with what has been learned about it.

    Every instance registers itself in the class-wide table `uu` under its
    URI string; `URI.get` looks up, or failing that creates, the instance
    for a string.

    `status`, `sources`, `reloc`, `rdomstr`, `lcpath` and `seg` are not set
    by __init__: other code in this file attaches them later, so the
    methods here allow for `status` and `sources` being absent.
    '''
    uu={}     # URI string -> URI instance, shared by the whole class
    depth=0

    def __init__(self,s,seed=False):
        '''Create the entry for URI string `s` and register it in `uu`,
        replacing any entry already registered for the same string.
        `seed` is True for a URI that was requested (shown as 's').'''
        self.s=s
        self.d_props=None # from cdx/crawldiagnostics
        self.w_props=None # from cdx/warc
        self.trunc=self.fail=None
        # filled in together, on first use of host, path or scheme
        self._host=self._path=self._scheme=None
        self.seed=seed
        self.uu[s]=self

    def _parse(self):
        '''Split self.s once and cache scheme, host and path.'''
        (self._scheme,self._host,self._path,
         _,_,_)=urllib.parse.urlparse(self.s)

    @property
    def host(self):
        '''Network location part of the URI.'''
        if self._host is None:
            self._parse()
        return self._host

    @property
    def path(self):
        '''Path part of the URI.'''
        if self._path is None:
            self._parse()
        return self._path

    @property
    def scheme(self):
        '''Scheme part of the URI.'''
        if self._scheme is None:
            self._parse()
        return self._scheme

    @classmethod
    def get(cls,s):
        '''Return the registered instance for `s`, creating it if needed.'''
        try:
            return cls.uu[s]
        except KeyError:
            return cls(s)

    def _statusChar(self):
        '''First character of the status; 'u' (as in 'unknown') when no
        status has been attached yet, instead of raising AttributeError.'''
        return getattr(self,'status','unknown')[:1]

    def _failCode(self):
        '''One-character code for self.fail: the first digit of the code for
        an 'Http code=NNN' failure, otherwise the first character of the
        message ('' for an empty message).'''
        tag='Http code='
        if self.fail.startswith(tag) and len(self.fail)>len(tag):
            return self.fail[len(tag)]
        return self.fail[:1]

    def __repr__(self):
        '''<U{typeString}>{status char unless '2'}[{abbreviated URI}]

        The URI is shown as scheme://host plus at most 20 characters of
        path, followed by '...' and up to the last 10 characters of what
        was cut, if anything was cut.'''
        prefix="%s://%s%s"%(self.scheme,self.host,self.path[:20])
        cut=len(self.s)-len(prefix)
        # cut<=0 means nothing was left out.  (With cut==0 a slice [-0:]
        # would be the whole string, so that case must not reach it.)
        suffix='...'+self.s[-min(cut,10):] if cut>0 else ''
        sc=self._statusChar()
        return "<U%s>%s[%s%s]"%(self.typeString(),
                                '' if sc=='2' else sc,
                                prefix,suffix)

    def typeString(self):
        '''Compact summary of what is known, concatenated in this order:
          s        seed
          r        retrieved (has w_props), else
          dN       diagnosed (has d_props), N the first status character
          lD.D...  redirection target, D the sorted depths of its sources
          X        first character of the truncation reason
          FX       failed, X as given by _failCode
        '''
        if self.w_props is not None:
            kind='r'
        elif self.d_props is not None:
            kind='d'+self._statusChar()
        else:
            kind=''
        if hasattr(self,'sources'):
            # each source is a (source URI, depth) pair
            links='l'+'.'.join(str(d) for d in
                               sorted(depth for (_,depth) in self.sources))
        else:
            links=''
        return "%s%s%s%s%s"%('s' if self.seed else '',
                             kind,
                             links,
                             '' if self.trunc is None else self.trunc[:1],
                             '' if self.fail is None else 'F'+self._failCode())
78
846b38f8b204 refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 77
diff changeset
95
79
4b8e4e3d60eb track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 78
diff changeset
def readCDX(files,where,status=None):
    '''Read gzipped cdx index files and attach each entry to its URI.

    files  -- names of the gzipped index files to read
    where  -- name of the URI attribute ('w_props' or 'd_props' in this
              file) that receives the decoded JSON properties of the line
    status -- if not None, also stored as the `status` of every URI read

    Returns a dict from URI string to URI for everything read; a later
    line for the same URI overwrites an earlier one.

    On any error the file name, the number of lines done in that file and
    the current line are printed to stderr before the error is re-raised.
    A line that does not match CDX raises ValueError.
    '''
    res={}
    # Ref. https://github.com/ikreymer/webarchive-indexing
    for resFileName in files:
        n=0
        c=None   # reset per file, so a report never shows a line from the previous file
        with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf:
            try:
                for c in rf:
                    r=CDX.match(c)
                    if r is None:
                        raise ValueError("not a cdx index line")
                    (rdom,path,seg,props)=r.groups()
                    d=json.loads(props)
                    uri=d["url"]
                    u=URI.get(uri)
                    setattr(u,where,d)
                    u.rdomstr=rdom # domain, reverse order, comma separated
                    u.lcpath=path # path, lower-cased, maybe %decoded?
                    u.seg=seg # partial warc identifier?
                    if status is not None:
                        u.status=status
                    res[uri]=u
                    n+=1
            except BaseException:
                # BaseException, as the bare except this replaces: the
                # position is reported for KeyboardInterrupt as well
                print(resFileName,n,c,file=sys.stderr)
                raise
    return res
6cf3dc7ff022 starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
123
78
846b38f8b204 refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 77
diff changeset
# Register every seed URI (one per line of bu.txt), then read the two sets
# of cdx index files.  Progress goes to stderr after each stage.
seeds=0
with buf:   # close the seed file once it has been read
    for line in buf:
        s=line.rstrip()
        if not s:
            continue   # a blank line is not a URI
        seeds+=1
        URI(s,True)   # registers itself in URI.uu

print('post-seed',seeds,file=sys.stderr)
sys.stderr.flush()
# Everything indexed under cdx/warc is recorded with status "200"
fetches=readCDX(wwff,'w_props',"200")
print('post-fetch',len(URI.uu),file=sys.stderr)
sys.stderr.flush()
# crawldiagnostics entries get their status later, in maybeTrack
diags=readCDX(ddff,'d_props')
print('post-diag',len(URI.uu),file=sys.stderr)
sys.stderr.flush()
78
846b38f8b204 refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 77
diff changeset
137
79
4b8e4e3d60eb track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 78
diff changeset
# Value of `reloc` for a redirect whose target could not be determined
BORKED="borked"

def maybeTrack(u,source=None,depth=0,_chain=None):
    '''Give `u` a status and follow it if it is a redirect.

    u      -- the URI to look at
    source -- the URI that redirected to `u`, or None if tracking starts here
    depth  -- number of redirect hops from where tracking started
    _chain -- internal: the URIs already being followed on this path

    If `u` has no status yet it gets the one from its d_props, or
    'unknown' if it has none.  If `source` is given, (source,depth) is
    added to u.sources.  If the status starts with '3', u.reloc is set to
    the URI named by the "redirect" property and that URI is tracked in
    turn, one hop deeper; a KeyError raised anywhere in that step sets
    u.reloc to BORKED instead.

    A URI that is already on the current path is not followed again, so a
    redirect loop ends after recording the back-pointer rather than
    recursing until RecursionError.
    '''
    chain=frozenset() if _chain is None else _chain
    if not hasattr(u,'status'):
        u.status='unknown' if u.d_props is None else u.d_props["status"]
    if source is not None:
        bptr=(source,depth)
        if hasattr(u,'sources'):
            u.sources.append(bptr)
        else:
            u.sources=[bptr]
    if u in chain:
        return   # redirect loop: the back-pointer is recorded, stop here
    if u.status.startswith('3'):
        try:
            loc=u.d_props["redirect"]
            r=URI.get(loc)
            u.reloc=r
            maybeTrack(r,source=u,depth=depth+1,_chain=chain|{u})
        except KeyError:
            u.reloc=BORKED # something went wrong somewhere...
4b8e4e3d60eb track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 78
diff changeset
157
80
f494df0d34aa keep separate antecedants separate, buggy?
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 79
diff changeset
# Give every diagnosed URI a status and follow its redirects
for u in diags.values():
    maybeTrack(u)

# truncated.txt: a 'WARC-Truncated:' line applies to the URI named by the
# most recent 'WARC-Target-URI:' line.
truncs=0
uri=None
with wtf:
    for l in wtf:
        if not l.startswith('WARC-'):
            continue
        # partition, not split: a WARC- line without a space is skipped
        # rather than failing to unpack
        (k,_,rest)=l.partition(' ')
        if k=='WARC-Target-URI:':
            uri=URI.uu[rest.rstrip()] # better be there...
        elif k=='WARC-Truncated:':
            if uri is None:
                raise ValueError("WARC-Truncated: before any WARC-Target-URI:")
            truncs+=1
            uri.trunc=rest.rstrip()

# hadoop.log: record the failure reason against each URI the fetcher
# reported a problem for.
fails=0
with log:
    for l in log:
        for p in (FAIL1a,FAIL1b,FAIL2,FAIL3):
            r=p.match(l)
            if r:
                fails+=1
                URI.get(r.group('u')).fail=r.group('m')
                break   # one failure per log line; the first pattern to match wins

print('post-fail',len(URI.uu),file=sys.stderr)
sys.stderr.flush()

# Summary, doubling as the legend for the letters typeString produces
print("""For %s/%s:
%4s requested s
%4s retrieved r
%4s diagnosed d
%4s redirection-location l
%4s failed F{j for java Exception, 1-5 for Http code=, C for robot crawl delay
D for robot denied}
%4s truncated r{d for disconnect, t for timeout}
"""%(cc,seg,seeds,len(fetches),len(diags),
     sum(1 for x in URI.uu.values() if hasattr(x,'sources')),
     fails,truncs),
      file=sys.stderr)
sys.stderr.flush()

# The per-URI listing is only written when stdout is not a terminal
if not sys.stdout.isatty():
    for u in URI.uu.values():
        print(u.typeString())