Mercurial > hg > cc > cirrus_home
annotate bin/track.py @ 80:f494df0d34aa
keep separate antecedants separate, buggy?
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 08 May 2020 19:52:36 +0100 |
parents | 4b8e4e3d60eb |
children | fcb390b3ea55 |
rev | line source |
---|---|
76
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/lustre/sw/miniconda3/bin/python3 |
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 '''Track a list of URIs through nutch results''' |
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 # Usage: track.py year-nn segmentid [file] |
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 |
78
846b38f8b204
refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
77
diff
changeset
|
5 import re,sys,glob,gzip,json,urllib.parse |
76
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 |
79
4b8e4e3d60eb
track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
78
diff
changeset
|
# Raw strings: the backslash escapes below are meant for the regex engine,
# not for Python's string-literal parser.
# One line of a CDX index file; readCDX unpacks the four groups as
# (rdom, path, seg, props), the last being a JSON object.
CDX=re.compile(r"(.*)\)(.*) (.*) (\{.*\})$")
# A FetcherThread failure line from the log; the groups are the URI and the
# first two space-separated words following "failed with:".
FAIL=re.compile(r"....................... INFO fetcher\.FetcherThread - FetcherThread [0-9]* fetch of (.*) failed with: ([^ ]* [^ ]*)")
try:
    # The arguments are read inside the try so that a missing argument
    # also produces the usage message below.
    cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
    seg=sys.argv[2]
    if len(sys.argv)==4:
        uuf=open(sys.argv[3])
    else:
        uuf=sys.stdin
    wwfp="%s/%s/cdx/warc/CC*.gz"%(cc,seg)
    ddfp="%s/%s/cdx/crawldiagnostics/CC*.gz"%(cc,seg)
    wwff=glob.glob(wwfp)
    ddff=glob.glob(ddfp)
    # These stay open: they are consumed by the top-level code further down.
    wtf=open("%s/%s/warc/warc/truncated.txt"%(cc,seg))
    buf=open("%s/%s/bu.txt"%(cc,seg))
    log=open("%s/%s/hadoop.log"%(cc,seg))
    # An empty glob means the crawl/segment arguments point nowhere useful;
    # the failing pattern is reported as the assertion message.
    assert len(wwff)!=0,wwfp
    assert len(ddff)!=0,ddfp
except Exception:
    print("Usage: track.py year-nn segmentid [file]",file=sys.stderr)
    raise

BU={}
U={}
846b38f8b204
refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
77
diff
changeset
|
31 |
846b38f8b204
refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
77
diff
changeset
|
class URI:
    '''One URI string plus whatever the crawl data has told us about it.

    Every instance registers itself in the class-level table ``uu`` under
    its string form (a later instance for the same string replaces the
    earlier entry).  ``URI.get`` returns the registered instance for a
    string, creating one if there is none.
    '''
    uu={}    # shared registry: URI string -> URI instance
    depth=0  # class-level default; nothing inside this class reads it

    def __init__(self,s,seed=False):
        '''Record the URI string s, note whether it is a seed, and
        register the new instance in ``uu``.'''
        self.s=s
        self.d_props=None # from cdx/crawldiagnostics
        self.w_props=None # from cdx/warc
        self.trunc=self.fail=None
        self._host=self._path=self._scheme=None
        self.seed=seed
        self.uu[s]=self

    def _parse(self):
        '''Split self.s with urlparse and cache scheme, host and path.'''
        (self._scheme,self._host,self._path,
         _,_,_)=urllib.parse.urlparse(self.s)

    @property
    def host(self):
        '''Network-location part of the URI (parsed on first use).'''
        if self._host is None:
            self._parse()
        return self._host

    @property
    def path(self):
        '''Path part of the URI (parsed on first use).'''
        if self._path is None:
            self._parse()
        return self._path

    @property
    def scheme(self):
        '''Scheme part of the URI (parsed on first use).'''
        if self._scheme is None:
            self._parse()
        return self._scheme

    @classmethod
    def get(cls,s):
        '''Return the registered URI for the string s, creating (and
        thereby registering) a non-seed one if there is none yet.'''
        try:
            return cls.uu[s]
        except KeyError:
            return cls(s)

    def _statusChar(self):
        '''First character of the status, without assuming that a
        ``status`` attribute has been assigned yet: fall back on the
        crawldiagnostics properties, and on 'unknown' failing that.'''
        status=getattr(self,'status',None)
        if status is None:
            status=('unknown' if self.d_props is None
                    else self.d_props.get("status",'unknown'))
        return status[:1]

    def __repr__(self):
        '''Abbreviated form: type flags, first status character (omitted
        when it is '2'), then scheme://host plus at most 20 characters of
        path, then '...' and at most the last 10 characters of whatever
        the prefix did not cover.'''
        prefix="%s://%s%s"%(self.scheme,self.host,self.path[:20])
        uncovered=len(self.s)-len(prefix)
        # Nothing to add when the prefix already covers the whole string.
        # This must include uncovered==0: a slice starting at -0 would
        # yield the entire string, not an empty one.
        if uncovered<=0:
            suffix=''
        else:
            suffix='...'+self.s[-min(uncovered,10):]
        code=self._statusChar()
        return "<U%s>%s[%s%s]"%(self.typeString(),
                                '' if code=='2' else code,
                                prefix,suffix)

    def typeString(self):
        '''Compact flag string: 's' for a seed; then 'r' if there are
        cdx/warc properties, else for a URI with crawldiagnostics
        properties either 'l' plus the dot-joined sorted second members
        of ``sources`` or, without sources, 'd' plus the first status
        character; then the first character of ``trunc`` if set; then
        'F' if ``fail`` is set.'''
        if self.w_props is not None:
            kind='r'
        elif self.d_props is None:
            kind=''
        elif hasattr(self,'sources'):
            kind='l'+'.'.join(str(s[1]) for s in
                              sorted(self.sources,key=lambda x:x[1]))
        else:
            kind='d'+self._statusChar()
        return "%s%s%s%s"%('s' if self.seed else '',
                           kind,
                           '' if self.trunc is None else self.trunc[:1],
                           '' if self.fail is None else 'F')
78
846b38f8b204
refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
77
diff
changeset
|
90 |
79
4b8e4e3d60eb
track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
78
diff
changeset
|
def readCDX(files,where,status=None):
    '''Read gzipped CDX index files and attach their contents to URIs.

    files  -- names of gzip-compressed, UTF-8 CDX files
    where  -- name of the URI attribute that receives the parsed JSON
              properties of each line (callers in this file pass
              'w_props' or 'd_props')
    status -- if not None, assigned to the ``status`` attribute of every
              URI read

    Returns a dict from the "url" property of each line to its URI.

    Raises ValueError for a line that does not match CDX; any failure is
    reported on stderr (file name, count of lines completed in that file,
    current line) and then re-raised.
    '''
    c=None
    res={}
    # Ref. https://github.com/ikreymer/webarchive-indexing
    for resFileName in files:
        with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf:
            n=0 # lines of this file fully processed so far
            try:
                for c in rf:
                    r=CDX.match(c)
                    if r is None:
                        # Say what is wrong, rather than failing with an
                        # AttributeError on None
                        raise ValueError("not a CDX line")
                    (rdom,path,segField,props)=r.groups()
                    d=json.loads(props)
                    uri=d["url"]
                    u=URI.get(uri)
                    setattr(u,where,d)
                    u.rdomstr=rdom # domain, reverse order, comma separated
                    u.lcpath=path # path, lower-cased, maybe %decoded?
                    u.seg=segField # partial warc identifier?
                    if status is not None:
                        u.status=status
                    res[uri]=u
                    n+=1
            except Exception:
                print(resFileName,n,c,file=sys.stderr)
                raise
    return res
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
118 |
78
846b38f8b204
refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
77
diff
changeset
|
# Register every line of the seed file as a seed URI, counting as we go
seeds=0
for seeds,l in enumerate(buf,1):
    u=URI(l.rstrip(),True)
print('post-seed',seeds,file=sys.stderr,flush=True)

# Successful fetches come from the cdx/warc index and are given status "200"
fetches=readCDX(wwff,'w_props',"200")
print('post-fetch',len(URI.uu),file=sys.stderr,flush=True)

# Everything else comes from the cdx/crawldiagnostics index
diags=readCDX(ddff,'d_props')
print('post-diag',len(URI.uu),file=sys.stderr,flush=True)

# Stored as ``reloc`` on a redirecting URI whose target cannot be found
BORKED="borked"
4b8e4e3d60eb
track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
78
diff
changeset
|
134 |
80
f494df0d34aa
keep separate antecedants separate, buggy?
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
79
diff
changeset
|
def maybeTrack(u,source=None,depth=0,_chain=None):
    '''Give u a status if it lacks one, record where it was reached from,
    and follow its redirect if its status starts with '3'.

    u      -- the URI to process
    source -- the URI whose redirect led here, or None for a starting point
    depth  -- depth of the caller; (source,depth+1) is appended to
              u.sources when source is given
    _chain -- internal: the URIs already being followed on the current
              redirect chain, used to stop at a redirect cycle

    Sets u.reloc to the target URI, or to BORKED when the crawldiagnostics
    properties carry no "redirect" entry.
    '''
    if not hasattr(u,'status'):
        # .get, not []: a target with no "status" entry is simply unknown
        u.status=('unknown' if u.d_props is None
                  else u.d_props.get("status",'unknown'))
    if source is not None:
        bptr=(source,depth+1)
        if hasattr(u,'sources'):
            u.sources.append(bptr)
        else:
            u.sources=[bptr]
    if _chain is None:
        _chain=()
    # A redirect cycle (A -> B -> A) would otherwise recurse without bound:
    # the back-pointer above has been recorded, but do not go round again.
    if any(u is seen for seen in _chain):
        return
    if u.status[0]!='3':
        return
    # Only the lookup is guarded, so that a KeyError raised further down
    # the chain cannot overwrite a good reloc on this URI.
    try:
        loc=u.d_props["redirect"]
    except KeyError:
        u.reloc=BORKED # something went wrong somewhere...
        return
    r=URI.get(loc)
    u.reloc=r
    maybeTrack(r,source=u,depth=depth+1,_chain=_chain+(u,))
4b8e4e3d60eb
track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
78
diff
changeset
|
152 |
80
f494df0d34aa
keep separate antecedants separate, buggy?
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
79
diff
changeset
|
# Follow redirects outward from every URI that has a crawldiagnostics entry
for u in diags.values():
    maybeTrack(u)

# Scan the truncation list: a WARC-Target-URI: line selects the URI that a
# following WARC-Truncated: line applies to
truncs=0
for l in wtf:
    if not l.startswith('WARC-'):
        continue
    k,rest=l.split(' ',1)
    if k=='WARC-Target-URI:':
        uri=URI.uu[rest.rstrip()] # better be there...
    elif k=='WARC-Truncated:':
        truncs+=1
        uri.trunc=rest.rstrip()

# Scan the log for fetch failures and note the reason on the URI concerned
fails=0
for l in log:
    r=FAIL.match(l)
    if r is None:
        continue
    fails+=1
    u,m2=r.groups()
    URI.get(u).fail=m2

print('post-fail',len(URI.uu),file=sys.stderr,flush=True)

# Summary counts, each labelled with the typeString letters they relate to
summary="""For %s/%s:
%4s requested s
%4s retrieved r
%4s diagnosed d/l
%4s failed F
%4s truncated rd/rt
"""%(cc,seg,seeds,len(fetches),len(diags),fails,truncs)
print(summary,file=sys.stderr,flush=True)

# Per-URI type strings go to stdout only when it is redirected or piped
if not sys.stdout.isatty():
    for u in URI.uu.values():
        print(u.typeString())