Mercurial > hg > cc > cirrus_home
annotate bin/track.py @ 187:9805323d9969
add lastmod to cdx lines,
start writing test case
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 23 Sep 2024 16:35:22 +0100 |
parents | fcb390b3ea55 |
children |
rev | line source |
---|---|
76
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/lustre/sw/miniconda3/bin/python3 |
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 '''Track a list of URIs through nutch results''' |
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 # Usage: track.py year-nn segmentid [file] |
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 |
78
846b38f8b204
refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
77
diff
changeset
|
5 import re,sys,glob,gzip,json,urllib.parse |
76
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 |
79
4b8e4e3d60eb
track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
78
diff
changeset
|
# One CDX index line: <reversed,domain>)<path> <field> <JSON properties>
# Raw string so that \) and \{ reach the regex engine as written instead of
# being (invalid) string-literal escapes.
CDX=re.compile(r"(.*)\)(.*) (.*) (\{.*\})$")
81
fcb390b3ea55
improved F handling/logging
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
80
diff
changeset
|
8 FAIL1a=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>Http code=[^,]*)") |
fcb390b3ea55
improved F handling/logging
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
80
diff
changeset
|
9 FAIL1b=re.compile("....................... INFO fetcher\\.FetcherThread - FetcherThread [0-9]* fetch of (?P<u>.*) failed with: (?P<m>[^ ]*):") |
fcb390b3ea55
improved F handling/logging
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
80
diff
changeset
|
10 FAIL2=re.compile("....................... INFO fetcher\\.FetcherThread - (?P<m>Crawl-Delay) for (?P<u>.*) too long") |
fcb390b3ea55
improved F handling/logging
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
80
diff
changeset
|
11 FAIL3=re.compile("....................... INFO fetcher\\.FetcherThread - (?P<m>Denied) by robots.txt: (?P<u>.*)$") |
76
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Command-line handling and input discovery.  Everything that can fail
# because of a missing or wrong argument is inside the try, so the usage
# line is printed for a missing argument as well as for a bad one.
try:
    cc="/beegfs/common_crawl/CC-MAIN-%s"%sys.argv[1]
    seg=sys.argv[2]
    # Optional third argument: a file to read instead of stdin
    if len(sys.argv)==4:
        uuf=open(sys.argv[3])
    else:
        uuf=sys.stdin
    wwfp="%s/%s/cdx/warc/CC*.gz"%(cc,seg)
    ddfp="%s/%s/cdx/crawldiagnostics/CC*.gz"%(cc,seg)
    wwff=glob.glob(wwfp)
    ddff=glob.glob(ddfp)
    # Explicit checks rather than assert, which is stripped under python -O
    if not wwff:
        raise FileNotFoundError(wwfp)
    if not ddff:
        raise FileNotFoundError(ddfp)
    # Scanned further down for WARC-Target-URI: / WARC-Truncated: headers
    wtf=open("%s/%s/warc/warc/truncated.txt"%(cc,seg))
    # Seed list, one URI per line
    buf=open("%s/%s/bu.txt"%(cc,seg))
    # Scanned further down with the FAIL* patterns
    log=open("%s/%s/hadoop.log"%(cc,seg))
except Exception:
    print("Usage: track.py year-nn segmentid [file]",file=sys.stderr)
    raise
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
31 |
78
846b38f8b204
refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
77
diff
changeset
|
# NOTE(review): neither BU nor U is referenced anywhere else in the visible
# part of this script (URI.uu is the table actually used) -- possibly left
# over from before the refactor; confirm before removing.
BU={}
U={}
846b38f8b204
refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
77
diff
changeset
|
34 |
846b38f8b204
refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
77
diff
changeset
|
class URI:
    '''A URI string plus whatever the crawl data says about it.

    Every instance registers itself in the class-wide table uu under its
    URI string, so constructing a second URI for the same string replaces
    the table entry; use URI.get to reuse an existing instance.

    Attributes added from outside the class as processing proceeds include
    status, sources (list of (URI, depth) pairs) and reloc.
    '''
    uu={}     # shared registry: URI string -> URI instance
    depth=0   # class-level default; not otherwise used inside this class

    def __init__(self,s,seed=False):
        '''s is the URI string; seed marks it as one of the requested URIs.'''
        self.s=s
        self.d_props=None # from cdx/crawldiagnostics
        self.w_props=None # from cdx/warc
        self.trunc=self.fail=None
        self._host=self._path=self._scheme=None
        self.seed=seed
        self.uu[s]=self

    def _parse(self):
        '''Split self.s once, caching scheme, host and path.'''
        (self._scheme,self._host,self._path,
         _,_,_)=urllib.parse.urlparse(self.s)

    @property
    def host(self):
        '''Network location part of the URI (parsed on first use).'''
        if self._host is None:
            self._parse()
        return self._host

    @property
    def path(self):
        '''Path part of the URI (parsed on first use).'''
        if self._path is None:
            self._parse()
        return self._path

    @property
    def scheme(self):
        '''Scheme part of the URI (parsed on first use).'''
        if self._scheme is None:
            self._parse()
        return self._scheme

    @classmethod
    def get(cls,s):
        '''Return the registered URI for s, creating (and registering) it
        as a non-seed if there is none yet.'''
        try:
            return cls.uu[s]
        except KeyError:
            return cls(s)

    def _statusChar(self):
        '''First character of self.status, or '?' if no status has been
        assigned (yet) or it is empty.'''
        status=getattr(self,'status',None)
        return status[:1] if status else '?'

    def __repr__(self):
        '''Abbreviated form: <U{typeString}>{status class}[{start}...{end}].

        The status class is omitted for 2xx.  Never raises just because
        status has not been assigned.
        '''
        prefix="%s://%s%s"%(self.scheme,self.host,self.path[:20])
        plen=len(prefix)
        # <= rather than <: when nothing is left beyond the prefix the
        # slice below would be self.s[-0:], i.e. the *whole* string
        suffix=('' if len(self.s)<=plen else
                '...'+self.s[-min(len(self.s)-plen,10):])
        code=self._statusChar()
        return "<U%s>%s[%s%s]"%(self.typeString(),
                                '' if code=='2' else code,
                                prefix,suffix)

    def typeString(self):
        '''Compact summary of what is known about this URI, concatenating:

        s       if it is a seed
        r       if it has cdx/warc properties, otherwise
        d<c>    if it has cdx/crawldiagnostics properties, <c> being the
                first character of its status
        l<n.n>  if it has redirect sources: their depths, ascending,
                dot-separated
        <c>     first character of the truncation reason, if any
        F<c>    if a fetch failure was logged: character 10 of the message
                when it starts with H (the digit after "Http code="),
                otherwise its first character
        '''
        if self.w_props is not None:
            origin='r'
        elif self.d_props is not None:
            origin='d'+self._statusChar()
        else:
            origin=''
        if hasattr(self,'sources'):
            links='l'+'.'.join(str(s[1]) for s in
                               sorted(self.sources,key=lambda x:x[1]))
        else:
            links=''
        if self.fail is None:
            failure=''
        else:
            # slices, not indexing, so a short or empty message cannot raise
            failure='F'+(self.fail[10:11] if self.fail[:1]=='H'
                         else self.fail[:1])
        return "%s%s%s%s%s"%('s' if self.seed else '',
                             origin,
                             links,
                             '' if self.trunc is None else self.trunc[:1],
                             failure)
78
846b38f8b204
refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
77
diff
changeset
|
95 |
79
4b8e4e3d60eb
track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
78
diff
changeset
|
def readCDX(files,where,status=None):
    '''Read CDX index files into URI objects.

    files  -- names of gzip-compressed CDX files, read as UTF-8 text
    where  -- name of the URI attribute that receives the decoded JSON
              properties of each line (this script uses 'w_props' and
              'd_props')
    status -- if not None, assigned as the status of every URI read

    Each line must match CDX: reversed domain, ')', path, a further
    space-delimited field, and a JSON object with at least a "url" key.
    The URI for that url is looked up (or created) via URI.get.

    Returns a dict from URI string to URI object for the entries read.
    If a line cannot be processed, the file name, the number of lines
    successfully processed from that file and the offending line are
    written to stderr and the exception is re-raised; a line that does
    not match CDX raises ValueError.
    '''
    res={}
    # Ref. https://github.com/ikreymer/webarchive-indexing
    for resFileName in files:
        n=0
        c=None  # reset per file so the diagnostic never shows a stale line
        with gzip.open(resFileName,mode='rt',encoding='utf-8') as rf:
            try:
                for c in rf:
                    r=CDX.match(c)
                    if r is None:
                        raise ValueError("not a CDX line: %r"%c)
                    (rdom,path,seg,props)=r.groups()
                    d=json.loads(props)
                    uri=d["url"]
                    u=URI.get(uri)
                    setattr(u,where,d)
                    u.rdomstr=rdom # domain, reverse order, comma separated
                    u.lcpath=path # path, lower-cased, maybe %decoded?
                    u.seg=seg # partial warc identifier?
                    if status is not None:
                        u.status=status
                    res[uri]=u
                    n+=1
            except Exception:
                print(resFileName,n,c,file=sys.stderr)
                raise
    return res
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
123 |
78
846b38f8b204
refactor, change summary print (problem?)
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
77
diff
changeset
|
# Register every line of the seed list as a seed URI, counting them.
seeds=0
for seedLine in buf:
    seeds+=1
    URI(seedLine.rstrip(),True)

print('post-seed',seeds,file=sys.stderr,flush=True)

# cdx/warc entries: properties stored as w_props, status set to "200"
fetches=readCDX(wwff,'w_props',"200")
print('post-fetch',len(URI.uu),file=sys.stderr,flush=True)

# cdx/crawldiagnostics entries: properties stored as d_props
diags=readCDX(ddff,'d_props')
print('post-diag',len(URI.uu),file=sys.stderr,flush=True)

# Stored as a URI's reloc when its redirect could not be followed
BORKED="borked"
4b8e4e3d60eb
track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
78
diff
changeset
|
139 |
80
f494df0d34aa
keep separate antecedants separate, buggy?
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
79
diff
changeset
|
def maybeTrack(u,source=None,depth=0,_seen=None):
    '''Make sure u has a status and, if it is a redirect, follow it.

    u      -- the URI to examine
    source -- the URI that redirected to u, if any; (source,depth) is then
              appended to u.sources (created on first use)
    depth  -- number of redirect hops between the URI the outermost call
              started from and u
    _seen  -- internal: the redirecting URIs already followed during the
              outermost call; callers should not pass it

    If u has no status yet it becomes 'unknown' when u has no
    crawldiagnostics properties, else the "status" property.  When the
    status starts with '3', the "redirect" property is looked up, the
    target URI is stored as u.reloc and tracked in turn, one level deeper.
    A KeyError anywhere in that step sets u.reloc to BORKED instead.

    A URI is followed at most once per outermost call, so a self-redirect
    or a redirect loop ends after the back-pointer closing the loop has
    been recorded, rather than recursing without limit.
    '''
    if _seen is None:
        _seen=set()
    if not hasattr(u,'status'):
        u.status='unknown' if u.d_props is None else u.d_props["status"]
    if source is not None:
        bptr=(source,depth)
        if hasattr(u,'sources'):
            u.sources.append(bptr)
        else:
            u.sources=[bptr]
    if u.status[0]=='3':
        if u in _seen:
            # redirect loop: u.reloc was set the first time round
            return
        _seen.add(u)
        try:
            loc=u.d_props["redirect"]
            r=URI.get(loc)
            u.reloc=r
            maybeTrack(r,source=u,depth=depth+1,_seen=_seen)
        except KeyError:
            u.reloc=BORKED # something went wrong somewhere...
4b8e4e3d60eb
track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
78
diff
changeset
|
157 |
80
f494df0d34aa
keep separate antecedants separate, buggy?
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
79
diff
changeset
|
# Follow redirects from every URI that has a crawldiagnostics entry
for u in diags.values():
    maybeTrack(u)

# Scan the truncation records: each WARC-Target-URI: header selects the URI
# that a following WARC-Truncated: header applies to.
truncs=0
for l in wtf:
    if not l.startswith('WARC-'):
        continue
    (k,rest)=l.split(' ',1)
    value=rest.rstrip()
    if k=='WARC-Target-URI:':
        uri=URI.uu[value] # better be there...
    elif k=='WARC-Truncated:':
        truncs+=1
        uri.trunc=value

# Scan the log: every pattern is tried against every line, and every match
# counts as a failure and records its message on the URI it names.
fails=0
failPatterns=(FAIL1a,FAIL1b,FAIL2,FAIL3)
for l in log:
    for hit in filter(None,(p.match(l) for p in failPatterns)):
        fails+=1
        URI.get(hit.group('u')).fail=hit.group('m')

print('post-fail',len(URI.uu),file=sys.stderr,flush=True)
76
6cf3dc7ff022
starting on tool to assemble as complete as we have info wrt a seed URI
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
183 |
77 | 184 print("""For %s/%s: |
80
f494df0d34aa
keep separate antecedants separate, buggy?
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
79
diff
changeset
|
185 %4s requested s |
f494df0d34aa
keep separate antecedants separate, buggy?
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
79
diff
changeset
|
186 %4s retrieved r |
81
fcb390b3ea55
improved F handling/logging
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
80
diff
changeset
|
187 %4s diagnosed d |
fcb390b3ea55
improved F handling/logging
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
80
diff
changeset
|
188 %4s redirection-location l |
fcb390b3ea55
improved F handling/logging
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
80
diff
changeset
|
189 %4s failed F{j for java Exception, 1-5 for Http code=, C for robot crawl delay |
fcb390b3ea55
improved F handling/logging
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
80
diff
changeset
|
190 D for robot denied} |
fcb390b3ea55
improved F handling/logging
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
80
diff
changeset
|
191 %4s truncated r{d for disconnect, t for timeout} |
fcb390b3ea55
improved F handling/logging
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
80
diff
changeset
|
192 """%(cc,seg,seeds,len(fetches),len(diags), |
fcb390b3ea55
improved F handling/logging
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
80
diff
changeset
|
193 sum(1 for u in URI.uu.values() if hasattr(u,'sources')), |
fcb390b3ea55
improved F handling/logging
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
80
diff
changeset
|
194 fails,truncs), |
77 | 195 file=sys.stderr) |
80
f494df0d34aa
keep separate antecedants separate, buggy?
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
79
diff
changeset
|
196 sys.stderr.flush() |
77 | 197 |
79
4b8e4e3d60eb
track redirects, need to us full crawldiagnostics.warc.gz for "location:" and "Uri:"
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
78
diff
changeset
|
# One type string per known URI goes to stdout, but only when stdout is
# not a terminal (i.e. it has been redirected or piped)
emitDetails=not sys.stdout.isatty()
if emitDetails:
    for tracked in URI.uu.values():
        print(tracked.typeString())