Mercurial > hg > cc > cirrus_work
annotate bin/merge_date.py @ 115:0b1e6e134aca
robotstxt and crawldiagnostics get free ride,
get rid of DFQ and xq,
big simplification and refactor as a result,
fix bug in date stream eof handling
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 26 Sep 2023 17:42:57 +0100 |
parents | 4a52585a1aac |
children | f52783faf3ee |
rev | line source |
---|---|
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/python3 |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 '''Add timestamps from Last-Modified-dated (ks.tsv) files into |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 that year's index |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 Usage: merge_date.py ksvstream cdx-dir outdir |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 |
113
4a52585a1aac
refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
109
diff
changeset
|
7 ksvstream consists of tab-separated key, CC date, url and Unix timestamp |
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 ''' # ' |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 |
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
10 import sys, io, os, os.path, time, re |
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 from isal import igzip |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 |
100
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
13 |
18446a7eeb9e
rework handling of session key problem
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
94
diff
changeset
|
# Debug verbosity: every leading -d on the command line raises it by one.
DEBUG = 0
# The length guard lets a bare invocation reach the usage message below
# instead of failing with IndexError on sys.argv[1].
while len(sys.argv) > 1 and sys.argv[1] == '-d':
    sys.argv.pop(1)
    DEBUG += 1

if len(sys.argv) < 4:
    sys.exit("Usage: merge_date.py [-d]... ksvstream cdx-dir outdir")

# Name templates for the numbered input (gzipped) and output (plain) index
# files.  The doubled %% survives this first substitution, leaving a
# %0.3d slot that is later filled with the file number.
XPATH = "%s/cdx-00%%0.3d.gz" % sys.argv[2]
NPATH = "%s/cdx-00%%0.3d" % sys.argv[3]

# All patterns are raw bytes literals: \? and \{ are not valid escapes in
# an ordinary literal and draw a SyntaxWarning on recent Pythons.  The
# compiled pattern bytes are the same as before.

# Index properties whose filename lies under crawldiagnostics/ or robotstxt/
RorDPAT = re.compile(rb'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
                     rb'(crawldiagnostics|robotstxt)/')
# A query string carrying one of the listed session-id parameters
SESSION = re.compile(rb'([^?]*\?)((cfid=[^&]*&cftoken|'
                     rb'phpsessid|sid|jsessionid|aspsessionid[a-z]*)'
                     rb'=[^&]*)')
# Same pattern, matched case-insensitively
ISESSION = re.compile(SESSION.pattern, flags=re.I)
# Captures the url value at the start of an index line's JSON properties
URL = re.compile(rb'\{"url": "([^"]*)"')
# Index line whose filename has four path components and then warc/
WARC = re.compile(rb' \{[^}]*"filename": "([^/]*/){4}warc/')

# Above based on this from broken Java code:
# https://github.com/iipc/webarchive-commons/commit/5cfff50a03263208520ca2d260229eefb2aec2f7
#(.+)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
#(.+)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
#(.+)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
#(.+)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
#(.+)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",

#print(sys.argv[3],NPATH,file=sys.stderr)

os.makedirs(sys.argv[3], exist_ok=True)

FN = 0      # number of the index file currently being processed

XCNT = 0    # lines read so far from the current index file
DCNT = 0    # entries consumed so far from the date stream

# Current input index file, and the output file (named NN) that mirrors it
XF = igzip.IGzipFile(filename=XPATH % 0)
NN = NPATH % 0
NF = open(NN, 'wb')
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
50 |
115
0b1e6e134aca
robotstxt and crawldiagnostics get free ride,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
113
diff
changeset
|
def nextLine():
    '''Return the next index line that matches WARC, or None at the end.

    Lines that do not match WARC are copied unchanged to the current
    output file and skipped.  When the current input file runs out, it
    and its output file are closed, the output file's name is printed on
    stdout, and the next numbered input/output pair is opened.  None is
    returned once no further input file exists.
    '''
    global FN, NF, NPATH, NN, XF, XPATH, XCNT, DCNT
    while True:
        line = XF.readline()
        XCNT += 1
        if line == b'':
            # Current input exhausted: finish this pair and announce the
            # completed output file before looking for the next input.
            FN += 1
            XF.close()
            NF.close()
            print(NN, flush=True) # so we can compress it
            time.sleep(0.1) # so it flushes?
            next_input = XPATH % FN
            if not os.path.exists(next_input):
                return None
            XF = igzip.IGzipFile(filename=next_input)
            NN = NPATH % FN
            NF = open(NN, 'wb')
            line = XF.readline()
            XCNT = 1
        if not WARC.search(line):
            # Not a warc/ entry: pass it straight through and keep reading
            NF.write(line)
            if DEBUG:
                sys.stderr.write("out_rc\n")
            continue
        return line
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
77 |
92
e56a7aad9ce9
attempt at reordering if necessary
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
91
diff
changeset
|
78 |
113
4a52585a1aac
refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
109
diff
changeset
|
def nextDate(df,dn):
    '''Read one entry from the date stream df.

    dn is only used to label the debug trace.  Returns the four
    tab-separated fields of the next line as bytes (the last one still
    carries the line ending), or ("", "", "", 0) once df is exhausted.
    Raises ValueError if a line does not have exactly four fields.
    '''
    global DEBUG, DCNT, XCNT
    record = df.readline()
    if record == b'':
        # End of the date stream: hand back falsy placeholders
        return "", "", "", 0
    if DEBUG:
        sys.stderr.write("dl%s: %s\n"%(dn,record))
    key, cc_date, url, stamp = record.split(b'\t')
    DCNT += 1
    return key, cc_date, url, stamp
113
4a52585a1aac
refactor datestream reading,
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
109
diff
changeset
|
90 |
# Merge the date stream into the index: walk both in step, adding a
# "lastmod" property to every index line that has a matching date entry
# and copying every other line through unchanged.
with open(sys.argv[1], 'rb') as df:
    DCNT = 0

    dkey, ddate, durl, dtime = nextDate(df, 1)

    # nextLine() returns None itself (not a sequence) once every input
    # file is exhausted, so the result is tested directly.
    while (xl := nextLine()) is not None:
        xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
        m = URL.match(xprops)
        if m:
            xurl = m[1]
        else:
            raise ValueError("No url in %s"%xprops)
        if DEBUG:
            sys.stderr.write("xl: %s\n"%(' '.join(xp.decode('ascii')
                                                  for xp in (xkey, xdate, xurl))))
        if dkey==xkey and ddate==xdate and durl==xurl:
            # Got it
            NF.write(xkey)
            NF.write(b' ')
            NF.write(xdate)
            NF.write(b' ')
            # Drop the last two bytes of the properties (presumably the
            # closing brace and newline -- TODO confirm) so the new
            # property can be added before they are written back.
            NF.write(xprops[:-2])
            # dtime[:-3] drops the last three bytes of the timestamp
            # field (presumably a trailing ".0" and newline -- TODO confirm)
            NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
            if DEBUG:
                sys.stderr.write("out_t: %s"%(' '.join(xp.decode('ascii')
                                                   for xp in (xkey, xdate, xurl))))
                sys.stderr.write(" %d\n"%int(dtime[:-3]))

            dkey, ddate, durl, dtime = nextDate(df, 2)
            continue
        else:
            # dkey is falsy once the date stream has run out
            if dkey and xkey.decode('ascii')>(dkey.decode('ascii')):
                # we've missed something, disaster looms
                print("Fail2:"
                      " xkey: %s\n"
                      " dkey: %s\n"
                      " xdate: %s\n"
                      " ddate: %s\n"
                      " xurl: %s\n"
                      " durl: %s\n"
                      "FN: %s XCNT: %s DCNT: %s\n"
                      "xl: %s"%(xkey, dkey, xdate, ddate,
                                xurl, durl,
                                FN, XCNT, DCNT, xl),
                      file=sys.stderr)
                # try to force recovery
                # NOTE(review): this continue skips the write below, so the
                # current index line is not copied to the output -- confirm
                # that is intended.
                dkey, ddate, durl, dtime = nextDate(df, 3)
                continue
            # else fall through to write
            NF.write(xl)
            if DEBUG:
                sys.stderr.write("out_nl\n")