Mercurial > hg > cc > cirrus_work
annotate bin/merge_date.py @ 91:460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 05 Sep 2023 17:32:46 +0100 |
parents | c1a70532444c |
children | e56a7aad9ce9 |
rev | line source |
---|---|
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/python3 |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 '''Add timestamps from Last-Modified-dated (ks.tsv) files into |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 that year's index |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 Usage: merge_date.py ksvstream cdx-dir outdir |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 ksvstream consists of tab-separated key, CC date and Unix timestamp |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 ''' # ' |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 |
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
10 import sys, io, os, os.path, time, re |
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 from isal import igzip |
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 |
91
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
# Module-level setup: command line, file-name templates, patterns and the
# first input/output index files.

# An optional leading '-d' turns on the diagnostics in the merge loop.  It is
# removed from sys.argv so the positional arguments keep their usual indices.
DEBUG = len(sys.argv) > 1 and sys.argv[1] == '-d'
if DEBUG:
    sys.argv.pop(1)

# Fail with a usage message rather than an IndexError when arguments are missing
if len(sys.argv) < 4:
    sys.exit("Usage: merge_date.py [-d] ksvstream cdx-dir outdir")

# Templates for the numbered index files: gzipped input in cdx-dir,
# uncompressed output in outdir.  The doubled %% leaves a %0.3d slot which
# is filled in later with the file number.
XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2]
NPATH = "%s/cdx-00%%0.3d"%sys.argv[3]

# Matches index lines whose filename lies in a crawldiagnostics or robotstxt
# directory of a segment
RorDPAT = re.compile(rb'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/'
                     rb'(crawldiagnostics|robotstxt)/')
# Matches a key whose query part starts with a session-id parameter.
# Group 1 is everything up to and including the '?', group 2 is the
# session parameter itself.  Raw literals are used so that \? reaches the
# regex engine without depending on Python passing an unrecognised string
# escape through unchanged.
SESSION = re.compile(rb'([^?]*\?)((cfid=[^&]*&cftoken|'
                     rb'sid|jsessionid|aspsessionid[a-z]*)'
                     rb'=[^&]*)')

# Above based on this from fixed Java code:
#(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2),
#(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2),
#(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2),
#(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2),
#(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$",

#print(sys.argv[3],NPATH,file=sys.stderr)

os.makedirs(sys.argv[3], exist_ok=True)

FN = 0    # number of the index file currently being processed

XCNT = 0  # lines read so far from the current input index file
dcnt = 0  # lines read so far from the ks stream

# Open the first input index file and its output counterpart.  NN keeps the
# name of the current output file so it can be reported once it is complete.
XF = igzip.IGzipFile(filename=XPATH%0)
NF = open(NN:=(NPATH%0),'wb')
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
46 |
def nextLine():
    '''Return the next index line that is not a robots.txt or
    crawl-diagnostics entry, moving on to the next index file whenever
    the current one has run out.

    When an input file is exhausted, it and the current output file are
    closed and the output file's name is printed on stdout.  If no further
    input file exists None is returned, otherwise the next input/output
    pair is opened and reading continues.

    Returns the line as bytes, or None when there are no more index files.
    '''
    global FN, NF, NN, XF, XCNT
    while True:
        xl = XF.readline()
        XCNT += 1
        if xl == b'':
            # need to move to next index file
            FN += 1
            XF.close()
            NF.close()
            print(NN, flush=True) # so we can compress it
            time.sleep(0.1) # so it flushes?
            XN = XPATH%FN
            if not os.path.exists(XN):
                return None
            XF = igzip.IGzipFile(filename=XN)
            NF = open((NN:=NPATH%FN), 'wb')
            # Go round again so the first read from the new file gets the
            # same end-of-file check as every other read: an empty index
            # file is then skipped instead of yielding b'' to the caller.
            # XCNT becomes 1 once that first line has been read.
            XCNT = 0
            continue
        if RorDPAT.search(xl):
            #print(xl,file=sys.stderr)
            continue
        return xl
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
71 |
460f0599e8cd
mostly working, but need to reorder in case of cfid and friends
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
90
diff
changeset
|
def keys(key):
    '''Work around the failure of the 2019-35-vintage Java fixup to detect
    session ids that come first in the parameter part of a key.

    key is an index key as bytes.  The result is a pair:
      (key, None) if SESSION does not match, i.e. the parameter part does
        not start with a session id;
      (prefix without its final '?', None) if removing the session
        parameter leaves nothing after the '?';
      (prefix, key with the session parameter removed) otherwise,
    where prefix is everything up to and including the '?'.
    '''
    found = SESSION.match(key)
    if not found:
        return key, None
    head = found[1]
    # extent of the session parameter within key
    start, end = found.span(2)
    stripped = key[:start] + key[end:]
    if stripped == head:
        # the session id was the only parameter
        return head[:-1], None
    return head, stripped
89
a62580816f1c
merge a stream of ks files with a set of cdx files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
85 |
# Main merge: walk the index lines and the ks stream in step.  An index line
# that matches the current ks entry is rewritten with a "lastmod" property,
# every other index line is copied through unchanged.
with open(sys.argv[1], 'rb') as df:
    dl = df.readline()
    if dl == b'':
        # empty ks stream: nothing to merge, the index is just copied
        dkey = ddate = dtime = None
    else:
        dcnt += 1
        dkey, ddate, dtime = dl.split(b'\t')

    while (xl:=nextLine()) is not None:
        xkey, xdate, xprops = xl.split(b' ', maxsplit=2)
        xkey1, xkey2 = keys(xkey)
        # dkey is None once the ks stream is exhausted: no line can match
        if (dkey is None or
            ddate != xdate or
            not dkey.startswith(xkey1) or
            (xkey2 is not None and dkey!=xkey2)):
            if (DEBUG and dkey is not None and
                xkey.decode('ascii')>(dkey.decode('ascii'))):
                # The index has gone past the pending ks key without a match
                print("Fail: xkey: %s\n"
                      " dkey: %s\n"
                      " xdate: %s\n"
                      " ddate: %s\n"
                      "k1, k2: |%s|%s|\n"
                      "FN: %s\n"
                      "xl: %s"%(xkey, dkey, xdate, ddate, xkey1, xkey2, FN, xl),
                      file=sys.stderr)
                raise ValueError()
            NF.write(xl)
            continue
        NF.write(xkey)
        NF.write(b' ')
        NF.write(xdate)
        NF.write(b' ')
        # Drop the last two bytes of the properties so the new property
        # can be appended before they are written again
        NF.write(xprops[:-2])
        # dtime without its final three bytes is written as the integer
        # lastmod value
        NF.write(b', "lastmod": "%d"}\n'%int(dtime[:-3]))
        dl = df.readline()
        # df is opened in binary mode, so end of file is b'', not ''
        if dl == b'':
            # write out the last of the last index file, if any
            dkey = ddate = None
        else:
            dcnt += 1
            dkey, ddate, dtime = dl.split(b'\t')