Mercurial > hg > cc > cirrus_work
comparison bin/merge_date.py @ 91:460f0599e8cd
Mostly working, but still need to reorder in the case of cfid and friends (the session-id query parameters: cfid/cftoken, sid, jsessionid, aspsessionid)
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 05 Sep 2023 17:32:46 +0100 |
parents | c1a70532444c |
children | e56a7aad9ce9 |
comparison
equal
deleted
inserted
replaced
90:c1a70532444c | 91:460f0599e8cd |
---|---|
5 Usage: merge_date.py ksvstream cdx-dir outdir | 5 Usage: merge_date.py ksvstream cdx-dir outdir |
6 | 6 |
7 ksvstream consists of tab-separated key, CC date and Unix timestamp | 7 ksvstream consists of tab-separated key, CC date and Unix timestamp |
8 ''' # ' | 8 ''' # ' |
9 | 9 |
10 import sys, io, os, os.path | 10 import sys, io, os, os.path, time, re |
11 from isal import igzip | 11 from isal import igzip |
12 | 12 |
13 if sys.argv[1] == '-d': | |
14 sys.argv.pop(1) | |
15 DEBUG = True | |
16 else: | |
17 DEBUG = False | |
18 | |
13 XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2] | 19 XPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[2] |
14 NPATH = "%s/cdx-00%%0.3d.gz"%sys.argv[3] | 20 NPATH = "%s/cdx-00%%0.3d"%sys.argv[3] |
21 | |
22 RorDPAT = re.compile(b'", "filename": "crawl-data/[A-Z0-9-]*/segments/[0-9.]*/' | |
23 b'(crawldiagnostics|robotstxt)/') | |
24 SESSION = re.compile(b'([^?]*\?)((cfid=[^&]*&cftoken|' | |
25 b'sid|jsessionid|aspsessionid[a-z]*)' | |
26 b'=[^&]*)') | |
27 | |
28 # Above based on this from fixed Java code: | |
29 #(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", "jsessionid=", 1, 2), | |
30 #(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", "phpsessid=", 1, 2), | |
31 #(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", "sid=", 1, 2), | |
32 #(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", "aspsessionid", 1, 2), | |
33 #(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", | |
15 | 34 |
16 #print(sys.argv[3],NPATH,file=sys.stderr) | 35 #print(sys.argv[3],NPATH,file=sys.stderr) |
17 | 36 |
18 os.makedirs(sys.argv[3], exist_ok=True) | 37 os.makedirs(sys.argv[3], exist_ok=True) |
19 | 38 |
20 FN = 0 | 39 FN = 0 |
21 | 40 |
41 XCNT = 0 | |
42 dcnt = 0 | |
43 | |
22 XF = igzip.IGzipFile(filename=XPATH%0) | 44 XF = igzip.IGzipFile(filename=XPATH%0) |
23 NF = open(NPATH%0,'wb') | 45 NF = open(NN:=(NPATH%0),'wb') |
24 | |
25 XL = b'' | |
26 | 46 |
27 def nextLine(): | 47 def nextLine(): |
28 global FN, NF, NPATH, XF, XPATH | 48 '''Move on to next index file if current has run out''' |
29 xl=XF.readline() | 49 global FN, NF, NPATH, NN, XF, XPATH, XCNT |
30 if xl == b'': | 50 while True: |
31 # need to move to next index file | 51 xl=XF.readline() |
32 if NF is None: | 52 XCNT += 1 |
33 FN = 0 | 53 if xl == b'': |
54 # need to move to next index file | |
55 FN += 1 | |
56 XF.close() | |
57 NF.close() | |
58 print(NN, flush=True) # so we can compress it | |
59 time.sleep(0.1) # so it flushes? | |
60 XN=XPATH%FN | |
61 if not os.path.exists(XN): | |
62 return | |
63 XF = igzip.IGzipFile(filename=XN) | |
64 NF = open((NN:=NPATH%FN), 'wb') | |
65 xl = XF.readline() | |
66 XCNT = 1 | |
67 if RorDPAT.search(xl): | |
68 #print(xl,file=sys.stderr) | |
69 continue | |
70 return xl | |
71 | |
72 def keys(key): | |
73 '''Deal with failure of 2019-35-vintage Java fixup to detect | |
74 parameter-part-initial session ids''' | |
75 if m:=SESSION.match(key): | |
76 prefix=m[1] | |
77 e, b = m.span(2) | |
78 fixed=key[:e]+key[b:] | |
79 if fixed==m[1]: | |
80 return prefix[:-1], None | |
34 else: | 81 else: |
35 FN += 1 | 82 return prefix, fixed |
36 xn=XPATH%FN | 83 else: |
37 if not os.path.exists(xn): | 84 return key, None |
38 return | |
39 XF = igzip.IGzipFile(filename=xn) | |
40 NF = open(NPATH%FN, 'wb') | |
41 xl = XF.readline() | |
42 return xl | |
43 | 85 |
44 with open(sys.argv[1], 'rb') as df: | 86 with open(sys.argv[1], 'rb') as df: |
45 dl = df.readline() | 87 dl = df.readline() |
46 (dkey, ddate, dtime) = dl.split(b'\t') | 88 dcnt += 1 |
89 dkey, ddate, dtime = dl.split(b'\t') | |
47 | 90 |
48 while (xl:=nextLine()) is not None: | 91 while (xl:=nextLine()) is not None: |
49 (xkey, xdate, xprops) = xl.split(b' ', maxsplit=2) | 92 xkey, xdate, xprops = xl.split(b' ', maxsplit=2) |
50 if dkey != xkey or ddate != xdate: | 93 xkey1, xkey2 = keys(xkey) |
94 if (ddate != xdate or | |
95 not dkey.startswith(xkey1) or | |
96 (xkey2 is not None and dkey!=xkey2)): | |
97 if DEBUG and xkey.decode('ascii')>(dkey.decode('ascii')): | |
98 print("Fail: xkey: %s\n" | |
99 " dkey: %s\n" | |
100 " xdate: %s\n" | |
101 " ddate: %s\n" | |
102 "k1, k2: |%s|%s|\n" | |
103 "FN: %s\n" | |
104 "xl: %s"%(xkey, dkey, xdate, ddate, xkey1, xkey2, FN, xl), | |
105 file=sys.stderr) | |
106 raise ValueError() | |
51 NF.write(xl) | 107 NF.write(xl) |
52 continue | 108 continue |
53 NF.write(xkey) | 109 NF.write(xkey) |
54 NF.write(b' ') | 110 NF.write(b' ') |
55 NF.write(xdate) | 111 NF.write(xdate) |
59 dl = df.readline() | 115 dl = df.readline() |
60 if dl == '': | 116 if dl == '': |
61 # write out the last of the last index file, if any | 117 # write out the last of the last index file, if any |
62 dkey = ddate = None | 118 dkey = ddate = None |
63 else: | 119 else: |
64 (dkey, ddate, dtime) = dl.split(b'\t') | 120 dcnt += 1 |
121 dkey, ddate, dtime = dl.split(b'\t') |