Mercurial > hg > cc > cirrus_work
changeset 303:9e6bc4e941ab trim
sic
| author | Henry S. Thompson <ht@inf.ed.ac.uk> |
|---|---|
| date | Tue, 13 May 2025 12:06:01 +0100 |
| parents | 13414b0dfefb |
| children | e58a1e2be56d |
| files | lib/python/cc/lmh/cdb_one.py |
| diffstat | 1 files changed, 72 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/python3
# cython: profile=False
'''Usage: uz .../cdx-....gz | cdb_one.py cdbpat S E | igzip -c > cdc-...gz
cdbpat identifies a cdb file
E.g. .../cdb/ks_%d-%d.cdb
for segments in S:E
'''

import cython, typing, timeit, re, sys
from db import CCdb

def mainp() -> None:
    '''Copy CDX lines from stdin to stdout, adding "lastmod" where the cdb has one.

    Command-line arguments (see the module docstring):
      sys.argv[1]  %-format pattern for the cdb filename, filled with (S, E-1)
      sys.argv[2]  S, first segment number handled (inclusive)
      sys.argv[3]  E, end of the segment range (exclusive)

    Every input line must match C_PAT.  For lines whose segment is in
    [S, E) a key is built from the second space-separated field and the
    url, and looked up in the cdb.  On a hit the line is re-emitted with
    a trailing "lastmod" property holding the stored value; all other
    lines are copied through unchanged.  A summary count is written to
    stderr at the end.

    Raises ValueError for an input line that does not match C_PAT, for a
    url not starting with 'http', or for a lookup result other than 0 or 1.
    Exits with status 2 and the usage message if arguments are missing.
    '''
    if len(sys.argv) < 4:
        # Fail with the usage text instead of a bare IndexError
        print(__doc__, file=sys.stderr)
        sys.exit(2)

    #PAT: typing.Pattern[cython.bytes] = re.compile(b'\{"url": "([^"]*)",.*, "filename": ".*/segments/[0-9]*\.([0-9][0-9]?)/')
    # Groups: (second field, url, segment number, record kind).
    # Raw literal so that \. reaches the regex engine without relying on
    # Python preserving an invalid string escape.
    C_PAT: typing.Pattern[cython.bytes] = re.compile(rb'[^ ]* ([^ ]*) .*{"url": "(http[^"]*).*"filename": "[^"]*\.([0-9][0-9]?)/(warc|robotstxt|crawldiagnostics)/')

    N: int = 0       # input lines seen
    hits: int = 0    # lines for which the cdb supplied a value

    cdx_in: typing.BinaryIO = sys.stdin.buffer
    cdx_out: typing.BinaryIO = sys.stdout.buffer

    l: cython.bytes
    m: typing.Match[cython.bytes] | None
    uri: cython.bytes
    wdate: cython.bytes
    kind: cython.bytes
    segb: cython.bytes
    seg: int
    res: int

    S: int = int(sys.argv[2])
    E: int = int(sys.argv[3])

    # The cdb is handed to CCdb by file descriptor, so the file has to
    # stay open for the whole pass; the with-block closes it afterwards.
    with open(sys.argv[1]%(S,E-1),'rb') as f:
        C = CCdb()
        C.init(f.fileno())

        for l in cdx_in:
            if (m:=C_PAT.match(l)):
                (wdate, uri, segb, kind) = m.groups()
            else:
                raise ValueError(l)
            N += 1
            seg = int(segb)
            if (seg >= S and seg < E):
                if kind == 'robotstxt':
                    # NOTE(review): kind is bytes, so this comparison with a
                    # str is never true under Python 3 and the branch is dead.
                    # If it were reached, adding the int seg to bytes would
                    # raise TypeError.  Presumably b'robotstxt' and segb were
                    # intended -- confirm against how the cdb keys are built
                    # before enabling.
                    wdate += seg
                if wdate.startswith(b'201908'):
                    # Keys omit this common 6-byte prefix
                    wdate = wdate[6:]
                if not uri.startswith(b'http'):
                    raise ValueError(uri)
                # Key is the (shortened) second field followed by the url
                # with its leading 'http' removed
                if (res := C.find(wdate+uri[4:])) == 1:
                    hits += 1
                    # Drop the last two bytes of the line (assumed to be the
                    # closing '}' and newline) and append the new property
                    cdx_out.write(memoryview(l)[:-2])
                    cdx_out.write(b', "lastmod": "')
                    cdx_out.write(C.value())
                    cdx_out.write(b'"}\n')
                    continue
                elif res == 0:
                    # Not in the cdb: fall through and copy the line as is
                    pass
                else:
                    raise ValueError((res,l))
            cdx_out.write(l)
    print('%s entries, %s given lastmod'%(N,hits),file=sys.stderr)
    sys.stderr.flush()

if __name__ == "__main__":
    mainp()
