changeset 303:9e6bc4e941ab trim

sic
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 13 May 2025 12:06:01 +0100
parents 13414b0dfefb
children e58a1e2be56d
files lib/python/cc/lmh/cdb_one.py
diffstat 1 files changed, 72 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/cdb_one.py	Tue May 13 12:06:01 2025 +0100
@@ -0,0 +1,72 @@
+#!/usr/bin/python3
+# cython: profile=False
+'''Usage: uz .../cdx-....gz | cdb_one.py cdbpat S E | igzip -c > cdc-...gz
+cdbpat identifies a cdb file
+E.g. .../cdb/ks_%d-%d.cdb
+for segments S <= seg < E (the file name is cdbpat % (S, E-1))
+'''
+
+import cython, typing, timeit, re, sys
+from db import CCdb
+
+def mainp() -> None:
+  '''Copy CDX lines from stdin to stdout, adding "lastmod" on cdb hits.
+
+  Arguments (sys.argv[1:4]) are cdbpat S E; the cdb file is cdbpat%(S,E-1).
+  Lines whose segment number seg has S <= seg < E are looked up in the cdb;
+  when C.find gives 1 the stored value is spliced in as "lastmod".  All
+  other lines are copied unchanged.  Entry and hit counts go to stderr.
+  Raises ValueError on a line C_PAT rejects or a C.find result not 0 or 1.'''
+  # Raw literal: '\.' inside a plain b'...' is an invalid escape sequence
+  C_PAT: typing.Pattern[cython.bytes] = re.compile(rb'[^ ]* ([^ ]*) .*{"url": "(http[^"]*).*"filename": "[^"]*\.([0-9][0-9]?)/(warc|robotstxt|crawldiagnostics)/')
+
+  N: int = 0
+  hits: int = 0
+
+  cdx_in: typing.BinaryIO = sys.stdin.buffer
+  cdx_out: typing.BinaryIO = sys.stdout.buffer
+
+  l: cython.bytes
+  m: typing.Match[cython.bytes] | None
+  S: int = int(sys.argv[2])
+  E: int = int(sys.argv[3])
+
+  # with: the cdb file is closed on every exit path, yet stays open for the loop
+  with open(sys.argv[1]%(S,E-1),'rb') as f:
+    C = CCdb()
+    C.init(f.fileno())
+    for l in cdx_in:
+      uri: cython.bytes
+      wdate: cython.bytes
+      kind: cython.bytes
+      segb: cython.bytes
+      seg: int
+      res: int
+      if (m:=C_PAT.match(l)) is None:
+        raise ValueError(l)
+      (wdate, uri, segb, kind) = m.groups()  # uri starts with b'http' per C_PAT
+      N += 1
+      seg = int(segb)
+      if S <= seg < E:
+        if kind == b'robotstxt':
+          # bytes, not str/int (str never matched). NOTE(review): confirm cdb keys carry the segment
+          wdate += segb
+        if wdate.startswith(b'201908'):
+          wdate = wdate[6:]
+        if (res := C.find(wdate+uri[4:])) == 1:
+          hits += 1
+          cdx_out.write(memoryview(l)[:-2])  # all but the last 2 bytes (presumably '}' + newline)
+          cdx_out.write(b', "lastmod": "')
+          cdx_out.write(C.value())
+          cdx_out.write(b'"}\n')
+          continue
+        elif res != 0:
+          raise ValueError((res,l))
+      cdx_out.write(l)
+  print('%s entries, %s given lastmod'%(N,hits),file=sys.stderr)
+  sys.stderr.flush()
+          
+if __name__ == "__main__":  # run mainp only when executed as a script, not on import
+    mainp()
+
+