changeset 300:1c11117bb01b trim

just starting
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 08 May 2025 19:00:26 +0100
parents 83c7ecd61ecf
children 4981c41628dd
files lib/python/cc/lmh/new_key.py
diffstat 1 files changed, 12 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/new_key.py	Thu May 08 19:00:26 2025 +0100
@@ -0,0 +1,12 @@
+#!/usr/bin/python3
+'''Extract/construct a cut-down key for cdb: reads lines on stdin, prints four space-separated fields per line'''
+import re, sys
+# Groups: 1 = 2nd space-separated field, 2 = url text after "http", 3 = 1-2 digits after a '.' in filename, 4 = subdirectory kind
+C_PAT = re.compile(r'[^ ]* ([^ ]*) .*{"url": "http([^"]*).*"filename": "[^"]*\.([0-9][0-9]?)/(warc|robotstxt|crawldiagnostics)/')
+# Pattern is a raw string so '\.' reaches re intact (a non-raw '\.' is an invalid string escape); stop at the first bad line
+for l in sys.stdin:
+  if (m:=C_PAT.match(l)):
+    print(m[1],m[2],m[3],m[4])
+  else:
+    print('oops',l)
+    sys.exit(1)  # sys.exit, unlike the bare exit() helper, does not depend on the site module being loaded