changeset 301:4981c41628dd trim

works
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 13 May 2025 12:04:01 +0100
parents 1c11117bb01b
children 13414b0dfefb
files lib/python/cc/lmh/new_key.py
diffstat 1 files changed, 15 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/new_key.py	Thu May 08 19:00:26 2025 +0100
+++ b/lib/python/cc/lmh/new_key.py	Tue May 13 12:04:01 2025 +0100
@@ -2,11 +2,23 @@
 '''Extract/construct a cut-down key for cdb'''
 import re, sys
 
-C_PAT = re.compile('[^ ]* ([^ ]*) .*{"url": "http([^"]*).*"filename": "[^"]*\.([0-9][0-9]?)/(warc|robotstxt|crawldiagnostics)/')
+C_PAT = re.compile('[^ ]* ([^ ]*) .*{"url": "(http[^"]*).*"filename": "[^"]*\.([0-9][0-9]?)/(warc|robotstxt|crawldiagnostics)/')
 
 for l in sys.stdin:
   if (m:=C_PAT.match(l)):
-    print(m[1],m[2],m[3],m[4])
+    (wdate, uri, seg, kind) = m.groups()
   else:
-    print('oops',l)
+    print('oops',file=sys.stderr)
+    print(l,file=sys.stderr)
     exit(1)
+
+  if kind == 'robotstxt':
+    wdate += seg
+  if wdate.startswith('201908'):
+    wdate = wdate[6:]
+  if uri.startswith('http'):
+    print(wdate,uri[4:],sep='')
+  else:
+    print("should't happen",uri,file=sys.stderr)
+    exit(2)
+