changeset 299:83c7ecd61ecf trim

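Try trimming various more-or-less constant bits of the key and value: an optional crawl-month prefix of the WARC date and the leading 'http' of the URI (key), and an optional era prefix of the LM timestamp (value)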
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 06 May 2025 16:52:32 +0100
parents fdec28613df3
children 1c11117bb01b
files lib/python/cc/lmh/warc2cdb.py
diffstat 1 files changed, 34 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/warc2cdb.py	Mon May 05 20:57:46 2025 +0100
+++ b/lib/python/cc/lmh/warc2cdb.py	Tue May 06 16:52:32 2025 +0100
@@ -17,16 +17,20 @@
 DTAB: bytearray = bytearray(range(256))
 DDEL: bytes = b'TZ-:'
 
+OUT: typing.BinaryIO
+SEG: bytes
+R_T: bool = False
+
 URI: bytes
 DATE: bytes
-OUT: typing.BinaryIO
-SEG: bytes
-R_T: boolean = False
 
 WIN: int = 0
 LOSE: int = 0
 N: int = 0
 UERRS: int = 0
+NON_HTTP: int = 0
+NON_MONTH: int = 0
+NON_ERA: int = 0
 
 def _u_esc(c):
   if c<65536:
@@ -42,8 +46,9 @@
 codecs.register_error('java_unicode',java_unicode_encode)
 
 def LMHline(wtype: int, buf: memoryview, part: int) -> None:
-  global URI, DATE, TUPAT, DPAT, LMPAT, FFPAT, DTAB, DDEL, URI, DATE, OUT, WIN, LOSE
-  global N, UERRS, SEG, R_T
+  global TUPAT, DPAT, LMPAT, FFPAT, DTAB, DDEL, OUT, WIN, LOSE, NON_HTTP, NON_MONTH
+  global N, UERRS, SEG, R_T, LM_ERA, LM_ERA_L, C_MONTH, C_MONTH_L, NON_ERA
+  global DATE, URI
   m: typing.Match[cython.bytes] | None
   mm: typing.Match[cython.bytes] | None
   lmi: cython.bytes
@@ -67,6 +72,11 @@
       try:
         try:
           lmi = b'%d'%int(email.utils.parsedate_to_datetime(dateTime.decode('utf8')).timestamp())
+          if LM_ERA:
+            if lmi.startswith(LM_ERA): # save 2 bytes in ~80% of cases
+              lmi=b'0'+lmi[LM_ERA_L:]
+            else:
+              NON_ERA += 1
         except OverflowError:
           lmi = b'32535215999'
       except (TypeError,IndexError,ValueError) as e:
@@ -74,6 +84,11 @@
         LOSE += 1
         return
       DATE=(DATE.translate(DTAB,DDEL))
+      if C_MONTH:
+        if DATE.startswith(C_MONTH):
+          DATE=DATE[C_MONTH_L:]
+        else:
+          NON_MONTH += 1
       WIN += 1
       try:
         URI.decode('ascii')
@@ -81,6 +96,11 @@
         UERRS += 1
         # Try just fixing the non-ASCII:
         URI = URI.decode('utf-8').encode('ascii', errors='java_unicode')
+      # Could just assume http, but let's check
+      if URI.startswith(b'http'):
+        URI=URI[4:]
+      else:
+        NON_HTTP += 1
       l: int = len(lmi)
       kl: int = (len(DATE)+len(URI)+(len(SEG) if R_T else 0))
       OUT.write(b'+')
@@ -96,11 +116,16 @@
       OUT.write(lmi)
       OUT.write(b'\n')
 
-def main(CCdate, segment, outdir, subdir = 'warc', fpat = None):
-  global OUT, N, WIN, LOSE, UERRS, URI, DATE, SEG, R_T
+def main(CCdate, segment, outdir, subdir = 'warc', fpat = None, era = '', month = '' ):
+  global OUT, N, WIN, LOSE, UERRS, SEG, R_T
+  global NON_HTTP, C_MONTH, C_MONTH_L, NON_MONTH, LM_ERA, LM_ERA_L, NON_ERA
 
   SEG = segment.encode('utf8')
   R_T = (subdir == 'robotstxt')
+  LM_ERA = era.encode('utf8')
+  LM_ERA_L = len(era)
+  C_MONTH = month.encode('utf8')
+  C_MONTH_L = len(month)
   infile_pat='bash -c "ls /beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*00%s.warc.gz | sort -k8"'%(
     CCdate, segment, subdir, ("???" if fpat is None else (
       (("{%s..%s}"%tuple(fpat.split(','))) if ',' in fpat else fpat))))
@@ -109,7 +134,7 @@
     for infile_name in subprocess.run(infile_pat, shell=True,
                                    stdout=subprocess.PIPE).stdout.decode('utf8').split():
       print(infile_name,file=sys.stderr)
-      WIN = LOSE = N = UERRS = 0
+      WIN = LOSE = N = UERRS = NON_HTTP = NON_MONTH = NON_ERA = 0
       if subdir in ['warc','robotstxt']:
         warc.warc(infile_name,LMHline,[warc.RESP],parts=3)
       elif subdir == 'crawldiagnostics':
@@ -117,7 +142,7 @@
       else:
         print('bogus type %s'%subdir,file=sys.stderr)
         exit(1)
-      print('%d LM headers, %d win, %d lose, %d non-ASCII URIs'%(N,WIN,LOSE,UERRS),
+      print('%d LM headers, %d win, %d lose, %d non-ASCII URIs, %d non-close, %d dodgy schemes, %d dodgy WARC dates'%(N,WIN,LOSE,UERRS,NON_ERA,NON_HTTP,NON_MONTH),
                                                                  file=sys.stderr)
     OUT.write(b'\n')