changeset 297:5e08e6db47ad

Add another digit or two (the segment number) to the key for robots.txt (r_t) records
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 05 May 2025 20:57:30 +0100
parents 9608fe0628b0
children fdec28613df3
files lib/python/cc/lmh/warc2cdb.py
diffstat 1 files changed, 21 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/warc2cdb.py	Mon May 05 20:39:16 2025 +0100
+++ b/lib/python/cc/lmh/warc2cdb.py	Mon May 05 20:57:30 2025 +0100
@@ -20,6 +20,8 @@
 URI: bytes
 DATE: bytes
 OUT: typing.BinaryIO
+SEG: bytes
+R_T: boolean = False
 
 WIN: int = 0
 LOSE: int = 0
@@ -39,11 +41,12 @@
 
 codecs.register_error('java_unicode',java_unicode_encode)
 
-def LMHline(wtype: int, buf: memoryview , part: int) -> None:
+def LMHline(wtype: int, buf: memoryview, part: int) -> None:
   global URI, DATE, TUPAT, DPAT, LMPAT, FFPAT, DTAB, DDEL, URI, DATE, OUT, WIN, LOSE
-  global N, UERRS
+  global N, UERRS, SEG, R_T
   m: typing.Match[cython.bytes] | None
   mm: typing.Match[cython.bytes] | None
+  lmi: cython.bytes
   if part==1:
     if (m:=TUPAT.search(buf)):
       URI=m[1]
@@ -65,7 +68,7 @@
         try:
           lmi = b'%d'%int(email.utils.parsedate_to_datetime(dateTime.decode('utf8')).timestamp())
         except OverflowError:
-          lmi = 32535215999
+          lmi = b'32535215999'
       except (TypeError,IndexError,ValueError) as e:
         print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
         LOSE += 1
@@ -78,26 +81,26 @@
         UERRS += 1
         # Try just fixing the non-ASCII:
         URI = URI.decode('utf-8').encode('ascii', errors='java_unicode')
-      l = len(lmi)
-      if wtype == warc.REVISIT:
-        whereami = b" need to fill this in some how"
-        l += len(whereami)
+      l: int = len(lmi)
+      kl: int = (len(DATE)+len(URI)+(len(SEG) if R_T else 0))
       OUT.write(b'+')
-      OUT.write(b'%d'%(len(DATE)+len(URI)))
+      OUT.write(b'%d'%kl)
       OUT.write(b',')
       OUT.write(b'%d'%l)
       OUT.write(b':')
       OUT.write(DATE)
+      if R_T:
+        OUT.write(SEG)
       OUT.write(URI)
       OUT.write(b'->')
       OUT.write(lmi)
-      if wtype == warc.REVISIT:
-        OUT.write(whereami)
       OUT.write(b'\n')
 
 def main(CCdate, segment, outdir, subdir = 'warc', fpat = None):
-  global OUT, N, WIN, LOSE, UERRS, URI, DATE
+  global OUT, N, WIN, LOSE, UERRS, URI, DATE, SEG, R_T
 
+  SEG = segment.encode('utf8')
+  R_T = (subdir == 'robotstxt')
   infile_pat='bash -c "ls /beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*00%s.warc.gz | sort -k8"'%(
     CCdate, segment, subdir, ("???" if fpat is None else (
       (("{%s..%s}"%tuple(fpat.split(','))) if ',' in fpat else fpat))))
@@ -107,7 +110,13 @@
                                    stdout=subprocess.PIPE).stdout.decode('utf8').split():
       print(infile_name,file=sys.stderr)
       WIN = LOSE = N = UERRS = 0
-      warc.warc(infile_name,LMHline,[warc.RESP, warc.REVISIT],parts=3)
+      if subdir in ['warc','robotstxt']:
+        warc.warc(infile_name,LMHline,[warc.RESP],parts=3)
+      elif subdir == 'crawldiagnostics':
+        warc.warc(infile_name,LMHline,[warc.RESP, warc.REVISIT],parts=3)
+      else:
+        print('bogus type %s'%subdir,file=sys.stderr)
+        exit(1)
       print('%d LM headers, %d win, %d lose, %d non-ASCII URIs'%(N,WIN,LOSE,UERRS),
                                                                  file=sys.stderr)
     OUT.write(b'\n')