changeset 293:12d13a1d387f

extend, then fix, to get it working for crawldiagnostics warc files
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 18 Apr 2025 13:39:55 +0100
parents a3d55cc7da18
children cc2945816b75
files lib/python/cc/lmh/warc2cdb.py lib/python/cc/test_warc.py lib/python/cc/warc.py
diffstat 3 files changed, 83 insertions(+), 42 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/warc2cdb.py	Wed Apr 09 20:42:29 2025 +0100
+++ b/lib/python/cc/lmh/warc2cdb.py	Fri Apr 18 13:39:55 2025 +0100
@@ -3,14 +3,12 @@
 
    Usage: warc2cdb.py CC-date segment output-dir'''
 
-import re,warc,sys,glob,codecs,os.path
+import re, warc, sys, glob, codecs, os.path
 import cython, typing
 import email.utils
 from urllib.parse import quote
 import subprocess
 
-foo = cython.declare(int, 27)
-
 TUPAT: typing.Pattern[bytes] = re.compile(b'^WARC-Target-URI: (.*?)\r?$',re.MULTILINE)
 DPAT: typing.Pattern[bytes] = re.compile(b'^WARC-Date: (.*?)\r?$',re.MULTILINE)
 LMPAT: typing.Pattern[bytes] = re.compile(b'^Last-Modified: (.*?)\r?$',re.MULTILINE)
@@ -23,8 +21,27 @@
 DATE: bytes
 OUT: typing.BinaryIO
 
-def LMHline(_wtype: int, buf: memoryview , part: int) -> None:
-  global URI, DATE, TUPAT, DPAT, LMPAT, FFPAT, DTAB, DDEL, URI, DATE, OUT
+WIN: int = 0
+LOSE: int = 0
+N: int = 0
+UERRS: int = 0
+
+def _u_esc(c):
+  '''Escape code point c as backslash-u plus 4 uppercase hex digits, or backslash-U plus 8 for c > 0xFFFF.'''
+  if c > 0xFFFF:
+    return '\\U%08X' % c
+  return '\\u%04X' % c
+
+def java_unicode_encode(ude):
+  r'''codecs error handler: like backslashreplace but uppercase hex and \u00NN instead of \xnn; returns (replacement, ude.end)'''
+  return (''.join(_u_esc(ord(c)) for c in ude.object[ude.start:ude.end]),
+          ude.end)
+
+codecs.register_error('java_unicode',java_unicode_encode)
+
+def LMHline(wtype: int, buf: memoryview , part: int) -> None:
+  global URI, DATE, TUPAT, DPAT, LMPAT, FFPAT, DTAB, DDEL, URI, DATE, OUT, WIN, LOSE
+  global N, UERRS
   m: typing.Match[cython.bytes] | None
   mm: typing.Match[cython.bytes] | None
   if part==1:
@@ -39,6 +56,7 @@
   else:
     mm=LMPAT.search(buf)
     if mm:
+      N += 1
       dateTime=mm[1]
       if dateTime.endswith(b'GMT'):
         if not dateTime.endswith(b' GMT'):
@@ -50,37 +68,50 @@
           lmi = 32535215999
       except (TypeError,IndexError,ValueError) as e:
         print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
+        LOSE += 1
         return
       DATE=(DATE.translate(DTAB,DDEL))
+      WIN += 1
       try:
         URI.decode('ascii')
       except UnicodeDecodeError:
-        URI=quote(URI, safe='''!"$&'()*+,-./:;<=>?@[\]^_`{|}~\x7f'''  #')
-                  ).encode('ascii')
+        UERRS += 1
+        # Try just fixing the non-ASCII:
+        URI = URI.decode('utf-8').encode('ascii', errors='java_unicode')
+      l = len(lmi)
+      if wtype == warc.REVISIT:
+        whereami = b" need to fill this in some how"
+        l += len(whereami)
       OUT.write(b'+')
       OUT.write(b'%d'%(len(DATE)+len(URI)))
       OUT.write(b',')
-      OUT.write(b'%d'%len(lmi))
+      OUT.write(b'%d'%l)
       OUT.write(b':')
       OUT.write(DATE)
       OUT.write(URI)
       OUT.write(b'->')
       OUT.write(lmi)
+      if wtype == warc.REVISIT:
+        OUT.write(whereami)
       OUT.write(b'\n')
 
-def main(CCdate, segment, outdir, fpat = None):
-  global OUT
+def main(CCdate, segment, outdir, subdir = 'warc', fpat = None):
+  global OUT, N, WIN, LOSE, UERRS, URI, DATE
 
-  infile_pat='bash -c "ls /beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/warc/*00%s.warc.gz | sort -k8"'%(
-    CCdate, segment, ("???" if fpat is None else (
+  infile_pat='bash -c "ls /beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*00%s.warc.gz | sort -k8"'%(
+    CCdate, segment, subdir, ("???" if fpat is None else (
       (("{%s..%s}"%tuple(fpat.split(','))) if ',' in fpat else fpat))))
   
-  with open((outfile_name:="%s/%s/lmh.cdb_in"%(outdir,segment)),'wb') as OUT:
+  with open((outfile_name:="%s/%s/%s/lmh.cdb_in"%(outdir, segment, subdir)),'wb') as OUT:
     for infile_name in subprocess.run(infile_pat, shell=True,
                                    stdout=subprocess.PIPE).stdout.decode('utf8').split():
       print(infile_name,file=sys.stderr)
-      warc.warc(infile_name,LMHline,[1],parts=3)
+      WIN = LOSE = N = UERRS = 0
+      warc.warc(infile_name,LMHline,[warc.RESP, warc.REVISIT],parts=3)
+      print('%d LM headers, %d win, %d lose, %d non-ASCII URIs'%(N,WIN,LOSE,UERRS),
+                                                                 file=sys.stderr)
     OUT.write(b'\n')
+
   print(outfile_name)
 
 if __name__ == '__main__':
--- a/lib/python/cc/test_warc.py	Wed Apr 09 20:42:29 2025 +0100
+++ b/lib/python/cc/test_warc.py	Fri Apr 18 13:39:55 2025 +0100
@@ -28,6 +28,8 @@
 elif tt==3:
   warc.warc(sys.argv[1],showme,[0],whole=True,debug=debug)
 elif tt==4:
-  warc.warc(sys.argv[1],showme,[1,2,3,0],whole=True,debug=debug)
+  warc.warc(sys.argv[1],showme,[1,2,3,0,4],whole=True,debug=debug)
 elif tt==5:
   warc.warc(sys.argv[1],showme,[1],parts=int(sys.argv[2]),debug=debug)
+elif tt==6:
+  warc.warc(sys.argv[1],showme,[4],parts=int(sys.argv[2]),debug=debug)
--- a/lib/python/cc/warc.py	Wed Apr 09 20:42:29 2025 +0100
+++ b/lib/python/cc/warc.py	Fri Apr 18 13:39:55 2025 +0100
@@ -13,6 +13,7 @@
 RESP: int = 1
 REQ: int =  2
 META: int = 3
+REVISIT: int = 4
 
 BUFSIZE: int = 16 * 1024 * 1024
 BUFMIN: int = 3 * 512 * 1024 # 1.5MiB, will need to be increased
@@ -54,7 +55,7 @@
   done: bool = bl < BUFSIZE 
   while buf.startswith(b'\r\n',bp):
     bp+=2
-  while not (done and bp <= bl):
+  while not (done and bp >= bl):
     start_1: int = bp
     if not buf.startswith(b'WARC/1.0\r\n',bp):
       breakpoint()
@@ -87,6 +88,8 @@
           wtype = META
         elif buf.startswith(b'w',bp+11):
           wtype = INFO
+        elif buf.startswith(b'v',bp+13):
+          wtype = REVISIT
         else:
           raise ValueError("Unknown WARC-Type: %s in %s at %s"%(
                              bytes(bufView[bp+11:eol-2]),filename,
@@ -102,32 +105,37 @@
     if (wtype in types):
       # Output whole or part 1 as required
       if whole:
-        bp+=length
-        _out=callback(wtype,bufView[start_1:bp],7)
-      elif (parts & 1):
-        _out=callback(wtype,bufView[start_1:eol],1)
-        bp = eol
-      while buf.startswith(b'\r\n',bp):
-        bp+=2
-      if whole:
-        return
-      if parts!=1:
-        start_2=bp
-        eob=bp+length
-        while buf.startswith(b'\r\n',eob-2):
-          eob-=2
-        # Only output parts (2 = HTTP header, 4 = body) that are wanted
-        if parts & 2:
-          if wtype == META or wtype == INFO:
-            # rest of the part
-            _out=callback(wtype,bufView[start_2:eob],2)
-          else:
-            # request and response have http headers
-            eo2=buf.index(b'\r\n\r\n',start_2)
-            _out=callback(wtype,bufView[start_2:eo2+2],2)
-        if parts & 4:
-          raise ValueError("Not implemented: body part (4): %s"%parts)
-    bp += length
+        _out=callback(wtype,bufView[start_1:bp+length],7)
+      else:
+        if (parts & 1):
+          bp = eol+2
+          _out=callback(wtype,bufView[start_1:bp],1)
+        if parts!=1:
+          while buf.startswith(b'\r\n',bp):
+            bp+=2
+          start_2=bp
+          eob=bp+length
+          while buf.startswith(b'\r\n',eob-2):
+            eob-=2
+          # Only output parts (2 = HTTP header, 4 = body) that are wanted
+          if parts & 2:
+            if wtype == RESP or wtype == REQ :
+              # request and response have http headers
+              eo2=buf.index(b'\r\n\r\n',start_2)
+              _out=callback(wtype,bufView[start_2:eo2+2],2)
+            else:
+              # rest of the part
+              _out=callback(wtype,bufView[start_2:eob],2)
+          if parts & 4:
+            raise ValueError("Not implemented: body part (4): %s"%parts)
+    #bp += length
+    #if buf[bp] != 13:
+    #  # Why does this sometimes happen, e.g. when doing
+    #  python3 ~/lib/python/cc/test_warc.py 4 /beegfs/common_crawl/CC-MAIN-2019-35/1566027313501.0/orig/crawldiagnostics/CC-MAIN-20190817222907-20190818004907-00000.warc.gz
+    #  at a point where bp+length is 11018, looking at >\n\r\n
+    #  bp += 1 [doesn't work]
+    bp = buf.index(b'\r\n',bp+length)
+    # check if refill needed
     rl: int
     if (rl := (bp - start_1)) > RECORDMAX:
       RECORDMAX = rl