changeset 291:70da637d1402

Accommodate the change to digits for record type, minor tweaks, change format of input ranges; still a bug in 11/...540.warc.gz
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 09 Apr 2025 17:15:40 +0100
parents 52c9d1875608
children a3d55cc7da18
files lib/python/cc/lmh/warc2cdb.py lib/python/cc/test_warc.py lib/python/cc/warc.py
diffstat 3 files changed, 25 insertions(+), 23 deletions(-) [+]
line wrap: on
line diff
--- a/lib/python/cc/lmh/warc2cdb.py	Wed Apr 09 12:57:50 2025 +0100
+++ b/lib/python/cc/lmh/warc2cdb.py	Wed Apr 09 17:15:40 2025 +0100
@@ -9,20 +9,22 @@
 from urllib.parse import quote
 import subprocess
 
-TUPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
-DPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
-LMPAT: typing.Pattern[cython.bytes] = re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
-FFPAT: typing.Pattern[cython.bytes] = re.compile(b'([^ ])GMT$')
+foo = cython.declare(int, 27)
 
-DTAB: cython.bytes = bytearray(range(256))
-DDEL: cython.bytes = b'TZ-:'
+TUPAT: typing.Pattern[bytes] = re.compile(b'^WARC-Target-URI: (.*?)\r?$',re.MULTILINE)
+DPAT: typing.Pattern[bytes] = re.compile(b'^WARC-Date: (.*?)\r?$',re.MULTILINE)
+LMPAT: typing.Pattern[bytes] = re.compile(b'^Last-Modified: (.*?)\r?$',re.MULTILINE)
+FFPAT: typing.Pattern[bytes] = re.compile(b'([^ ])GMT$')
 
-URI: cython.bytes
-DATE: cython.bytes
+DTAB: bytearray = bytearray(range(256))
+DDEL: bytes = b'TZ-:'
+
+URI: bytes
+DATE: bytes
 OUT: typing.BinaryIO
 
-def LMHline(wtype: cython.bytes, buf: char[::1] , part: int) -> None:
-  global URI, DATE
+def LMHline(_wtype: int, buf: memoryview , part: int) -> None:
+  global URI, DATE, TUPAT, DPAT, LMPAT, FFPAT, DTAB, DDEL, URI, DATE, OUT
   m: typing.Match[cython.bytes] | None
   mm: typing.Match[cython.bytes] | None
   if part==1:
@@ -39,7 +41,8 @@
     if mm:
       dateTime=mm[1]
       if dateTime.endswith(b'GMT'):
-        dateTime = FFPAT.sub(b'\\1 GMT',dateTime)
+        if not dateTime.endswith(b' GMT'):
+          dateTime = dateTime[:-3]+b' GMT' # FFPAT.sub(b'\\1 GMT',dateTime)
       try:
         try:
           lmi = b'%d'%int(email.utils.parsedate_to_datetime(dateTime.decode('utf8')).timestamp())
@@ -65,22 +68,20 @@
       OUT.write(lmi)
       OUT.write(b'\n')
 
-def main(CCdate, segment, outdir, fpat="???"):
+def main(CCdate, segment, outdir, fpat = None):
   global OUT
 
   infile_pat='bash -c "ls /beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/warc/*00%s.warc.gz | sort -k8"'%(
-    CCdate, segment, fpat)
+    CCdate, segment, ("???" if fpat is None else (
+      (("{%s..%s}"%tuple(fpat.split(','))) if ',' in fpat else fpat))))
   
   with open((outfile_name:="%s/%s/lmh.cdb_in"%(outdir,segment)),'wb') as OUT:
     for infile_name in subprocess.run(infile_pat, shell=True,
                                    stdout=subprocess.PIPE).stdout.decode('utf8').split():
       print(infile_name,file=sys.stderr)
-      warc.warc(infile_name,LMHline,[b'response'],parts=3)
+      warc.warc(infile_name,LMHline,[1],parts=3)
     OUT.write(b'\n')
   print(outfile_name)
 
 if __name__ == '__main__':
   sys.exit(main(*sys.argv[1:]))
-
-
-
--- a/lib/python/cc/test_warc.py	Wed Apr 09 12:57:50 2025 +0100
+++ b/lib/python/cc/test_warc.py	Wed Apr 09 17:15:40 2025 +0100
@@ -22,12 +22,12 @@
   return OUT
 
 if tt==1:
-  warc.warc(sys.argv[1],showme,[b'response','warcinfo','request','metadata'],parts=int(sys.argv[2]),debug=debug)
+  warc.warc(sys.argv[1],showme,[1,2,3,0],parts=int(sys.argv[2]),debug=debug)
 elif tt==2:
-  warc.warc(sys.argv[1],showme,[b'warcinfo'],parts=int(sys.argv[2]),debug=debug)
+  warc.warc(sys.argv[1],showme,[0],parts=int(sys.argv[2]),debug=debug)
 elif tt==3:
-  warc.warc(sys.argv[1],showme,[b'warcinfo'],whole=True,debug=debug)
+  warc.warc(sys.argv[1],showme,[0],whole=True,debug=debug)
 elif tt==4:
-  warc.warc(sys.argv[1],showme,[b'response','warcinfo','request','metadata'],whole=True,debug=debug)
+  warc.warc(sys.argv[1],showme,[1,2,3,0],whole=True,debug=debug)
 elif tt==5:
-  warc.warc(sys.argv[1],showme,[b'response'],parts=int(sys.argv[2]),debug=debug)
+  warc.warc(sys.argv[1],showme,[1],parts=int(sys.argv[2]),debug=debug)
--- a/lib/python/cc/warc.py	Wed Apr 09 12:57:50 2025 +0100
+++ b/lib/python/cc/warc.py	Wed Apr 09 17:15:40 2025 +0100
@@ -103,12 +103,13 @@
       if whole:
         bp+=length
         _out=callback(wtype,bufView[start_1:bp],7)
-        continue
       elif (parts & 1):
         _out=callback(wtype,bufView[start_1:eol],1)
         bp = eol
       while buf.startswith(b'\r\n',bp):
         bp+=2
+      if whole:
+        return
       if parts!=1:
         start_2=bp
         eob=bp+length