changeset 284:e461601592dd

try to do the whole thing in one go
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 07 Mar 2025 21:17:47 +0000
parents 6739e08d19ff
children 0ec17b2aab72
files lib/python/cc/lmh/warc2cdb.py
diffstat 1 files changed, 78 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/lmh/warc2cdb.py	Fri Mar 07 21:17:47 2025 +0000
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+'''Produce cdb_input-style files from warc responses with lmh header value
+
+   Usage: warc2cdb.py CC-date segment nFiles output-dir'''
+
+import re,warc,sys,glob,codecs,os.path
+import cython, typing
+import email.utils
+
+# Header-extraction patterns.  All are bytes patterns; with MULTILINE the
+# '^' anchors at the start of any line, and the capture group is the text
+# between the header name and the next carriage return.
+TUPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
+DPAT: typing.Pattern[cython.bytes] = re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
+LMPAT: typing.Pattern[cython.bytes] = re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
+# Matches a trailing 'GMT' that directly follows a non-space byte (captured
+# as group 1), i.e. a value where the space before GMT is missing.
+FFPAT: typing.Pattern[cython.bytes] = re.compile(b'([^ ])GMT$')
+
+# Arguments for bytes.translate in LMHline: DTAB maps every byte value to
+# itself, DDEL lists the bytes to delete ('T', 'Z', '-' and ':').
+# NOTE(review): DTAB is annotated as bytes but is built as a bytearray --
+# confirm that this is accepted when the module is compiled.
+DTAB: cython.bytes = bytearray(range(256))
+DDEL: cython.bytes = b'TZ-:'
+
+# Module-level state: URI and DATE are assigned by LMHline when it is
+# called with part==1 and read on its later calls; OUT is bound by main
+# to the open output file.
+URI: cython.bytes
+DATE: cython.bytes
+OUT: typing.BinaryIO
+
+def LMHline(wtype: cython.bytes, buf: char[::1] , part: int) -> None:
+  global URI, DATE
+  m: typing.Match[cython.bytes] | None
+  mm: typing.Match[cython.bytes] | None
+  if part==1:
+    if (m:=TUPAT.search(buf)):
+      URI=m[1]
+    else:
+      raise ValueError(b"No target URI in %s ??"%buf)
+    if (md:=DPAT.search(buf)):
+      DATE=md[1]
+    else:
+      raise ValueError(b"No date in %s ??"%buf)
+  else:
+    mm=LMPAT.search(buf)
+    if mm:
+      dateTime=mm[1]
+      if dateTime.endswith(b'GMT'):
+        dateTime = FFPAT.sub('\\1 GMT',dateTime)
+      try:
+        try:
+          lmi = b'%d'%int(email.utils.parsedate_to_datetime(dateTime.decode('utf8')).timestamp())
+        except OverflowError:
+          lmi = 32535215999
+      except (TypeError,IndexError,ValueError) as e:
+        print(dateTime.rstrip(),e,sep='\t',file=sys.stderr)
+        return
+      DATE=(DATE.translate(DTAB,DDEL))
+      OUT.write(b'+')
+      OUT.write(b'%d'%(len(DATE)+len(URI)))
+      OUT.write(b',')
+      OUT.write(b'%d'%len(lmi))
+      OUT.write(b':')
+      OUT.write(DATE)
+      OUT.write(URI)
+      OUT.write(b'->')
+      OUT.write(lmi)
+      OUT.write(b'\n')
+
+def main(CCdate, segment, nFiles, outdir):
+  global OUT
+
+  infile_pat='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/warc/*00???.warc.gz'%(
+    CCdate, segment)
+  
+  with open((outfile_name:="%s/%s/lmh.cdb_in"%(outdir,segment)),'wb') as OUT:
+    for infile_name in glob.glob(infile_pat):
+      print(infile_name,file=sys.stderr)
+      warc.warc(infile_name,LMHline,[b'response'],parts=3)
+    OUT.write(b'\n')
+  print(outfile_name)
+
+if __name__ == '__main__':
+  sys.exit(main(*sys.argv[1:]))
+
+
+