view bin/lmh_warc.py @ 109:52c6a9b0fc8c

loosen must-match criterion in the both-messy case
author Henry Thompson <ht@markup.co.uk>
date Tue, 19 Sep 2023 19:29:41 +0100
parents 86df63d251cf
children
line wrap: on
line source

#!/usr/bin/env python3
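'''Extract identifying info + Last-Modified header value for all entries
   that have one

   Each output line starts with the target URI.  When a Last-Modified
   header was found it is followed by tab-separated fields: the WARC date
   (with T, Z, - and : removed), segment, filetype, fileno and the
   Last-Modified value; otherwise the URI stands alone on the line.

   Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno'''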

import re,warc,sys,glob,codecs

# Patterns applied to raw header bytes; each captures the field value up to
# the CR that ends its line.
TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
DPAT=re.compile(b'^WARC-Date: (.*?)\r',re.MULTILINE)
# HTTP field names are case-insensitive, so a server sending e.g.
# "last-modified:" must be recognised as well.
LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE|re.IGNORECASE)

# Identity translation table (every byte maps to itself): showmeLMH calls
# bytes.translate with it purely to use the accompanying delete argument.
DTAB=bytearray(range(256))
# Bytes deleted from the WARC-Date value before it is written out
# (presumably reducing an ISO-8601 timestamp to bare digits -- confirm).
DDEL=b'TZ-:'

# Binary-mode writer opened on the stdout file descriptor, so the header
# bytes can be written as-is without a text encoding step.
OUT=open(sys.stdout.fileno(),'wb')

def showmeLMH(wtype,buf,part):
  '''Callback passed to warc.warc: remember the URI and date of a record,
     then emit one output line for it.

     wtype -- not used here
     buf   -- bytes of the part being delivered
     part  -- 1 for the part searched for WARC-Target-URI / WARC-Date;
              any other value triggers the output line

     For part 1 the globals URI and DATE are (re)set from buf, and
     ValueError is raised if either header is absent.  For any other part
     a line is written to OUT: the remembered URI and, only if buf has a
     Last-Modified header, tab-separated DATE (minus the DDEL bytes),
     SEGMENT, FILETYPE, FILENO and the header value.'''
  global URI, DATE
  if part==1:
    uri_hit=TUPAT.search(buf)
    if uri_hit is None:
      raise ValueError(b"No target URI in %s ??"%buf)
    URI=uri_hit[1]
    date_hit=DPAT.search(buf)
    if date_hit is None:
      raise ValueError(b"No date in %s ??"%buf)
    DATE=date_hit[1]
    return
  lm_hit=LMPAT.search(buf)
  fields=[URI]
  if lm_hit is not None:
    # Full record only when there is a Last-Modified value to report
    fields+=[DATE.translate(DTAB,DDEL),SEGMENT,FILETYPE,FILENO,lm_hit[1]]
  OUT.write(b'\t'.join(fields)+b'\n')

# Command-line driver: locate the requested WARC file and feed its
# response records to showmeLMH.
if len(sys.argv)!=5:
  # Report the expected arguments instead of a tuple-unpacking traceback
  sys.exit('Usage: lmh_warc.py CC-date segment filetype 3-digit-fileno')
(CCdate, segment, filetype, fileno) = sys.argv[1:]
fn='/beegfs/common_crawl/CC-MAIN-%s/*.%s/orig/%s/*%s.warc.gz'%(
  CCdate, segment, filetype, fileno)

# Byte-string copies of the arguments, written into each full output line
SEGMENT=codecs.encode(segment,'ascii')
FILETYPE=codecs.encode(filetype,'ascii')
FILENO=codecs.encode(fileno,'ascii')

matches=glob.glob(fn)
if not matches:
  # Say which pattern failed rather than dying with a bare IndexError
  sys.exit('No WARC file matches %s'%fn)
# The first match is used, as before.
# NOTE(review): parts=3 presumably selects the WARC-header and HTTP-header
# parts of each record -- confirm against the warc module.
warc.warc(matches[0],showmeLMH,[b'response'],parts=3)