view bin/warc.py @ 159:c3c3dd60b8a8

demo of slurm usage using cdx2tsv.py
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 06 Jul 2022 18:07:34 +0100
parents e96d444b0f84
children d123ef7fdb82
line wrap: on
line source

#!/usr/bin/env python3
'''Stream a warc format file, invoking a callback on each part.
Callback can be limited by WARC-Type'''
import sys,os

def warc(callback,types=(b'response',)):
  '''Stream the WARC file named by sys.argv[1], calling callback per record.

  callback -- called as callback(wtype, content) for every record whose
              WARC-Type is in types.  wtype is the WARC-Type value as bytes;
              content is a memoryview over a private copy of the
              Content-Length bytes that follow the record's WARC header.
  types    -- iterable of wanted WARC-Type values.  Header values are parsed
              as bytes, so str entries are encoded to ASCII before comparing.

  Returns None at a clean end of file (EOF where a record would start).
  Raises ValueError if a record does not start with a WARC/1.0 line, if the
  file ends inside a header or a record body, or if a record's
  Content-Length is missing or negative.'''
  wanted=tuple(t.encode('ascii') if isinstance(t,str) else t for t in types)
  bufsize=128*1024*1024
  buf=bytearray(bufsize)
  nb=0 # Bytes consumed so far, used only for error messages
  # Unbuffered, so readinto() below fills our buffer straight from the file
  with open(sys.argv[1],'rb',0) as stream:
    while True:
      # Skip the blank line(s) separating records
      l=b'\r\n'
      while l==b'\r\n':
        l=stream.readline()
        nb+=len(l)
      if l==b'':
        # EOF at a record boundary: normal termination
        return
      if l!=b'WARC/1.0\r\n':
        raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l),
                                                           l.decode('latin-1'),len(l)))
      # WARC header: named fields, terminated by a blank line
      wtype=None
      length=None
      while True:
        l=stream.readline()
        nb+=len(l)
        if l==b'\r\n':
          break
        if l==b'':
          # readline() returns b'' forever at EOF, so bail out explicitly
          raise ValueError("Unexpected EOF in WARC header at %s"%nb)
        if l.startswith(b'WARC-Type: '):
          wtype=l[11:].rstrip(b'\r\n')
        elif l.startswith(b'Content-Length: '):
          length=int(l[16:])
      if length is None or length<0:
        raise ValueError("Missing or bad Content-Length in WARC header ending at %s"%nb)
      if length>len(buf):
        # Record is bigger than the shared buffer: grow rather than spin
        buf=bytearray(length)
      # Read exactly length bytes, tolerating short reads
      view=memoryview(buf)
      got=0
      while got<length:
        n=stream.readinto(view[got:length])
        if not n:
          # 0 (or None) means EOF/no data: the record is truncated
          break
        got+=n
      if got!=length:
        raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,got,length))
      nb+=length
      if wtype in wanted:
        # Slicing the bytearray copies, so the callback may safely keep
        #  its argument after buf is overwritten by the next record
        callback(wtype,memoryview(buf[:length]))

# Binary-mode file object on the process's stdout descriptor, so the callbacks
#  below can write bytes directly with no text encoding involved.
OUT=open(sys.stdout.fileno(),'wb')

import re
# Matches a line beginning "Last-Modified: " and captures its value up to (not
#  including) the first carriage return; MULTILINE lets ^ match at any line start.
LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)

def showmeLMH(wtype,buf):
  '''Callback for warc(): emit the Last-Modified value found in buf, if any.

  Always writes exactly one newline to OUT, preceded by the captured value
  when LMPAT matches somewhere in the record; wtype is not used.'''
  record=buf.tobytes(order='A')
  hit=LMPAT.search(record)
  if hit is not None:
    OUT.write(hit.group(1))
  OUT.write(b'\n')

def showme(wtype,buf):
  '''Callback for warc(): write the record content in buf to OUT unchanged.

  wtype is accepted to fit the callback signature and is not used.'''
  OUT.write(buf)

# Script entry point: process the file named in sys.argv[1] (opened inside
#  warc()), writing one line per response record via showmeLMH.
# NOTE(review): there is no __main__ guard, so this also runs on import.
warc(showmeLMH,[b'response'])