#!/usr/bin/env python3
# Provenance: recovered from a Mercurial changeset patch
#   User     Henry S. Thompson
#   Date     1688566197 -3600
#   Node ID  8661062a50b1f204452314a64bc61dd092ee6569
#   Parent   e3c440666f1a5e31b084156340e112681d7826ef
#   Message  moved from home bin
#   File     bin/warc.py (added by that changeset)
'''Stream a warc format file, invoking a callback on each record.
Callback can be limited by WARC-Type, record part'''
import io
import os
import re
import sys

# A leading -d on the command line switches on debugging (see showme) and is
# then removed so that sys.argv[1] is the input file name.
debug = len(sys.argv) > 1 and sys.argv[1] == '-d'
if debug:
    sys.argv.pop(1)

_DEFAULT_BUFSIZE = 128 * 1024 * 1024


def _scan_http_header(buf, start, end):
    '''Locate the blank line that ends the HTTP header in buf[start:end].

    Lines are delimited by LF; a line counts as blank when it starts with CR.
    Returns (blank_at, body_at, declared), where blank_at is the offset of the
    blank line, body_at the offset just past it, and declared the value of the
    first parseable "Content-Length: " header (None if there is none).
    Returns None when buf[start:end] contains no blank line.'''
    declared = None
    pos = start
    while pos < end:
        nl = buf.find(b'\n', pos, end)
        nxt = end if nl < 0 else nl + 1
        if buf.startswith(b'\r', pos, nxt):
            return pos, nxt, declared
        if declared is None and buf.startswith(b'Content-Length: ', pos, nxt):
            try:
                declared = int(buf[pos + 16:nxt].strip())
            except ValueError:
                # Only used for a diagnostic, so a malformed value is ignored
                pass
        pos = nxt
    return None


def warc(callback, types=('response',), whole=False, parts=7,
         path=None, bufsize=_DEFAULT_BUFSIZE):
    '''Read a WARC/1.0 file and call callback(wtype, data, part) per record.

    callback -- called with the record's WARC-Type (bytes), a bytearray copy
                of the requested data, and the part number of that data.
    types    -- WARC-Type values (str or bytes) to report; records of other
                types are read and skipped.
    whole    -- if true, one call per record with part 7 and the complete
                record: WARC header, blank line, record content.
    parts    -- bit-mask used when whole is false:
                1 = WARC header, 2 = HTTP header, 4 = body.
                Part 1 is delivered before the record content is read.
                Parts 2 and 4 are only delivered when the record content
                contains a blank (CR-initial) line separating them.
    path     -- file to read; defaults to sys.argv[1].
    bufsize  -- initial size of the record buffer; it grows if a record
                needs more.

    Raises ValueError if a record does not start with "WARC/1.0", if the
    file ends inside a WARC header, if a header has no usable
    Content-Length, or if fewer content bytes than declared can be read.'''
    # NOTE(review): this file was recovered from a patch whose indentation
    # had been lost; the nesting below is a reconstruction -- confirm against
    # the repository copy.
    wanted = [t if isinstance(t, bytes) else bytes(t, 'utf8') for t in types]
    if path is None:
        path = sys.argv[1]
    keep_header = whole or bool(parts & 1)
    buf = bytearray(bufsize)
    nb = 0  # number of bytes consumed from the stream so far
    # buffering=0 as in the original: a raw stream, so readinto() below
    # fills the record buffer directly.
    with open(path, 'rb', 0) as stream:
        line = b'\r\n'
        while True:
            # Skip the blank lines between records
            while line == b'\r\n':
                line = stream.readline()
                nb += (ln := len(line))
            if not line:
                break  # clean end of file
            rec_start = nb - ln
            if line != b'WARC/1.0\r\n':
                raise ValueError("Not a WARC file? At %s: %s[%s]"
                                 % (rec_start, line.decode('latin-1'), ln))
            bp = 0          # fill pointer into buf
            wtype = None
            length = None
            tr = None       # Was this record truncated?
            while line != b'\r\n':
                # WARC header
                if keep_header:
                    # Slice assignment past the end extends the bytearray
                    buf[bp:bp + ln] = line
                    bp += ln
                line = stream.readline()
                nb += (ln := len(line))
                if not line:
                    raise ValueError(
                        "Unexpected end of file in WARC header at %s" % nb)
                if line.startswith(b"Content-Length: "):
                    length = int(line[16:].rstrip())
                elif line.startswith(b"WARC-Truncated: "):
                    tr = line[16:].rstrip().decode('latin-1') or "EMPTY"
                elif line.startswith(b'WARC-Type: '):
                    wtype = line[11:-2]
            if length is None or length < 0:
                raise ValueError(
                    "No usable Content-Length in WARC header at %s"
                    % rec_start)
            selected = wtype in wanted
            if selected:
                if whole:
                    # Keep the blank line between WARC header and content
                    buf[bp:bp + ln] = line
                    bp += ln
                elif parts & 1:
                    callback(wtype, buf[:bp], 1)
            start_2 = bp
            end = start_2 + length
            if end > len(buf):
                buf.extend(bytes(end - len(buf)))
            got = 0
            with memoryview(buf) as mv:
                while got < length:
                    n = stream.readinto(mv[start_2 + got:end])
                    if not n:
                        break
                    got += n
            if got != length:
                raise ValueError(
                    "Chunk read losing, from %s got %s expected %s"
                    % (nb, got, length))
            nb += length
            if not selected:
                continue
            if whole:
                callback(wtype, buf[:end], 7)
                continue
            # Only output parts (1 = WARC header, 2 = HTTP header, 4 = body)
            # that are wanted
            found = _scan_http_header(buf, start_2, end)
            if found is None:
                continue
            blank_at, body_at, declared = found
            if parts & 2:
                callback(wtype, buf[start_2:blank_at], 2)
            remaining = end - body_at
            if declared is not None and declared != remaining:
                print("length mismatch: %s %s %s here: %s given: %s trunc: %s"
                      % (length, rec_start, path, remaining, declared, tr),
                      file=sys.stderr)
            # Everything after the blank line is the body
            if parts & 4:
                callback(wtype, buf[body_at:end], 4)


# Binary view of standard output, used by the callbacks below
OUT = sys.stdout.buffer

TUPAT = re.compile(b'^WARC-Target-URI: (.*?)\r', re.MULTILINE)
LMPAT = re.compile(b'^Last-Modified: (.*?)\r', re.MULTILINE)

# Target URI of the record currently being reported by showmeLMH
URI = b''


def showmeLMH(wtype, buf, part):
    '''Callback: write one line per record, the target URI followed, when the
    non-WARC-header part has a Last-Modified header, by a tab and its value.

    Part 1 (WARC header) only records the WARC-Target-URI; any other part
    triggers the output line.
    Raises ValueError if a WARC header has no WARC-Target-URI.'''
    global URI
    if part == 1:
        if (m := TUPAT.search(buf)):
            URI = m[1]
        else:
            raise ValueError(b"No target URI in %s ??" % buf)
    else:
        m = LMPAT.search(buf)
        OUT.write(URI)
        if m:
            OUT.write(b'\t')
            OUT.write(m[1])
        OUT.write(b'\n')


def showme(wtype, buf, part):
    '''Callback: write the part number on a line of its own, then the data.
    Enters the debugger first when -d was given on the command line.'''
    if debug:
        breakpoint()
    OUT.write(b"%d\n%b" % (part, buf))


def main():
    '''Report URI and Last-Modified for every response record of the file
    named by sys.argv[1].'''
    warc(showmeLMH, [b'response'], parts=3)

    # Alternative invocations kept from the original:
    #warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2]))
    #warc(showme,[b'response'],parts=int(sys.argv[2]))
    #warc(showme,[b'response'],whole=True)


if __name__ == '__main__':
    main()