Mercurial > hg > cc > cirrus_home
changeset 177:354dae8aeb80
moved to work tree
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 05 Jul 2023 14:52:00 +0100 |
parents | 97137f5bbe0f |
children | e1bc9d8d688c |
files | bin/warc.py |
diffstat | 1 files changed, 0 insertions(+), 138 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python3
'''Stream a warc format file, invoking a callback on each record.
Callback can be limited by WARC-Type, record part

Usage: warc.py [-d] FILE
'''
import io
import os
import re
import sys

# Bit flags for the `parts` argument of warc() and for the `part` value
# handed to callbacks (1 = WARC header, 2 = HTTP header, 4 = body).
_WARC_HEADER = 1
_HTTP_HEADER = 2
_BODY = 4
_WHOLE = 7  # part value passed to the callback in `whole` mode

# "-d" as the first argument switches on debugging (see showme) and is then
# removed, so that the input file name is always sys.argv[1].
debug = len(sys.argv) > 1 and sys.argv[1] == '-d'
if debug:
    sys.argv.pop(1)


def _scan_http_header(payload):
    '''Locate the end of the header section at the start of a record payload.

    payload is any bytes-like object.  Returns None when the payload contains
    no blank line; otherwise returns (header_len, body_offset, declared_len):
    header_len is the length of the header lines excluding the blank line,
    body_offset is the offset just past the blank line, and declared_len is
    the value of the first parseable "Content-Length: " line (None if absent).
    '''
    header_len = 0
    declared_len = None
    with io.BytesIO(payload) as text:
        for line in text:
            if line.startswith(b"\r"):
                # Blank line, HTTP header is finished
                return header_len, text.tell(), declared_len
            # Non-empty, it's (a continuation of) a header
            if declared_len is None and line.startswith(b"Content-Length: "):
                try:
                    declared_len = int(line[16:].rstrip())
                except ValueError:
                    # Crawled headers may hold junk here; the value only
                    # feeds the advisory mismatch report, so skip it.
                    pass
            header_len = text.tell()
    return None


def warc(callback, types=('response',), whole=False, parts=7, *,
         path=None, bufsize=128 * 1024 * 1024):
    '''Stream a WARC/1.0 file, calling `callback` for each wanted record.

    callback -- called as callback(wtype, data, part): wtype is the record's
                WARC-Type as bytes, data is a bytearray copy of the selected
                bytes, part is 1 (WARC header, without the blank line),
                2 (HTTP header, without the blank line), 4 (body) or,
                in `whole` mode, 7.
    types    -- WARC-Type values (str or bytes) to report; records of other
                types are read past without any callback.
    whole    -- if true, make a single callback per record with the WARC
                header, the blank separator line and the full payload.
    parts    -- bit mask of the parts to report when `whole` is false.
    path     -- file to read; defaults to sys.argv[1].
    bufsize  -- initial size of the record buffer; it grows on demand.

    Raises ValueError if a record does not start with "WARC/1.0", if the
    file ends inside a WARC header, if a header has no Content-Length, or
    if the payload is shorter than its Content-Length.
    '''
    wanted = [t if isinstance(t, bytes) else bytes(t, 'utf8') for t in types]
    if path is None:
        path = sys.argv[1]
    # `whole` needs the header in the buffer even when parts omits bit 1.
    keep_header = whole or bool(parts & _WARC_HEADER)
    buf = bytearray(bufsize)
    nb = 0  # number of bytes consumed from the file so far
    # Buffered reading keeps readline() cheap; readinto() still fills our
    # own buffer directly.
    with open(path, 'rb') as stream:
        line = b'\r\n'
        while True:
            bp = 0  # fill pointer into buf for the current record
            # Skip the blank lines that separate records
            while line == b'\r\n':
                line = stream.readline()
                nb += (ln := len(line))
            if ln == 0:
                break  # end of file
            record_start = nb - ln
            if line != b'WARC/1.0\r\n':
                raise ValueError("Not a WARC file? At %s: %s[%s]" % (
                    record_start, line.decode('latin-1'), ln))
            wtype = None
            length = None
            tr = None  # Was this record truncated?
            while line != b'\r\n':
                # WARC header
                if keep_header:
                    buf[bp:bp + ln] = line
                    bp += ln
                line = stream.readline()
                nb += (ln := len(line))
                if ln == 0:
                    # readline() returns b'' forever at end of file
                    raise ValueError(
                        "Unexpected end of file in WARC header at %s" % nb)
                if line.startswith(b"Content-Length: "):
                    length = int(line[16:].rstrip())
                elif line.startswith(b"WARC-Truncated: "):
                    tr = line[16:].rstrip().decode('latin-1') or "EMPTY"
                elif line.startswith(b'WARC-Type: '):
                    wtype = line[11:-2]
            if length is None:
                raise ValueError(
                    "No Content-Length in WARC header starting at %s"
                    % record_start)
            is_wanted = wtype in wanted
            if is_wanted and whole:
                # Keep the blank separator, and put the payload after it
                buf[bp:bp + ln] = line
                bp += ln
                start_2 = bp
            elif is_wanted:
                if parts & _WARC_HEADER:
                    callback(wtype, buf[:bp], _WARC_HEADER)
                # The header has been delivered (or was never buffered), so
                # it only needs preserving if further parts are wanted.
                start_2 = 0 if parts == _WARC_HEADER else bp
            else:
                start_2 = 0
            end = start_2 + length
            if end > len(buf):
                buf.extend(bytes(end - len(buf)))
            # Read the payload straight into the buffer; a single readinto()
            # may deliver fewer bytes than requested.
            got = 0
            with memoryview(buf) as view:
                while got < length:
                    n = stream.readinto(view[start_2 + got:end])
                    if not n:
                        break
                    got += n
            if got != length:
                raise ValueError(
                    "Chunk read losing, from %s got %s expected %s"
                    % (nb, got, length))
            nb += length
            if not is_wanted:
                continue
            if whole:
                callback(wtype, buf[:end], _WHOLE)
                continue
            # Only output parts (1 = WARC header, 2 = HTTP header, 4 = body)
            # that are wanted
            with memoryview(buf)[start_2:end] as payload:
                scan = _scan_http_header(payload)
            if scan is None:
                # No blank line in the payload: nothing further is reported
                continue
            header_len, body_offset, declared_len = scan
            if parts & _HTTP_HEADER:
                callback(wtype, buf[start_2:start_2 + header_len],
                         _HTTP_HEADER)
            body_len = length - body_offset
            if declared_len is not None and declared_len != body_len:
                print("length mismatch: %s %s %s here: %s given: %s trunc: %s"
                      % (length, record_start, path, body_len, declared_len,
                         tr),
                      file=sys.stderr)
            # HTTP body: block-output the entire rest of the payload
            if parts & _BODY:
                body_start = start_2 + body_offset
                callback(wtype, buf[body_start:body_start + body_len], _BODY)


# Binary view of standard output, used by the callbacks below
OUT = sys.stdout.buffer

TUPAT = re.compile(b'^WARC-Target-URI: (.*?)\r', re.MULTILINE)
LMPAT = re.compile(b'^Last-Modified: (.*?)\r', re.MULTILINE)


def showmeLMH(wtype, buf, part):
    '''Callback: write one line per record, the target URI followed, if the
    HTTP header has one, by a tab and the Last-Modified value.

    Expects to be called with part 1 (WARC header) before any other part of
    the same record, as happens with parts=3: part 1 stores the URI in the
    global URI, any other part writes the output line.
    Raises ValueError if a WARC header has no WARC-Target-URI line.
    '''
    global URI
    if part == 1:
        if (m := TUPAT.search(buf)):
            URI = m[1]
        else:
            raise ValueError("No target URI in %s ??"
                             % bytes(buf).decode('latin-1'))
    else:
        m = LMPAT.search(buf)
        OUT.write(URI)
        if m:
            OUT.write(b'\t')
            OUT.write(m[1])
        OUT.write(b'\n')


def showme(wtype, buf, part):
    '''Callback: write the part number on a line of its own, then the data.
    Enters the debugger first if the script was started with -d.'''
    if debug:
        breakpoint()
    OUT.write(b"%d\n%b" % (part, buf))


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit("usage: warc.py [-d] FILE")
    warc(showmeLMH, [b'response'], parts=3)
    # Alternative invocations:
    #  warc(showme, [b'response', 'warcinfo', 'request', 'metadata'],
    #       parts=int(sys.argv[2]))
    #  warc(showme, [b'response'], parts=int(sys.argv[2]))
    #  warc(showme, [b'response'], whole=True)