Mercurial > hg > cc > cirrus_work
view bin/warc.py @ 53:0dc144bd027c
made 1 mean 1, still losing after a while
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 07 Jul 2023 19:04:16 +0100 |
parents | c0b4359dd26a |
children | 9c63039a9b6d |
line wrap: on
line source
#!/usr/bin/env python3
'''Stream a warc format file, unzipping if necessary, invoking a
callback on each record.  Callback can be limited by WARC-Type, record
part'''
import gzip
import io  # not used below; kept so the module namespace is unchanged
import sys

try:
    from isal import igzip
except ImportError:
    # Without python-isal, fall back to the standard library gzip module
    # (see _open_stream).
    igzip = None

_MAGIC = b'WARC/1.0\r\n'      # first line of every record this reader accepts
_BUF_SIZE = 2 * 1024 * 1024   # initial size of the record buffer
_HDR_CHUNK = 16 * 1024        # extra bytes requested while looking for a header end


def _open_stream(filename):
    '''Open *filename* for binary reading, decompressing names ending in ".gz".'''
    if filename.endswith(".gz"):
        opener = gzip.GzipFile if igzip is None else igzip.IGzipFile
        return opener(filename=filename)
    return open(filename, 'rb', 0)


def _fill(stream, buf, lo, hi, need):
    '''Make buf[lo:lo+need] valid by reading from *stream* as required.

    buf[lo:hi] is the data read so far and not yet consumed.  Returns the
    (possibly replaced) buffer and the updated lo/hi.  On return
    hi - lo < need only if the stream ran out of data first.'''
    if hi - lo >= need:
        return buf, lo, hi
    if lo == hi:
        # Nothing to preserve, so restart at the front of the buffer.
        lo = hi = 0
    if lo + need > len(buf):
        # Not enough room behind lo: move the live bytes to the front,
        # switching to a bigger buffer if even that is too small.  The
        # bytearray is never resized in place, so outstanding memoryviews
        # of it cannot cause a BufferError.
        live = hi - lo
        target = buf if need <= len(buf) else bytearray(max(need, 2 * len(buf)))
        target[:live] = buf[lo:hi]
        buf, lo, hi = target, 0, live
    while hi - lo < need:
        got = stream.readinto(memoryview(buf)[hi:])
        if not got:
            break  # end of input
        hi += got
    return buf, lo, hi


def _split_block(buf, lo, hi):
    '''Locate the first blank line inside the record block buf[lo:hi].

    Returns (head_end, body_start): the header lines are buf[lo:head_end]
    and the body is buf[body_start:hi].  body_start is None when the block
    contains no blank line.'''
    if buf.startswith(b'\r\n', lo, hi):
        return lo, lo + 2
    sep = buf.find(b'\r\n\r\n', lo, hi)
    if sep < 0:
        return hi, None
    return sep + 2, sep + 4


def _http_content_length(header):
    '''Return the value of the first "Content-Length: " line in *header*,
    or None if there is no such line or its value is not an integer.'''
    for line in bytes(header).split(b'\r\n'):
        if line.startswith(b"Content-Length: "):
            try:
                return int(line[16:])
            except ValueError:
                return None
    return None


def warc(filename, callback, types=['response'], whole=False, parts=7,
         debug=False):
    '''Read the WARC file *filename* and hand selected records to *callback*.

    filename -- path of the file; a name ending in ".gz" is decompressed
                while reading.
    callback -- called as callback(wtype, data, part).  wtype is the
                WARC-Type value as bytes, data is a new bytearray, and
                part says what data holds:
                  1  the WARC header lines, without the blank line that
                     ends them;
                  2  the lines of the record block before its first blank
                     line (the HTTP header of a request/response);
                  4  everything in the block after that blank line;
                  7  the whole record (only when *whole* is true).
                The callback's return value is ignored.
    types    -- WARC-Type values (str or bytes) to report; records of any
                other type are skipped.  The default list is never
                modified.
    whole    -- if true, make a single part-7 call per record and ignore
                *parts*.
    parts    -- bit mask of the parts (1, 2, 4) to report when *whole* is
                false.  A block that contains no blank line is reported
                entirely as part 2 and gets no part-4 call.
    debug    -- if true, write a trace line per record, and a note when the
                block's own Content-Length disagrees with the body length,
                to stderr.

    Returns None once the input is exhausted.  Raises ValueError if a
    record does not start with "WARC/1.0", if its header is unterminated
    or has no usable Content-Length, or if the input ends inside a record.
    The file is closed on return and when an exception (including one from
    the callback) propagates.'''
    wanted = frozenset(t if isinstance(t, bytes) else bytes(t, 'utf8')
                       for t in types)
    with _open_stream(filename) as stream:
        buf = bytearray(_BUF_SIZE)
        lo = hi = 0   # buf[lo:hi] has been read but not yet consumed
        offset = 0    # position of buf[lo] in the (uncompressed) input
        while True:
            # Each record is followed by blank lines; step over them.
            while True:
                buf, lo, hi = _fill(stream, buf, lo, hi, 2)
                if not buf.startswith(b'\r\n', lo, hi):
                    break
                lo += 2
                offset += 2
            if lo == hi:
                return  # clean end of input

            buf, lo, hi = _fill(stream, buf, lo, hi, len(_MAGIC))
            if not buf.startswith(_MAGIC, lo, hi):
                raise ValueError("Not a WARC file? At %s: %s[%s]" % (
                    offset,
                    buf[lo:min(hi, lo + 20)].decode('latin-1'),
                    hi - lo))

            # The WARC header ends at the first blank line.  Start looking
            # at the CRLF of the version line so that a header with no
            # fields at all is still recognised.
            scanned = len(_MAGIC) - 2
            while (sep := buf.find(b'\r\n\r\n', lo + scanned, hi)) < 0:
                have = hi - lo
                buf, lo, hi = _fill(stream, buf, lo, hi, have + _HDR_CHUNK)
                if hi - lo == have:
                    raise ValueError(
                        "Unterminated WARC header at %s" % offset)
                # Re-scan the last three old bytes in case the blank line
                # straddles the old end of data.
                scanned = max(scanned, have - 3)
            hdr_len = sep + 2 - lo   # header lines, without the blank line

            wtype = None
            length = None
            tr = None  # WARC-Truncated value, if the record has one
            pos = lo + len(_MAGIC)
            hdr_end = lo + hdr_len
            while pos < hdr_end:
                eol = buf.index(b'\r\n', pos, hdr_end) + 2
                if buf.startswith(b"Content-Length: ", pos):
                    length = int(buf[pos + 16:eol - 2])
                elif buf.startswith(b"WARC-Truncated: ", pos):
                    tr = buf[pos + 16:eol - 2].decode('latin-1') or "EMPTY"
                elif buf.startswith(b'WARC-Type: ', pos):
                    wtype = bytes(buf[pos + 11:eol - 2])
                pos = eol
            if length is None or length < 0:
                raise ValueError(
                    "No usable Content-Length in WARC header at %s" % offset)

            block_off = hdr_len + 2          # block follows the blank line
            rec_len = block_off + length
            # _fill may move the data, so only offsets relative to lo are
            # carried across this call.
            buf, lo, hi = _fill(stream, buf, lo, hi, rec_len)
            if hi - lo < rec_len:
                raise ValueError(
                    "Chunk read losing, from %s got %s expected %s" % (
                        offset + block_off,
                        max(hi - lo - block_off, 0),
                        length))

            if debug:
                print(wtype, offset, hdr_len, length, file=sys.stderr)

            if wtype in wanted:
                block_lo = lo + block_off
                block_hi = lo + rec_len
                if whole:
                    callback(wtype, buf[lo:block_hi], 7)
                else:
                    if parts & 1:
                        callback(wtype, buf[lo:lo + hdr_len], 1)
                    if parts & 6:
                        head_hi, body_lo = _split_block(buf, block_lo,
                                                        block_hi)
                        if parts & 2:
                            callback(wtype, buf[block_lo:head_hi], 2)
                        if body_lo is not None:
                            if debug:
                                here = block_hi - body_lo
                                given = _http_content_length(
                                    buf[block_lo:head_hi])
                                if given is not None and given != here:
                                    print("length mismatch: %s %s %s here: %s given: %s trunc: %s" %
                                          (length, offset, filename,
                                           here, given, tr),
                                          file=sys.stderr)
                            if parts & 4:
                                callback(wtype, buf[body_lo:block_hi], 4)

            lo += rec_len
            offset += rec_len