Mercurial > hg > cc > cirrus_work
view bin/warc.py @ 109:52c6a9b0fc8c
loosen must-match criterion in the both-messy case
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 19 Sep 2023 19:29:41 +0100 |
parents | b8d4a5ede7a3 |
children |
line wrap: on
line source
#!/usr/bin/env python3
'''Stream a warc format file, unzipping if necessary, invoking a
callback on each record.  Callback can be limited by WARC-Type, record
part'''
import gzip
import io
import sys

try:
    from isal import igzip
except ImportError:
    # python-isal only accelerates decompression; fall back to the
    # (slower) standard-library gzip module when it is not installed.
    igzip = None

RESP = b'response'
REQ = b'request'
META = b'metadata'
INFO = b'warcinfo'

# The record types this reader understands.  Each value maps to the
# module-level constant itself so that identity tests such as
# `wtype is META` remain valid for the values handed to callbacks.
_KNOWN_TYPES = {t: t for t in (RESP, REQ, META, INFO)}


def _open_stream(filename):
    '''Open filename for binary reading, gunzipping if it ends in .gz'''
    if filename.endswith(".gz"):
        if igzip is not None:
            return igzip.IGzipFile(filename=filename)
        return gzip.GzipFile(filename=filename)
    # Unbuffered: we do our own buffering via readinto below
    return open(filename, 'rb', 0)


def warc(filename, callback, types=('response',), whole=False, parts=7,
         debug=False):
    '''Read WARC records from filename, calling callback for wanted ones.

    callback is called as callback(wtype, view, part), where wtype is
    one of RESP, REQ, META, INFO, view is a memoryview slice of the
    internal read buffer and part says what the slice holds.  The buffer
    is reused for later records, so a callback must copy (e.g. with
    bytes(view)) anything it wants to keep.

    types lists the WARC-Type values (str or bytes) to report; records
    of other types are skipped.

    If whole is true, each wanted record is reported in one call, with
    part 7, covering the WARC header, the blank line and the content.

    Otherwise parts is a bit-mask:
      1 for warc header;
      2 for req/resp HTTP header, warcinfo/metadata features;
      4 for req/resp body

    debug is accepted for compatibility and is currently unused.

    Raises ValueError if the input is not well-formed WARC/1.0, uses an
    unsupported WARC-Type, lacks a Content-Length, ends in mid-record,
    or contains a wanted record too big for the internal buffer.'''
    # should do some sanity checking wrt parts and types
    types = [(t if isinstance(t, bytes) else bytes(t, 'utf8')) for t in types]
    bufSize = 2*1024*1024
    hdrMax = 16*1024   # assumed upper bound on the size of a WARC header
    buf = bytearray(bufSize)
    bufView = memoryview(buf)
    with _open_stream(filename) as stream:
        # bl: number of valid bytes in buf
        # fpos: offset in the (uncompressed) input of buf[bl]
        # bp: current parse position in buf
        fpos = bl = stream.readinto(buf)
        bp = 0
        done = False       # True once a read has come up short
        while True:
            # Skip the blank lines which terminate the previous record
            while buf.startswith(b'\r\n', bp, bl):
                bp += 2
            start_1 = bp
            if not buf.startswith(b'WARC/1.0\r\n', bp, bl):
                if done and bl-bp == 0:
                    # really done
                    return
                raise ValueError(
                    "Not a WARC file? In %s at %s of %s (%s): %s[%s]" % (
                        filename, bp, bl, fpos,
                        (buf[bp:min(bl, bp+20)] if bp < bl
                         else buf[bl-20:bl]).decode('latin-1'),
                        bl-bp))
            bp += 10
            wtype = None
            length = None
            eol = bp
            # WARC header lines, up to the blank line which ends them.
            # There should always be enough in the buffer to complete
            #  this loop, because of the buffer update logic below
            while not buf.startswith(b'\r\n', bp, bl):
                eol = buf.find(b'\r\n', bp, bl)
                if eol < 0:
                    raise ValueError("Incomplete WARC header in %s at %s" % (
                        filename, fpos-(bl-start_1)))
                eol += 2
                if buf.startswith(b"Content-Length: ", bp):
                    length = int(buf[bp+16:eol-2])
                elif buf.startswith(b'WARC-Type: ', bp):
                    # Match the whole value: looking at single characters
                    #  would e.g. take 'resource' for 'response'
                    rawType = bytes(bufView[bp+11:eol-2]).strip()
                    wtype = _KNOWN_TYPES.get(rawType)
                    if wtype is None:
                        raise ValueError(
                            "Unknown WARC-Type: %s in %s at %s" % (
                                rawType, filename, fpos-(bl-bp)))
                bp = eol
            # eol is now just after the last header line, bp moves past
            #  the blank line to the start of the content
            bp = eol+2
            if length is None:
                raise ValueError(
                    "No Content-Length in WARC header in %s at %s" % (
                        filename, fpos-(bl-start_1)))
            wanted = wtype in types
            if done:
                if (bp+length) > bl:
                    raise ValueError("Done but need more! %s + %s > %s in %s" % (
                        bp, length, bl, filename))
            elif (bp+(length+hdrMax)) > bl:
                # Need more data: we always want the whole of the current
                #  record _and_ the header of the next one in the buffer
                if wanted:
                    # we need to keep from start_1 to bl
                    keepLen = bl-start_1
                    buf[0:keepLen] = bufView[start_1:bl]
                    eol = eol-start_1
                    start_1 = 0
                    bp = eol+2
                else:
                    # we can skip the rest of this part
                    if (bp+length) <= bl:
                        # we have at least some bytes from the next part
                        keepLen = bl-(bp+length)
                        buf[0:keepLen] = bufView[bl-keepLen:bl]
                    else:
                        # we don't have all of the bytes from the current
                        #  part so can skip the rest of it
                        keepLen = 0
                        fpos = stream.seek(fpos+(bp+length-bl))
                    bp = 0
                spaceToFill = bufSize-keepLen
                nb = stream.readinto(bufView[keepLen:bufSize])
                fpos += nb
                bl = keepLen+nb
                if nb < spaceToFill:
                    done = True
                if not wanted:
                    continue
                if (bp+length) > bl:
                    # Never hand a partial record to the callback
                    if done:
                        raise ValueError(
                            "Done but need more! %s + %s > %s in %s" % (
                                bp, length, bl, filename))
                    raise ValueError(
                        "Record too big for buffer: %s + %s > %s in %s" % (
                            bp, length, bufSize, filename))
            if wanted:
                # Output whole or part 1 as required
                if whole:
                    bp += length
                    callback(wtype, bufView[start_1:bp], 7)
                    continue
                if parts & 1:
                    callback(wtype, bufView[start_1:eol], 1)
                if parts & 6:
                    recEnd = bp+length
                    # Ignore blank lines at either end of the content
                    start_2 = bp
                    while buf.startswith(b'\r\n', start_2, recEnd):
                        start_2 += 2
                    eob = recEnd
                    while eob-2 >= start_2 and buf.startswith(b'\r\n',
                                                              eob-2, eob):
                        eob -= 2
                    # Only output parts (2 = HTTP header, 4 = body) that
                    #  are wanted
                    if wtype is META or wtype is INFO:
                        if parts & 2:
                            # rest of the part
                            callback(wtype, bufView[start_2:eob], 2)
                    else:
                        # request and response have http headers, ended
                        #  by a blank line.  Only look inside this record.
                        eo2 = buf.find(b'\r\n\r\n', start_2, recEnd)
                        if eo2 < 0:
                            # No blank line: it's all header, no body
                            hdrEnd = bodyStart = recEnd
                        else:
                            hdrEnd = eo2+2
                            bodyStart = eo2+4
                        if parts & 2:
                            callback(wtype, bufView[start_2:hdrEnd], 2)
                        if parts & 4:
                            # The body is passed on exactly as given, it
                            #  may be binary so nothing is trimmed
                            callback(wtype, bufView[bodyStart:recEnd], 4)
            bp += length