Mercurial > hg > cc > cirrus_work
changeset 283:6739e08d19ff
type decls, cythonize works
| field | value |
|---|---|
| author | Henry S. Thompson <ht@inf.ed.ac.uk> |
| date | Fri, 07 Mar 2025 18:15:41 +0000 |
| parents | 0267374361f4 |
| children | e461601592dd |
| files | lib/python/cc/warc.py |
| diffstat | 1 files changed, 21 insertions(+), 12 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/lib/python/cc/warc.py Fri Mar 07 15:39:36 2025 +0000
+++ b/lib/python/cc/warc.py Fri Mar 07 18:15:41 2025 +0000
@@ -5,29 +5,35 @@
 
 import sys, io
 from isal import igzip
+import cython, typing
 
-RESP = b'response'
-REQ = b'request'
-META = b'metadata'
-INFO = b'warcinfo'
+
+RESP: cython.bytes = b'response'
+REQ: cython.bytes = b'request'
+META: cython.bytes = b'metadata'
+INFO: cython.bytes = b'warcinfo'
 
-BUFSIZE=2*1024*1024
-HDRMAX=32*1024 # Not really max, there are some enormous ones, see below
+BUFSIZE: int = 2*1024*1024
+HDRMAX: int = 32*1024 # Not really max, there are some enormous ones, see below
 
-def refill(buf, bufView, stream, start_1, bl, bp, eol, length, needed):
+def refill(buf: char[::1], bufView: char[::1], stream: typing.BinaryIO,
+           start_1: int, bl: int, bp: int, eol: int,
+           length: int, needed: bool) -> (int, int, int, cython.bytes, int, char[::1], bool):
   global BUFSIZE
+  whole: int
+  xBuf: char[::1]
   #if (stream.tell() > 2381000000):
   # breakpoint()
   if needed:
     # we need to keep from start_1 to bl
-    keepFrom=start_1
-    keepLen=bl-keepFrom
+    keepFrom: int = start_1
+    keepLen: int = bl-keepFrom
     if (whole:=((bp-start_1)+length)) > BUFSIZE:
       while whole > BUFSIZE:
         # Need a bigger buffer
         print('Growing buffer %s > %s'%(whole,BUFSIZE),file=sys.stderr)
         BUFSIZE=BUFSIZE+(64 * 1024)
-      newbuf = bytearray(BUFSIZE)
+      newbuf: char[::1] = bytearray(BUFSIZE)
       newbuf[0:keepLen]=bufView[keepFrom:bl]
       bl = BUFSIZE
       buf = newbuf
@@ -49,7 +55,7 @@
     keepLen=0
     stream.seek(stream.tell() + bp + length - bl)
     bp=0
-  spaceToFill=BUFSIZE-keepLen
+  spaceToFill: int = BUFSIZE-keepLen
   with memoryview(buf)[keepLen:BUFSIZE] as xBuf:
     nb=stream.readinto(xBuf)
   bl=keepLen+nb
@@ -166,6 +172,8 @@
         eo2=buf.index(b'\r\n\r\n',start_2)
         OUT=callback(wtype,bufView[start_2:eo2+2],2)
       if parts & 4:
+        # stale below here???
+        rec_text = []
         for L in rec_text:
           if state==2:
             # HTTP header
@@ -186,7 +194,8 @@
             if bl is not None:
               if bl!=wl:
                 print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
-                      (length,offset,filename,wl,bl,tr),file=sys.stderr)
+                      (length,#offset,
+                       filename,wl,bl,tr),file=sys.stderr)
             # HTTP body
             balance=start_2+rec_text.tell()
             #print(balance,bl,wl,ll,ll-balance,file=sys.stderr)
```