Mercurial > hg > cc > cirrus_home
changeset 176:97137f5bbe0f
working, about to move to work tree
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 05 Jul 2023 14:50:00 +0100 |
parents | d123ef7fdb82 |
children | 354dae8aeb80 |
files | bin/warc.py |
diffstat | 1 files changed, 37 insertions(+), 21 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/warc.py Mon Jul 03 18:16:14 2023 +0100 +++ b/bin/warc.py Wed Jul 05 14:50:00 2023 +0100 @@ -6,18 +6,20 @@ if (debug:=(sys.argv[1]=='-d')): sys.argv.pop(1) -def warc(callback,types=['response'],parts=7): +def warc(callback,types=['response'],whole=False,parts=7): types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types] nb=0 stream=open(sys.argv[1],'rb',0) bufsize=128*1024*1024 buf=bytearray(128*1024*1024) l=b'\r\n' - while True: + while not stream.closed: bp=0 while l==b'\r\n': l=stream.readline() nb+=(ln:=len(l)) + if ln==0: + break if l!=b'WARC/1.0\r\n': raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l), l.decode('latin-1'),len(l))) @@ -26,11 +28,11 @@ state=1 tr=None # Was this record truncated? while l!=b'\r\n': + # WARC header if parts & 1: buf[bp:(bp:=bp+ln)]=l l=stream.readline() nb+=(ln:=len(l)) - # WARC header if l.startswith(b"Content-Length: "): length=wl=int(l[16:].rstrip()) elif l.startswith(b"WARC-Truncated: "): @@ -39,12 +41,15 @@ elif l.startswith(b'WARC-Type: '): wtype = l[11:-2] start_2=bp - if (wtype in types) and (parts & 1): - if parts!=1: + if (wtype in types): + if whole: buf[bp:(bp:=bp+ln)]=l + elif (parts & 1): + callback(wtype,buf[:start_2],1) + if parts==1: + start_2=0 + else: start_2=bp - if parts!=7: - callback(wtype,buf[:start_2],1) else: start_2=0 bv=memoryview(buf)[start_2:start_2+length] @@ -59,15 +64,15 @@ if ii!=length: raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length)) nb+=length - bv=memoryview(buf)[start_2:start_2+length] if wtype in types: - if parts==7: - callback(wtype,memoryview(buf)[0:start_2+length],7) + if whole: + callback(wtype,buf[0:start_2+length],7) continue # Only output parts (1 = WARC header, 2 = HTTP header, 4 = body) that are wanted bl=None # for HTTP Content-Length for the length of the body? - L_start=0 + L_start=start_2 state=2 + bv=memoryview(buf)[start_2:start_2+length] with io.BytesIO(bv) as rec_text: for L in rec_text: if state==2: @@ -80,7 +85,7 @@ else: # Blank line, HTTP header is finished if parts & 2: - callback(wtype,bv[start_2:L_start],2) + callback(wtype,buf[start_2:start_2+L_start],2) state=4 # The above is just for sanity, because we do _not_ # continue with the outer loop, @@ -91,32 +96,43 @@ print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ (length,offset,filename,wl,bl,tr),file=sys.stderr) # HTTP body - balance=rec_text.tell() + balance=start_2+rec_text.tell() #print(balance,bl,wl,ll,ll-balance,file=sys.stderr) # Output whatever is left if parts & 4: - callback(wtype,bv[balance:balance+wl],4) + callback(wtype,buf[balance:balance+wl],4) state=1 L_start=rec_text.tell() OUT=open(sys.stdout.fileno(),'wb') import re +TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE) LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) -def showmeLMH(wtype,buf,part=2): - m=LMPAT.search(buf.tobytes(order='A')) - if m: - OUT.write(m[1]) - OUT.write(b'\n') +def showmeLMH(wtype,buf,part): + global URI + if part==1: + if (m:=TUPAT.search(buf)): + URI=m[1] + else: + raise ValueError(b"No target URI in %s ??"%buf) + else: + m=LMPAT.search(buf) + OUT.write(URI) + if m: + OUT.write(b'\t') + OUT.write(m[1]) + OUT.write(b'\n') def showme(wtype,buf,part): if debug: breakpoint() OUT.write(b"%d\n%b"%(part,buf)) -#warc(showmeLMH,[b'response'],2) +warc(showmeLMH,[b'response'],parts=3) #warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2])) -warc(showme,[b'response'],int(sys.argv[2])) +#warc(showme,[b'response'],parts=int(sys.argv[2])) +#warc(showme,[b'response'],whole=True)