Mercurial > hg > cc > cirrus_home
changeset 139:e96d444b0f84
fixed bug(s) wrt large payload files
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 23 Jul 2021 22:19:15 +0000 |
parents | 9ea12f7b304b |
children | 0a447db5cf1c |
files | bin/warc.py |
diffstat | 1 files changed, 14 insertions(+), 5 deletions(-) [+] |
line wrap: on
line diff
--- a/bin/warc.py Fri Jul 23 16:23:46 2021 +0000
+++ b/bin/warc.py Fri Jul 23 22:19:15 2021 +0000
@@ -1,11 +1,11 @@
 #!/usr/bin/env python3
 '''Stream a warc format file, invoking a callback on each part.
 Callback can be limited by WARC-Type'''
-import sys
+import sys,os
 
 def warc(callback,types=['response']):
   nb=0
-  stream=open(sys.stdin.fileno(),'rb',0)
+  stream=open(sys.argv[1],'rb',0)
   bufsize=128*1024*1024
   buf=bytearray(128*1024*1024)
   l=b'\r\n'
@@ -14,7 +14,7 @@
     l=stream.readline()
     nb+=len(l)
     if l!=b'WARC/1.0\r\n':
-      if l==0:
+      if l==b'':
         return
       raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l),
                        l.decode('latin-1'),len(l)))
@@ -39,7 +39,7 @@
       raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length))
     nb+=length
     if wtype in types:
-      callback(wtype,bv)
+      callback(wtype,memoryview(buf[:length]))
     if whole and options.zipped:
       _output(bv)
       return
@@ -111,7 +111,16 @@
 
 OUT=open(sys.stdout.fileno(),'wb')
 
+import re
+LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
+
+def showmeLMH(wtype,buf):
+  m=LMPAT.search(buf.tobytes(order='A'))
+  if m:
+    OUT.write(m[1])
+    OUT.write(b'\n')
+
 def showme(wtype,buf):
   OUT.write(buf)
 
-warc(showme,[b'metadata'])
+warc(showmeLMH,[b'response'])