Mercurial > hg > cc > cirrus_work
changeset 244:ce5b2c1da222
cdx_extras.py and unpackz.py now work well together to provide what's needed to update a cdx to include lastmod where possible
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 03 Oct 2024 18:17:55 +0100 |
parents | 7bef91ca3d51 |
children | 1d6fe71f13f4 |
files | lib/python/cc/cdx_extras.py lib/python/cc/unpackz.py |
diffstat | 2 files changed, 36 insertions(+), 5 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/python/cc/cdx_extras.py Thu Oct 03 18:17:55 2024 +0100
@@ -0,0 +1,27 @@
+#!/usr/bin/python3
+'''Output the necessary for updating a segment index file with lastmod'''
+import sys, re
+
+import unpackz
+
+PAT = re.compile(b'\r\nLast-Modified: ([^\r]*)',re.MULTILINE)
+
+def whereandwhat(count,offset,data,outfile):
+  '''Callback from unpackz to extract what we need'''
+  #if offset == 345053206:
+  #  breakpoint()
+  if data.startswith(b'WARC/1.0\r\nWARC-Type: response\r\n') and \
+     b'Last-Modified: ' in data: # included for speed (worth factor of 3)
+    if (http_start := data.find(b'\r\nHTTP/')) == -1:
+      print("missing HTTP at %s %s"%(count,offset),
+            file=sys.stderr)
+      return
+    if (http_end := data.find(b'\r\n\r\n',http_start)) == -1:
+      print("missing end of HTTP at %s %s"%(count,offset),
+            file=sys.stderr)
+      return
+    if (m:=PAT.search(data,http_start,http_end)):
+      print(count,offset,m[1].decode('UTF-8'),sep='\t')
+
+unpackz.unpackz(sys.argv[1],whereandwhat)
+
--- a/lib/python/cc/unpackz.py Wed Oct 02 19:54:45 2024 +0100 +++ b/lib/python/cc/unpackz.py Thu Oct 03 18:17:55 2024 +0100 @@ -13,18 +13,21 @@ with open(infileName,'rb') as f: z = isal.isal_zlib.decompressobj(31) count = 0 - got = None # Keep the compiler happy + prev_buf = buf = got = None # Keep the compiler happy + ogot = None while True: if z.unused_data == b"": - #print('n', obuf_len, file=sys.stderr) + #print('n', obuf_len, file=sys.stderr) + if ogot is not None: + ogot = ogot + got + else: + ogot = got if lastbuf: # buf == b"": callback(obuf_len, offset, got, outfile) if count!=0: print("Unused data: count=%s offset=%s ?"%(count, offset), file=sys.stderr) break - if nbuf: - obuf_len += BUFSIZE # still no EOS after a full buffer processed buf = f.read(BUFSIZE) nbuf = True lastbuf = ((truesize:=len(buf)) < BUFSIZE) # will only succeed if now at EOF @@ -36,7 +39,8 @@ (len(buf)-len(z.unused_data)) #if (offset == 1352249): # breakpoint() - callback(count, offset, got, outfile) + callback(count, offset, got if ogot is None else ogot + got, outfile) + ogot = None offset += count count = 0 buf = z.unused_data