Mercurial > hg > cc > cirrus_work
comparison bin/warc.py @ 53:0dc144bd027c
made 1 mean 1, still losing after a while
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 07 Jul 2023 19:04:16 +0100 |
parents | c0b4359dd26a |
children | 9c63039a9b6d |
comparison
equal
deleted
inserted
replaced
52:e20c64917805 | 53:0dc144bd027c |
---|---|
16 bufSize=2*1024*1024 | 16 bufSize=2*1024*1024 |
17 hdrMax=16*1024 | 17 hdrMax=16*1024 |
18 buf=bytearray(bufSize) | 18 buf=bytearray(bufSize) |
19 hdrBuf=memoryview(buf)[:hdrMax] | 19 hdrBuf=memoryview(buf)[:hdrMax] |
20 fpos=bl=stream.readinto(hdrBuf) | 20 fpos=bl=stream.readinto(hdrBuf) |
21 bob=0 | |
22 while True: | 21 while True: |
23 bp=0 | 22 bp=0 |
24 while buf.startswith(b'\r\n',bp): | 23 while buf.startswith(b'\r\n',bp): |
25 bp+=2 | 24 bp+=2 |
| 25 bob=bp |
26 if not buf.startswith(b'WARC/1.0\r\n',bp): | 26 if not buf.startswith(b'WARC/1.0\r\n',bp): |
27 raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp, | 27 raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp, |
28 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp)) | 28 buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp)) |
29 bp+=10 | 29 bp+=10 |
30 wtype=None | 30 wtype=None |
46 if (wtype in types): | 46 if (wtype in types): |
47 if whole: | 47 if whole: |
48 pass # buf[bp:(bp:=bp+ln)]=l | 48 pass # buf[bp:(bp:=bp+ln)]=l |
49 elif (parts & 1): | 49 elif (parts & 1): |
50 print('cb') | 50 print('cb') |
51 OUT=callback(wtype,buf[bp:eol+length],1) | 51 OUT=callback(wtype,buf[bob:eol],1) |
52 if parts!=1: | 52 if parts!=1: |
53 # everything from bv= goes here | 53 # everything from bv= goes here |
54 # need to read more if eol+length>hdrMax | 54 # need to read more if eol+length>hdrMax |
55 pass | 55 pass |
56 print(wtype,bob,eol,length) | 56 print(wtype,bob,eol,length) |