Mercurial > hg > cc > cirrus_home
comparison bin/ix.py @ 111:3119bca71181
warc and headers parts working
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 26 Apr 2021 15:28:23 +0000 |
parents | f148c2366faa |
children | 6467024cd072 |
comparison
equal
deleted
inserted
replaced
110:f148c2366faa | 111:3119bca71181 |
---|---|
37 l=infile.readinto(buf) | 37 l=infile.readinto(buf) |
38 if l==0: | 38 if l==0: |
39 break | 39 break |
40 outfile.write(memoryview(buf)[:l]) | 40 outfile.write(memoryview(buf)[:l]) |
41 file=open(rfn,'rb',0) | 41 file=open(rfn,'rb',0) |
42 if whole: | 42 file.seek(offset) |
43 # try external unzip using Popen | 43 bv=memoryview(buf)[:length] |
44 file.seek(offset) | 44 nb=file.readinto(bv) |
45 bv=memoryview(buf)[:length] | 45 file.close() |
46 nb=file.readinto(bv) | 46 if nb!=length: |
47 file.close() | 47 print("losing",file.name,length,nb,file=sys.stderr) |
48 if nb!=length: | 48 if whole and options.zipped: |
49 print("losing",file.name,length,nb,file=sys.stderr) | 49 BINOUT.write(bv) |
50 if options.zipped: | 50 return |
51 BINOUT.write(bv) | 51 gzip_chunk = io.BytesIO(bv) |
52 else: | 52 uv=memoryview(buf)[length:] |
53 gzip_chunk = io.BytesIO(bv) | 53 with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin: |
54 uv=memoryview(buf)[length:] | 54 ll=0 |
55 #clear_bytes=io.BytesIO(uv) | 55 while True: |
56 with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin: | 56 l=gzip_fin.readinto(uv) |
57 while True: | 57 if not l: |
58 l=gzip_fin.readinto(uv) | 58 break |
59 if not l: | 59 ll+=l |
60 break | 60 cb=memoryview(uv)[:ll] |
61 BINOUT.write(memoryview(uv)[:l]) | 61 if whole: |
62 BINOUT.write(cb) | |
63 return | |
64 # only parts wanted | |
65 # Note that _unlike the above_ this strips the ^M from the output lines | |
66 # so we are _not_ idempotent | |
67 state=0 | |
68 tr=None | |
69 with io.TextIOWrapper(io.BytesIO(cb),encoding='iso-8859-1', | |
70 newline='\r\n') as clear_text: | |
71 for L in clear_text: | |
72 if state==0: | |
73 # WARC header | |
74 if L.startswith("Content-Length: "): | |
75 wl=int(L[16:].rstrip()) | |
76 elif L.startswith("WARC-Truncated: "): | |
77 tr=L[16:].rstrip() | |
78 tr="EMPTY" if tr=="" else tr | |
79 elif L.startswith("\r"): # make us idempotent | |
80 if not (options.headers or options.body): | |
81 return | |
82 state=1 | |
83 bl=None | |
84 if options.warc: | |
85 # preserve the empty line | |
86 print() | |
87 continue | |
88 if options.warc: | |
89 print(L.rstrip()) | |
90 continue | |
91 if state==1: | |
92 # HTTP header | |
93 wl -= len(L) | |
94 if L.startswith("Content-Length: "): | |
95 bl=int(L[16:].rstrip()) | |
96 elif L=="" or L.startswith("\r"): | |
97 if not options.body: | |
98 return | |
99 state=2 | |
100 if options.headers: | |
101 # preserve the empty line | |
102 print() | |
103 if bl is not None: | |
104 if bl!=wl: | |
105 print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ | |
106 (length,offset,filename,wl,bl,tr),file=sys.stderr) | |
107 continue | |
108 if options.headers: | |
109 print(L.rstrip()) | |
110 continue | |
111 # HTTP body | |
112 if options.body: | |
113 sys.stdout.flush() | |
114 BINOUT.write(cb[clear_text.tell():]) | |
115 return | |
62 | 116 |
63 def main(): | 117 def main(): |
64 parser = argparse.ArgumentParser( | 118 parser = argparse.ArgumentParser( |
65 description='''Extract records from warc files given length, offset and file triples. | 119 description='''Extract records from warc files given length, offset and file triples. |
66 Input one triple on command line, or | 120 Input one triple on command line, or |