comparison bin/ix.py @ 111:3119bca71181

warc and headers parts working
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 26 Apr 2021 15:28:23 +0000
parents f148c2366faa
children 6467024cd072
comparison
equal deleted inserted replaced
110:f148c2366faa 111:3119bca71181
37 l=infile.readinto(buf) 37 l=infile.readinto(buf)
38 if l==0: 38 if l==0:
39 break 39 break
40 outfile.write(memoryview(buf)[:l]) 40 outfile.write(memoryview(buf)[:l])
41 file=open(rfn,'rb',0) 41 file=open(rfn,'rb',0)
42 if whole: 42 file.seek(offset)
43 # try external unzip using Popen 43 bv=memoryview(buf)[:length]
44 file.seek(offset) 44 nb=file.readinto(bv)
45 bv=memoryview(buf)[:length] 45 file.close()
46 nb=file.readinto(bv) 46 if nb!=length:
47 file.close() 47 print("losing",file.name,length,nb,file=sys.stderr)
48 if nb!=length: 48 if whole and options.zipped:
49 print("losing",file.name,length,nb,file=sys.stderr) 49 BINOUT.write(bv)
50 if options.zipped: 50 return
51 BINOUT.write(bv) 51 gzip_chunk = io.BytesIO(bv)
52 else: 52 uv=memoryview(buf)[length:]
53 gzip_chunk = io.BytesIO(bv) 53 with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin:
54 uv=memoryview(buf)[length:] 54 ll=0
55 #clear_bytes=io.BytesIO(uv) 55 while True:
56 with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin: 56 l=gzip_fin.readinto(uv)
57 while True: 57 if not l:
58 l=gzip_fin.readinto(uv) 58 break
59 if not l: 59 ll+=l
60 break 60 cb=memoryview(uv)[:ll]
61 BINOUT.write(memoryview(uv)[:l]) 61 if whole:
62 BINOUT.write(cb)
63 return
64 # only parts wanted
65 # Note that _unlike the above_ this strips the ^M from the output lines
66 # so we are _not_ idempotent
67 state=0
68 tr=None
69 with io.TextIOWrapper(io.BytesIO(cb),encoding='iso-8859-1',
70 newline='\r\n') as clear_text:
71 for L in clear_text:
72 if state==0:
73 # WARC header
74 if L.startswith("Content-Length: "):
75 wl=int(L[16:].rstrip())
76 elif L.startswith("WARC-Truncated: "):
77 tr=L[16:].rstrip()
78 tr="EMPTY" if tr=="" else tr
79 elif L.startswith("\r"): # make us idempotent
80 if not (options.headers or options.body):
81 return
82 state=1
83 bl=None
84 if options.warc:
85 # preserve the empty line
86 print()
87 continue
88 if options.warc:
89 print(L.rstrip())
90 continue
91 if state==1:
92 # HTTP header
93 wl -= len(L)
94 if L.startswith("Content-Length: "):
95 bl=int(L[16:].rstrip())
96 elif L=="" or L.startswith("\r"):
97 if not options.body:
98 return
99 state=2
100 if options.headers:
101 # preserve the empty line
102 print()
103 if bl is not None:
104 if bl!=wl:
105 print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
106 (length,offset,filename,wl,bl,tr),file=sys.stderr)
107 continue
108 if options.headers:
109 print(L.rstrip())
110 continue
111 # HTTP body
112 if options.body:
113 sys.stdout.flush()
114 BINOUT.write(cb[clear_text.tell():])
115 return
62 116
63 def main(): 117 def main():
64 parser = argparse.ArgumentParser( 118 parser = argparse.ArgumentParser(
65 description='''Extract records from warc files given length, offset and file triples. 119 description='''Extract records from warc files given length, offset and file triples.
66 Input one triple on command line, or 120 Input one triple on command line, or