Mercurial > hg > cc > cirrus_home
comparison bin/ix.py @ 114:6467024cd072
all parts working, idempotency achieved
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 26 Apr 2021 17:18:29 +0000 |
parents | 3119bca71181 |
children | 63898fde9751 |
comparison
equal
deleted
inserted
replaced
113:1d6fde73789d | 114:6467024cd072 |
---|---|
60 cb=memoryview(uv)[:ll] | 60 cb=memoryview(uv)[:ll] |
61 if whole: | 61 if whole: |
62 BINOUT.write(cb) | 62 BINOUT.write(cb) |
63 return | 63 return |
64 # only parts wanted | 64 # only parts wanted |
65 # Note that _unlike the above_ this strips the ^M from the output lines | |
66 # so we are _not_ idempotent | |
67 state=0 | 65 state=0 |
68 tr=None | 66 tr=None |
69 with io.TextIOWrapper(io.BytesIO(cb),encoding='iso-8859-1', | 67 with io.BytesIO(cb) as clear_text: |
70 newline='\r\n') as clear_text: | |
71 for L in clear_text: | 68 for L in clear_text: |
72 if state==0: | 69 if state==0: |
73 # WARC header | 70 # WARC header |
74 if L.startswith("Content-Length: "): | 71 if L.startswith(b"Content-Length: "): |
75 wl=int(L[16:].rstrip()) | 72 wl=int(L[16:].rstrip()) |
76 elif L.startswith("WARC-Truncated: "): | 73 elif L.startswith(b"WARC-Truncated: "): |
77 tr=L[16:].rstrip() | 74 tr=L[16:].rstrip() |
78 tr="EMPTY" if tr=="" else tr | 75 tr="EMPTY" if tr=="" else tr |
79 elif L.startswith("\r"): # make us idempotent | 76 elif L=='' or L.startswith(b"\r"): # for idempotency |
80 if not (options.headers or options.body): | 77 if not (options.headers or options.body): |
81 return | 78 return |
82 state=1 | 79 state=1 |
83 bl=None | 80 bl=None |
84 if options.warc: | 81 # Note we preserve the empty line |
85 # preserve the empty line | |
86 print() | |
87 continue | |
88 if options.warc: | 82 if options.warc: |
89 print(L.rstrip()) | 83 BINOUT.write(L) |
90 continue | 84 continue |
91 if state==1: | 85 if state==1: |
92 # HTTP header | 86 # HTTP header |
93 wl -= len(L) | 87 wl -= len(L) |
94 if L.startswith("Content-Length: "): | 88 if L.startswith(b"Content-Length: "): |
95 bl=int(L[16:].rstrip()) | 89 bl=int(L[16:].rstrip()) |
96 elif L=="" or L.startswith("\r"): | 90 elif L==b"" or L.startswith(b"\r"): |
97 if not options.body: | 91 if not options.body: |
98 return | 92 return |
99 state=2 | 93 state=2 |
100 if options.headers: | |
101 # preserve the empty line | |
102 print() | |
103 if bl is not None: | 94 if bl is not None: |
104 if bl!=wl: | 95 if bl!=wl: |
105 print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ | 96 print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ |
106 (length,offset,filename,wl,bl,tr),file=sys.stderr) | 97 (length,offset,filename,wl,bl,tr),file=sys.stderr) |
107 continue | 98 # HTTP body |
99 if options.body: | |
100 balance=clear_text.tell() | |
101 # Write this line together with whatever is left in the buffer... | |
102 BINOUT.write(cb[balance-2:]) | |
103 return | |
108 if options.headers: | 104 if options.headers: |
109 print(L.rstrip()) | 105 BINOUT.write(L) |
110 continue | |
111 # HTTP body | |
112 if options.body: | |
113 sys.stdout.flush() | |
114 BINOUT.write(cb[clear_text.tell():]) | |
115 return | |
116 | 106 |
117 def main(): | 107 def main(): |
118 parser = argparse.ArgumentParser( | 108 parser = argparse.ArgumentParser( |
119 description='''Extract records from warc files given length, offset and file triples. | 109 description='''Extract records from warc files given length, offset and file triples. |
120 Input one triple on command line, or | 110 Input one triple on command line, or |