comparison bin/ix.py @ 114:6467024cd072

all parts working, idempotency achieved
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 26 Apr 2021 17:18:29 +0000
parents 3119bca71181
children 63898fde9751
comparison
equal deleted inserted replaced
113:1d6fde73789d 114:6467024cd072
60 cb=memoryview(uv)[:ll] 60 cb=memoryview(uv)[:ll]
61 if whole: 61 if whole:
62 BINOUT.write(cb) 62 BINOUT.write(cb)
63 return 63 return
64 # only parts wanted 64 # only parts wanted
65 # Note that _unlike the above_ this strips the ^M from the output lines
66 # so we are _not_ idempotent
67 state=0 65 state=0
68 tr=None 66 tr=None
69 with io.TextIOWrapper(io.BytesIO(cb),encoding='iso-8859-1', 67 with io.BytesIO(cb) as clear_text:
70 newline='\r\n') as clear_text:
71 for L in clear_text: 68 for L in clear_text:
72 if state==0: 69 if state==0:
73 # WARC header 70 # WARC header
74 if L.startswith("Content-Length: "): 71 if L.startswith(b"Content-Length: "):
75 wl=int(L[16:].rstrip()) 72 wl=int(L[16:].rstrip())
76 elif L.startswith("WARC-Truncated: "): 73 elif L.startswith(b"WARC-Truncated: "):
77 tr=L[16:].rstrip() 74 tr=L[16:].rstrip()
78 tr="EMPTY" if tr=="" else tr 75 tr="EMPTY" if tr=="" else tr
79 elif L.startswith("\r"): # make us idempotent 76 elif L=='' or L.startswith(b"\r"): # for idempotency
80 if not (options.headers or options.body): 77 if not (options.headers or options.body):
81 return 78 return
82 state=1 79 state=1
83 bl=None 80 bl=None
84 if options.warc: 81 # Note we preserve the empty line
85 # preserve the empty line
86 print()
87 continue
88 if options.warc: 82 if options.warc:
89 print(L.rstrip()) 83 BINOUT.write(L)
90 continue 84 continue
91 if state==1: 85 if state==1:
92 # HTTP header 86 # HTTP header
93 wl -= len(L) 87 wl -= len(L)
94 if L.startswith("Content-Length: "): 88 if L.startswith(b"Content-Length: "):
95 bl=int(L[16:].rstrip()) 89 bl=int(L[16:].rstrip())
96 elif L=="" or L.startswith("\r"): 90 elif L==b"" or L.startswith(b"\r"):
97 if not options.body: 91 if not options.body:
98 return 92 return
99 state=2 93 state=2
100 if options.headers:
101 # preserve the empty line
102 print()
103 if bl is not None: 94 if bl is not None:
104 if bl!=wl: 95 if bl!=wl:
105 print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ 96 print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
106 (length,offset,filename,wl,bl,tr),file=sys.stderr) 97 (length,offset,filename,wl,bl,tr),file=sys.stderr)
107 continue 98 # HTTP body
99 if options.body:
100 balance=clear_text.tell()
101 # Write this line along with whatever is left in the buffer...
102 BINOUT.write(cb[balance-2:])
103 return
108 if options.headers: 104 if options.headers:
109 print(L.rstrip()) 105 BINOUT.write(L)
110 continue
111 # HTTP body
112 if options.body:
113 sys.stdout.flush()
114 BINOUT.write(cb[clear_text.tell():])
115 return
116 106
117 def main(): 107 def main():
118 parser = argparse.ArgumentParser( 108 parser = argparse.ArgumentParser(
119 description='''Extract records from warc files given length, offset and file triples. 109 description='''Extract records from warc files given length, offset and file triples.
120 Input one triple on command line, or 110 Input one triple on command line, or