comparison bin/warc.py @ 175:d123ef7fdb82

working on implementing types and parts: 1, 2, 4 working, 3 not
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 03 Jul 2023 18:16:14 +0100
parents e96d444b0f84
children 97137f5bbe0f
comparison
equal deleted inserted replaced
174:bfe9085a1d39 175:d123ef7fdb82
1 #!/usr/bin/env python3 1 #!/usr/bin/env python3
2 '''Stream a warc format file, invoking a callback on each part. 2 '''Stream a warc format file, invoking a callback on each record.
3 Callback can be limited by WARC-Type''' 3 Callback can be limited by WARC-Type, record part'''
4 import sys,os 4 import sys,os,io
5 5
6 def warc(callback,types=['response']): 6 if (debug:=(sys.argv[1]=='-d')):
7 sys.argv.pop(1)
8
9 def warc(callback,types=['response'],parts=7):
10 types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types]
7 nb=0 11 nb=0
8 stream=open(sys.argv[1],'rb',0) 12 stream=open(sys.argv[1],'rb',0)
9 bufsize=128*1024*1024 13 bufsize=128*1024*1024
10 buf=bytearray(128*1024*1024) 14 buf=bytearray(128*1024*1024)
11 l=b'\r\n' 15 l=b'\r\n'
12 while True: 16 while True:
17 bp=0
13 while l==b'\r\n': 18 while l==b'\r\n':
14 l=stream.readline() 19 l=stream.readline()
15 nb+=len(l) 20 nb+=(ln:=len(l))
16 if l!=b'WARC/1.0\r\n': 21 if l!=b'WARC/1.0\r\n':
17 if l==b'':
18 return
19 raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l), 22 raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l),
20 l.decode('latin-1'),len(l))) 23 l.decode('latin-1'),len(l)))
21 wtype=None 24 wtype=None
22 length=None 25 length=None
26 state=1
27 tr=None # Was this record truncated?
23 while l!=b'\r\n': 28 while l!=b'\r\n':
29 if parts & 1:
30 buf[bp:(bp:=bp+ln)]=l
24 l=stream.readline() 31 l=stream.readline()
25 nb+=len(l) 32 nb+=(ln:=len(l))
26 if l.startswith(b'WARC-Type: '): 33 # WARC header
34 if l.startswith(b"Content-Length: "):
35 length=wl=int(l[16:].rstrip())
36 elif l.startswith(b"WARC-Truncated: "):
37 tr=l[16:].rstrip()
38 tr="EMPTY" if tr=="" else tr
39 elif l.startswith(b'WARC-Type: '):
27 wtype = l[11:-2] 40 wtype = l[11:-2]
28 elif l.startswith(b'Content-Length: '): 41 start_2=bp
29 length = int(l[16:]) 42 if (wtype in types) and (parts & 1):
30 bv=memoryview(buf)[:length] 43 if parts!=1:
44 buf[bp:(bp:=bp+ln)]=l
45 start_2=bp
46 if parts!=7:
47 callback(wtype,buf[:start_2],1)
48 else:
49 start_2=0
50 bv=memoryview(buf)[start_2:start_2+length]
31 ii=0 51 ii=0
32 while True: 52 while True and not stream.closed:
33 i=stream.readinto(bv) 53 if (i:=stream.readinto(bv))==0:
54 break
34 ii+=i 55 ii+=i
35 if ii>=length: 56 if ii>=length:
36 break 57 break
37 bv=memoryview(buf)[ii:length] 58 bv=memoryview(buf)[start_2+ii:start_2+length]
38 if ii!=length: 59 if ii!=length:
39 raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length)) 60 raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length))
40 nb+=length 61 nb+=length
62 bv=memoryview(buf)[start_2:start_2+length]
41 if wtype in types: 63 if wtype in types:
42 callback(wtype,memoryview(buf[:length])) 64 if parts==7:
43 if whole and options.zipped: 65 callback(wtype,memoryview(buf)[0:start_2+length],7)
44 _output(bv)
45 return
46 gzip_chunk = io.BytesIO(bv)
47 uv=memoryview(buf)[length:]
48 with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin:
49 ll=0
50 while True:
51 l=gzip_fin.readinto(uv)
52 if not l:
53 break
54 ll+=l
55 cb=memoryview(uv)[:ll]
56 if whole:
57 _output(cb)
58 return
59 # Only output parts (0 = WARC header, 1 = HTTP header, 2 = body) that are wanted
60 state=0
61 tr=None # Was this record truncated?
62 bl=None # for HTTP Content-Length for the length of the body?
63 with io.BytesIO(cb) as clear_text:
64 for L in clear_text:
65 if state==0:
66 # WARC header
67 if L.startswith(b"Content-Length: "):
68 wl=int(L[16:].rstrip())
69 elif L.startswith(b"WARC-Truncated: "):
70 tr=L[16:].rstrip()
71 tr="EMPTY" if tr=="" else tr
72 elif L==b"" or L.startswith(b"\r"): # for idempotency
73 # Blank line, WARC header is finished
74 if not (options.headers or options.body):
75 return
76 state=1
77 # Note we preserve the empty line
78 if options.warc:
79 _output(L)
80 continue 66 continue
81 if state==1: 67 # Only output parts (1 = WARC header, 2 = HTTP header, 4 = body) that are wanted
82 # HTTP header 68 bl=None # for HTTP Content-Length for the length of the body?
83 wl -= len(L) 69 L_start=0
84 if not (L==b"" or L.startswith(b"\r")): 70 state=2
85 # Non-blank, it's a header 71 with io.BytesIO(bv) as rec_text:
86 if bl is None and L.startswith(b"Content-Length: "): 72 for L in rec_text:
87 bl=int(L[16:].rstrip()) 73 if state==2:
88 if options.headers: 74 # HTTP header
89 _output(L) 75 wl -= len(L)
90 else: 76 if not (L==b"" or L.startswith(b"\r")):
91 # Blank line, HTTP header is finished 77 # Non-empty, it's (a continuation of) a header
92 if not options.body: 78 if bl is None and L.startswith(b"Content-Length: "):
93 return 79 bl=int(L[16:].rstrip())
94 if options.headers: 80 else:
95 _output(L) 81 # Blank line, HTTP header is finished
96 state=2 82 if parts & 2:
97 # The above is just for sanity, because we do _not_ 83 callback(wtype,bv[start_2:L_start],2)
98 # continue with the outer loop, 84 state=4
99 # since we can now block-output the entire rest of the 85 # The above is just for sanity, because we do _not_
100 # input buffer. 86 # continue with the outer loop,
101 if bl is not None: 87 # since we can now block-output the entire rest of the
102 if bl!=wl: 88 # input buffer.
103 print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ 89 if bl is not None:
104 (length,offset,filename,wl,bl,tr),file=sys.stderr) 90 if bl!=wl:
105 # HTTP body 91 print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\
106 balance=clear_text.tell() 92 (length,offset,filename,wl,bl,tr),file=sys.stderr)
107 #print(balance,bl,wl,ll,ll-balance,file=sys.stderr) 93 # HTTP body
108 # Output whatever is left 94 balance=rec_text.tell()
109 _output(cb[balance:balance+wl]) 95 #print(balance,bl,wl,ll,ll-balance,file=sys.stderr)
110 return 96 # Output whatever is left
111 97 if parts & 4:
98 callback(wtype,bv[balance:balance+wl],4)
99 state=1
100
101 L_start=rec_text.tell()
112 OUT=open(sys.stdout.fileno(),'wb') 102 OUT=open(sys.stdout.fileno(),'wb')
113 103
114 import re 104 import re
115 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) 105 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
116 106
117 def showmeLMH(wtype,buf): 107 def showmeLMH(wtype,buf,part=2):
118 m=LMPAT.search(buf.tobytes(order='A')) 108 m=LMPAT.search(buf.tobytes(order='A'))
119 if m: 109 if m:
120 OUT.write(m[1]) 110 OUT.write(m[1])
121 OUT.write(b'\n') 111 OUT.write(b'\n')
122 112
123 def showme(wtype,buf): 113 def showme(wtype,buf,part):
124 OUT.write(buf) 114 if debug:
115 breakpoint()
116 OUT.write(b"%d\n%b"%(part,buf))
125 117
126 warc(showmeLMH,[b'response']) 118 #warc(showmeLMH,[b'response'],2)
119
120 #warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2]))
121
122 warc(showme,[b'response'],int(sys.argv[2]))