Mercurial > hg > cc > cirrus_home
comparison bin/warc.py @ 175:d123ef7fdb82
Working on implementing types and parts:
1, 2 and 4 are working; 3 is not.
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 03 Jul 2023 18:16:14 +0100 |
parents | e96d444b0f84 |
children | 97137f5bbe0f |
comparison legend: equal | deleted | inserted | replaced
174:bfe9085a1d39 | 175:d123ef7fdb82 |
---|---|
1 #!/usr/bin/env python3 | 1 #!/usr/bin/env python3 |
2 '''Stream a warc format file, invoking a callback on each part. | 2 '''Stream a warc format file, invoking a callback on each record. |
3 Callback can be limited by WARC-Type''' | 3 Callback can be limited by WARC-Type, record part''' |
4 import sys,os | 4 import sys,os,io |
5 | 5 |
6 def warc(callback,types=['response']): | 6 if (debug:=(sys.argv[1]=='-d')): |
7 sys.argv.pop(1) | |
8 | |
9 def warc(callback,types=['response'],parts=7): | |
10 types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types] | |
7 nb=0 | 11 nb=0 |
8 stream=open(sys.argv[1],'rb',0) | 12 stream=open(sys.argv[1],'rb',0) |
9 bufsize=128*1024*1024 | 13 bufsize=128*1024*1024 |
10 buf=bytearray(128*1024*1024) | 14 buf=bytearray(128*1024*1024) |
11 l=b'\r\n' | 15 l=b'\r\n' |
12 while True: | 16 while True: |
17 bp=0 | |
13 while l==b'\r\n': | 18 while l==b'\r\n': |
14 l=stream.readline() | 19 l=stream.readline() |
15 nb+=len(l) | 20 nb+=(ln:=len(l)) |
16 if l!=b'WARC/1.0\r\n': | 21 if l!=b'WARC/1.0\r\n': |
17 if l==b'': | |
18 return | |
19 raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l), | 22 raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l), |
20 l.decode('latin-1'),len(l))) | 23 l.decode('latin-1'),len(l))) |
21 wtype=None | 24 wtype=None |
22 length=None | 25 length=None |
26 state=1 | |
27 tr=None # Was this record truncated? | |
23 while l!=b'\r\n': | 28 while l!=b'\r\n': |
29 if parts & 1: | |
30 buf[bp:(bp:=bp+ln)]=l | |
24 l=stream.readline() | 31 l=stream.readline() |
25 nb+=len(l) | 32 nb+=(ln:=len(l)) |
26 if l.startswith(b'WARC-Type: '): | 33 # WARC header |
34 if l.startswith(b"Content-Length: "): | |
35 length=wl=int(l[16:].rstrip()) | |
36 elif l.startswith(b"WARC-Truncated: "): | |
37 tr=l[16:].rstrip() | |
38 tr="EMPTY" if tr=="" else tr | |
39 elif l.startswith(b'WARC-Type: '): | |
27 wtype = l[11:-2] | 40 wtype = l[11:-2] |
28 elif l.startswith(b'Content-Length: '): | 41 start_2=bp |
29 length = int(l[16:]) | 42 if (wtype in types) and (parts & 1): |
30 bv=memoryview(buf)[:length] | 43 if parts!=1: |
44 buf[bp:(bp:=bp+ln)]=l | |
45 start_2=bp | |
46 if parts!=7: | |
47 callback(wtype,buf[:start_2],1) | |
48 else: | |
49 start_2=0 | |
50 bv=memoryview(buf)[start_2:start_2+length] | |
31 ii=0 | 51 ii=0 |
32 while True: | 52 while True and not stream.closed: |
33 i=stream.readinto(bv) | 53 if (i:=stream.readinto(bv))==0: |
54 break | |
34 ii+=i | 55 ii+=i |
35 if ii>=length: | 56 if ii>=length: |
36 break | 57 break |
37 bv=memoryview(buf)[ii:length] | 58 bv=memoryview(buf)[start_2+ii:start_2+length] |
38 if ii!=length: | 59 if ii!=length: |
39 raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length)) | 60 raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,ii,length)) |
40 nb+=length | 61 nb+=length |
62 bv=memoryview(buf)[start_2:start_2+length] | |
41 if wtype in types: | 63 if wtype in types: |
42 callback(wtype,memoryview(buf[:length])) | 64 if parts==7: |
43 if whole and options.zipped: | 65 callback(wtype,memoryview(buf)[0:start_2+length],7) |
44 _output(bv) | |
45 return | |
46 gzip_chunk = io.BytesIO(bv) | |
47 uv=memoryview(buf)[length:] | |
48 with igzip.IGzipFile(fileobj=gzip_chunk) as gzip_fin: | |
49 ll=0 | |
50 while True: | |
51 l=gzip_fin.readinto(uv) | |
52 if not l: | |
53 break | |
54 ll+=l | |
55 cb=memoryview(uv)[:ll] | |
56 if whole: | |
57 _output(cb) | |
58 return | |
59 # Only output parts (0 = WARC header, 1 = HTTP header, 2 = body) that are wanted | |
60 state=0 | |
61 tr=None # Was this record truncated? | |
62 bl=None # for HTTP Content-Length for the length of the body? | |
63 with io.BytesIO(cb) as clear_text: | |
64 for L in clear_text: | |
65 if state==0: | |
66 # WARC header | |
67 if L.startswith(b"Content-Length: "): | |
68 wl=int(L[16:].rstrip()) | |
69 elif L.startswith(b"WARC-Truncated: "): | |
70 tr=L[16:].rstrip() | |
71 tr="EMPTY" if tr=="" else tr | |
72 elif L==b"" or L.startswith(b"\r"): # for idempotency | |
73 # Blank line, WARC header is finished | |
74 if not (options.headers or options.body): | |
75 return | |
76 state=1 | |
77 # Note we preserve the empty line | |
78 if options.warc: | |
79 _output(L) | |
80 continue | 66 continue |
81 if state==1: | 67 # Only output parts (1 = WARC header, 2 = HTTP header, 4 = body) that are wanted |
82 # HTTP header | 68 bl=None # for HTTP Content-Length for the length of the body? |
83 wl -= len(L) | 69 L_start=0 |
84 if not (L==b"" or L.startswith(b"\r")): | 70 state=2 |
85 # Non-blank, it's a header | 71 with io.BytesIO(bv) as rec_text: |
86 if bl is None and L.startswith(b"Content-Length: "): | 72 for L in rec_text: |
87 bl=int(L[16:].rstrip()) | 73 if state==2: |
88 if options.headers: | 74 # HTTP header |
89 _output(L) | 75 wl -= len(L) |
90 else: | 76 if not (L==b"" or L.startswith(b"\r")): |
91 # Blank line, HTTP header is finished | 77 # Non-empty, it's (a continuation of) a header |
92 if not options.body: | 78 if bl is None and L.startswith(b"Content-Length: "): |
93 return | 79 bl=int(L[16:].rstrip()) |
94 if options.headers: | 80 else: |
95 _output(L) | 81 # Blank line, HTTP header is finished |
96 state=2 | 82 if parts & 2: |
97 # The above is just for sanity, because we do _not_ | 83 callback(wtype,bv[start_2:L_start],2) |
98 # continue with the outer loop, | 84 state=4 |
99 # since we can now block-output the entire rest of the | 85 # The above is just for sanity, because we do _not_ |
100 # input buffer. | 86 # continue with the outer loop, |
101 if bl is not None: | 87 # since we can now block-output the entire rest of the |
102 if bl!=wl: | 88 # input buffer. |
103 print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ | 89 if bl is not None: |
104 (length,offset,filename,wl,bl,tr),file=sys.stderr) | 90 if bl!=wl: |
105 # HTTP body | 91 print("length mismatch: %s %s %s here: %s given: %s trunc: %s"%\ |
106 balance=clear_text.tell() | 92 (length,offset,filename,wl,bl,tr),file=sys.stderr) |
107 #print(balance,bl,wl,ll,ll-balance,file=sys.stderr) | 93 # HTTP body |
108 # Output whatever is left | 94 balance=rec_text.tell() |
109 _output(cb[balance:balance+wl]) | 95 #print(balance,bl,wl,ll,ll-balance,file=sys.stderr) |
110 return | 96 # Output whatever is left |
111 | 97 if parts & 4: |
98 callback(wtype,bv[balance:balance+wl],4) | |
99 state=1 | |
100 | |
101 L_start=rec_text.tell() | |
112 OUT=open(sys.stdout.fileno(),'wb') | 102 OUT=open(sys.stdout.fileno(),'wb') |
113 | 103 |
114 import re | 104 import re |
115 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) | 105 LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE) |
116 | 106 |
117 def showmeLMH(wtype,buf): | 107 def showmeLMH(wtype,buf,part=2): |
118 m=LMPAT.search(buf.tobytes(order='A')) | 108 m=LMPAT.search(buf.tobytes(order='A')) |
119 if m: | 109 if m: |
120 OUT.write(m[1]) | 110 OUT.write(m[1]) |
121 OUT.write(b'\n') | 111 OUT.write(b'\n') |
122 | 112 |
123 def showme(wtype,buf): | 113 def showme(wtype,buf,part): |
124 OUT.write(buf) | 114 if debug: |
115 breakpoint() | |
116 OUT.write(b"%d\n%b"%(part,buf)) | |
125 | 117 |
126 warc(showmeLMH,[b'response']) | 118 #warc(showmeLMH,[b'response'],2) |
119 | |
120 #warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2])) | |
121 | |
122 warc(showme,[b'response'],int(sys.argv[2])) |