Mercurial > hg > cc > cirrus_home
annotate bin/warc.py @ 159:c3c3dd60b8a8
demo of slurm usage using cdx2tsv.py
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 06 Jul 2022 18:07:34 +0100 |
parents | e96d444b0f84 |
children | d123ef7fdb82 |
rev | line source |
---|---|
138 | 1 #!/usr/bin/env python3 |
2 '''Stream a warc format file, invoking a callback on each part. | |
3 Callback can be limited by WARC-Type''' | |
139
e96d444b0f84
fixed bug(s) wrt large payload files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
138
diff
changeset
|
4 import sys,os |
138 | 5 |
def warc(callback,types=('response',)):
    '''Stream the WARC file named by sys.argv[1], invoking callback on records.

    callback is called as callback(wtype, payload) for every record whose
    WARC-Type is listed in types.  wtype is the header value as bytes and
    payload is a memoryview of exactly Content-Length bytes.  The view
    aliases a buffer that is reused for the next record, so a callback
    that needs the data afterwards must copy it (e.g. bytes(payload)).

    types may hold bytes or str values; str values are ASCII-encoded
    before comparison, because header values read from the file are bytes.

    Raises ValueError if a record does not start with a WARC/1.0 line,
    has no usable Content-Length, or the file ends inside a record.
    '''
    # Header values are parsed as bytes, so a str entry could never match
    wanted=tuple(t.encode('ascii') if isinstance(t,str) else t for t in types)
    nb=0 # bytes consumed so far, only used in error messages
    bufsize=128*1024*1024
    buf=bytearray(bufsize) # reused for every payload
    # buffering=0 gives a raw stream, so readinto below may return short counts
    with open(sys.argv[1],'rb',0) as stream:
        l=b'\r\n'
        while True:
            # Skip the blank lines that separate records
            while l==b'\r\n':
                l=stream.readline()
                nb+=len(l)
            if l!=b'WARC/1.0\r\n':
                if l==b'':
                    # End of file at a record boundary: finished
                    return
                raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l),
                                 l.decode('latin-1'),len(l)))
            wtype=None
            length=None
            # WARC header block, terminated by an empty line
            while l!=b'\r\n':
                l=stream.readline()
                if l==b'':
                    # Without this check a truncated header would loop forever
                    raise ValueError("Unexpected end of file in WARC header at %s"%nb)
                nb+=len(l)
                if l.startswith(b'WARC-Type: '):
                    wtype=l[11:-2]
                elif l.startswith(b'Content-Length: '):
                    length=int(l[16:])
            if length is None or length<0:
                raise ValueError("No usable Content-Length in WARC header ending at %s"%nb)
            if length>len(buf):
                # Payload larger than the buffer: replace it with a bigger one
                buf=bytearray(length)
            view=memoryview(buf)
            got=0
            # A single readinto may deliver less than requested, so keep going
            while got<length:
                n=stream.readinto(view[got:length])
                if not n:
                    # 0 bytes means end of file in the middle of the payload
                    raise ValueError("Chunk read losing, from %s got %s expected %s"%(nb,got,length))
                got+=n
            nb+=length
            if wtype in wanted:
                callback(wtype,view[:length])
111 | |
# Binary-mode handle on the standard output file descriptor; the callbacks
# below write raw bytes to it
OUT=open(sys.stdout.fileno(),'wb')
113 | |
139
e96d444b0f84
fixed bug(s) wrt large payload files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
138
diff
changeset
|
114 import re |
e96d444b0f84
fixed bug(s) wrt large payload files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
138
diff
changeset
|
# Matches a line starting 'Last-Modified: ' and captures, as group 1, the
# text between that prefix and the next carriage return
LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
e96d444b0f84
fixed bug(s) wrt large payload files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
138
diff
changeset
|
116 |
e96d444b0f84
fixed bug(s) wrt large payload files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
138
diff
changeset
|
def showmeLMH(wtype,buf):
    '''Callback for warc: write a record's Last-Modified value to OUT.

    buf is the record payload as a memoryview; it is copied to bytes and
    searched with LMPAT.  wtype is accepted but not used.
    '''
    found=LMPAT.search(buf.tobytes(order='A'))
    if found is None:
        return
    # NOTE(review): the newline is written only when a value was found --
    # confirm this matches the intended one-line-per-match output
    OUT.write(found.group(1))
    OUT.write(b'\n')
e96d444b0f84
fixed bug(s) wrt large payload files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
138
diff
changeset
|
122 |
def showme(wtype, buf):
    '''Callback for warc: write the record payload buf to OUT unchanged.

    wtype is accepted but not used.
    '''
    OUT.write(buf)
125 | |
139
e96d444b0f84
fixed bug(s) wrt large payload files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
138
diff
changeset
|
# Runs whenever this file is executed or imported: scan the file named by
# sys.argv[1] and write the Last-Modified values found in 'response' records
warc(showmeLMH,[b'response'])