changeset 42:689a0e311cd2

make warc.py a library, separate out testing
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 05 Jul 2023 15:37:16 +0100
parents fa43c318749b
children 69be1131bcc5
files bin/lmh_warc.py bin/test_warc.py bin/warc.py
diffstat 3 files changed, 43 insertions(+), 38 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/lmh_warc.py	Wed Jul 05 15:37:16 2023 +0100
@@ -0,0 +1,21 @@
+import re,sys,warc
+TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
+LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
+OUT=open(sys.stdout.fileno(),'wb') # binary stdout: the URI and header values are bytes
+
+def showmeLMH(wtype,buf,part):
+  '''warc callback: save the target URI on part 1, else write a URI[<tab>Last-Modified] line'''
+  global URI
+  if part==1:
+    if (m:=TUPAT.search(buf)):
+      URI=m[1] # kept in a global so the call for a later part can print it
+    else:
+      raise ValueError(b"No target URI in %s ??"%buf)
+  else:
+    m=LMPAT.search(buf)
+    OUT.write(URI)
+    if m:
+      OUT.write(b'\t'+m[1]) # Last-Modified is optional: URI alone if absent
+    OUT.write(b'\n')
+
+warc.warc(sys.argv[1],showmeLMH,[b'response'],parts=3) # the file to read is now warc's first argument
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/test_warc.py	Wed Jul 05 15:37:16 2023 +0100
@@ -0,0 +1,17 @@
+import warc,sys
+# Usage: test_warc.py [-d] FILE PARTS -- FILE and PARTS are passed on to warc.warc below
+OUT=open(sys.stdout.fileno(),'wb') # binary stdout, because showme writes bytes
+# A leading -d turns on debugging and is removed so FILE and PARTS keep fixed positions
+if (debug:=(sys.argv[1]=='-d')):
+  sys.argv.pop(1)
+
+def showme(wtype,buf,part): # warc callback: echo the part number and the part itself
+  if debug:
+    breakpoint() # stop in the debugger on every callback
+  OUT.write(b"%d\n%b"%(part,buf)) # part number on its own line, then the raw buffer
+
+# Earlier invocations, written before warc() took a filename as its first argument:
+#warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2]))
+#warc(showme,[b'response'],whole=True)
+
+warc.warc(sys.argv[1],showme,[b'response'],parts=int(sys.argv[2]),debug=debug)
--- a/bin/warc.py	Wed Jul 05 15:12:54 2023 +0100
+++ b/bin/warc.py	Wed Jul 05 15:37:16 2023 +0100
@@ -1,15 +1,14 @@
 #!/usr/bin/env python3
-'''Stream a warc format file, invoking a callback on each record.
+'''Stream a gzipped warc format file, invoking a callback on each record.
 Callback can be limited by WARC-Type, record part'''
-import sys,os,io
 
-if (debug:=(sys.argv[1]=='-d')):
-  sys.argv.pop(1)
+import sys,io
+from isal import igzip
 
-def warc(callback,types=['response'],whole=False,parts=7):
+def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False):
   types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types]
   nb=0
-  stream=open(sys.argv[1],'rb',0)
+  stream=open(filename,'rb',0)
   bufsize=128*1024*1024
   buf=bytearray(128*1024*1024)
   l=b'\r\n'
@@ -104,35 +103,3 @@
               state=1
               
           L_start=rec_text.tell()
-OUT=open(sys.stdout.fileno(),'wb')
-
-import re
-TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
-LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
-
-def showmeLMH(wtype,buf,part):
-  global URI
-  if part==1:
-    if (m:=TUPAT.search(buf)):
-      URI=m[1]
-    else:
-      raise ValueError(b"No target URI in %s ??"%buf)
-  else:
-    m=LMPAT.search(buf)
-    OUT.write(URI)
-    if m:
-      OUT.write(b'\t')
-      OUT.write(m[1])
-    OUT.write(b'\n')
-
-def showme(wtype,buf,part):
-  if debug:
-    breakpoint()
-  OUT.write(b"%d\n%b"%(part,buf))
-
-warc(showmeLMH,[b'response'],parts=3)
-
-#warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2]))
-
-#warc(showme,[b'response'],parts=int(sys.argv[2]))
-#warc(showme,[b'response'],whole=True)