diff bin/warc.py @ 42:689a0e311cd2

Make warc.py a library (filename and debug are now parameters of warc()); separate out the testing code
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 05 Jul 2023 15:37:16 +0100
parents 8661062a50b1
children 69be1131bcc5
line wrap: on
line diff
--- a/bin/warc.py	Wed Jul 05 15:12:54 2023 +0100
+++ b/bin/warc.py	Wed Jul 05 15:37:16 2023 +0100
@@ -1,15 +1,14 @@
 #!/usr/bin/env python3
-'''Stream a warc format file, invoking a callback on each record.
+'''Stream a gzipped warc format file, invoking a callback on each record.
 Callback can be limited by WARC-Type, record part'''
-import sys,os,io
 
-if (debug:=(sys.argv[1]=='-d')):
-  sys.argv.pop(1)
+import sys,io
+from isal import igzip
 
-def warc(callback,types=['response'],whole=False,parts=7):
+def warc(filename,callback,types=['response'],whole=False,parts=7,debug=False):
   types=[(t if isinstance(t,bytes) else bytes(t,'utf8')) for t in types]
   nb=0
-  stream=open(sys.argv[1],'rb',0)
+  stream=open(filename,'rb',0)
   bufsize=128*1024*1024
   buf=bytearray(128*1024*1024)
   l=b'\r\n'
@@ -104,35 +103,3 @@
               state=1
               
           L_start=rec_text.tell()
-OUT=open(sys.stdout.fileno(),'wb')
-
-import re
-TUPAT=re.compile(b'^WARC-Target-URI: (.*?)\r',re.MULTILINE)
-LMPAT=re.compile(b'^Last-Modified: (.*?)\r',re.MULTILINE)
-
-def showmeLMH(wtype,buf,part):
-  global URI
-  if part==1:
-    if (m:=TUPAT.search(buf)):
-      URI=m[1]
-    else:
-      raise ValueError(b"No target URI in %s ??"%buf)
-  else:
-    m=LMPAT.search(buf)
-    OUT.write(URI)
-    if m:
-      OUT.write(b'\t')
-      OUT.write(m[1])
-    OUT.write(b'\n')
-
-def showme(wtype,buf,part):
-  if debug:
-    breakpoint()
-  OUT.write(b"%d\n%b"%(part,buf))
-
-warc(showmeLMH,[b'response'],parts=3)
-
-#warc(showme,[b'response','warcinfo','request','metadata'],int(sys.argv[2]))
-
-#warc(showme,[b'response'],parts=int(sys.argv[2]))
-#warc(showme,[b'response'],whole=True)