changeset 48:d0d2fd9830d6

starting on conversion to direct-querying of buffer
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 06 Jul 2023 13:27:33 +0100
parents b59f49909bda
children 699ef141af10
files bin/warc.py
diffstat 1 files changed, 24 insertions(+), 23 deletions(-) [+]
line wrap: on
line diff
--- a/bin/warc.py	Thu Jul 06 10:19:02 2023 +0100
+++ b/bin/warc.py	Thu Jul 06 13:27:33 2023 +0100
@@ -13,37 +13,38 @@
     stream=igzip.IGzipFile(filename=filename)
   else:
     stream=open(filename,'rb',0)
-  bufsize=2*1024*1024
-  buf=bytearray(bufsize)
-  l=b'\r\n'
+  bufSize=2*1024*1024
+  hdrMax=16*1024
+  buf=bytearray(bufSize)
+  hdrBuf=memoryview(buf)[:hdrMax]
   while not stream.closed:
     bp=0
-    while l==b'\r\n':
-      l=stream.readline()
-      nb+=(ln:=len(l))
-    if ln==0:
+    bl=stream.readinto(hdrBuf)
+    if bl==0:
       break
-    if l!=b'WARC/1.0\r\n':
-      raise ValueError("Not a WARC file? At %s: %s[%s]"%(nb-len(l),
-                                                         l.decode('latin-1'),len(l)))
+    while buf.startswith(b'\r\n',bp):
+      bp+=2
+    if not buf.startswith(b'WARC/1.0\r\n',bp):
+      raise ValueError("Not a WARC file? At %s: %s[%s]"%(bp,
+                       buf[bp:min(bl,bp+20)].decode('latin-1'), bl-bp))
+    bob=bp # in case 1 or whole
+    bp+=10
     wtype=None
     length=None
     state=1
     tr=None # Was this record truncated?
-    while l!=b'\r\n':
-      # WARC header
-      if parts & 1:
-        buf[bp:(bp:=bp+ln)]=l
-      l=stream.readline()
-      nb+=(ln:=len(l))
-      if l.startswith(b"Content-Length: "):
-        length=wl=int(l[16:].rstrip())
-      elif l.startswith(b"WARC-Truncated: "):
-        tr=l[16:].rstrip()
+    while not buf.startswith(b'\r\n',bp):
+      print('yes',)
+      eol=buf.index(b'\r\n',bp)+2
+      if buf.startswith(b"Content-Length: ",bp):
+        length=wl=int(buf[bp+16:eol-2])
+      elif buf.startswith(b"WARC-Truncated: ",bp):
+        tr=l[bp+16:eol-2]
         tr="EMPTY" if tr=="" else tr
-      elif l.startswith(b'WARC-Type: '):
-        wtype = l[11:-2]
-    start_2=bp
+      elif buf.startswith(b'WARC-Type: ',bp):
+        wtype = bytes(buf[bp+11:eol-2])
+      bp=eol
+    start_2=eol
     if (wtype in types):
       if whole:
         buf[bp:(bp:=bp+ln)]=l