changeset 155:58b90cd52c15

for 2022 exercise
author Henry Thompson <ht@markup.co.uk>
date Fri, 01 Jul 2022 17:50:06 +0200
parents 2643a6825f17
children ace590c2fdfc
files bin/cdx2sql2.py
diffstat 1 files changed, 40 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/cdx2sql2.py	Fri Jul 01 17:50:06 2022 +0200
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+'''Implement one file's worth of cdx exercise, 2022, i.e. cdxno seg type http/s langs
+
+Borrows from cdx2sql
+
+Usage: gunzip -c cdx_00{i}.gz | cdx2sql2.py i | \
+  sqlite3 idx.db '.read ../cdx.sql' '.mode tabs' \
+   '.import /dev/stdin props' '.quit' 2> idx$i.log ; done &'''
+
+import sys, json, io
+
+def process_mime(m):
+  '''Return m as a (type, subtype) pair; values containing a double quote,
+  tab or newline come back whole, wrapped in "...", with an empty subtype.'''
+  mime=m.strip() # Should be handled by CC :-(
+  if '"' in mime or '\t' in mime or '\n' in mime:
+    # sqlite3's "-escaping convention: double any embedded " (a no-op
+    #  when only a tab or newline triggered the quoting)
+    return ('"'+mime.replace('"','""')+'"','')
+  # Plain value: cut at the first '/'; no '/' means an empty subtype
+  major,_sep,minor=mime.partition('/')
+  return (major,minor)
+
+cdxno=sys.argv[1] # first command-line argument, echoed as column 1 of every row
+
+for line in sys.stdin:
+  # Input line is three space-separated fields; only the third (JSON) is used
+  (_key,_stamp,props)=line.rstrip().split(' ',maxsplit=2)
+  rec=json.loads(props)
+  path=rec['filename'].split('/',maxsplit=5)
+  # Segment number: what follows the first '.' in the 4th path component
+  seg=int(path[3].split('.')[1])
+  # Record type (w for warc, r for robots.txt, c for crawl diagnostics)
+  wr=path[4][0]
+  # URI scheme: 1 for https, 0 for anything else
+  sch=1 if rec['url'].partition(':')[0]=='https' else 0
+  # Language(s): absent or null becomes an empty field
+  langs=rec.get('languages')
+  out_langs='' if langs is None else langs
+  print(cdxno,seg,wr,sch,out_langs,sep='\t')