changeset 145:a6d2b299ccdd

replace too-complex invocation of cdx2tsv
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 21 Oct 2021 19:18:47 +0000
parents 3abcb61e0bd9
children c8e41c543c0b
files bin/cdx2sql.py
diffstat 1 files changed, 46 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/cdx2sql.py	Thu Oct 21 19:18:47 2021 +0000
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+'''Replicate part of Jingrui MSc: tabulate, for a single index file,
+the segment, language, http[s] and mime info, for feeding to sqlite3
+
+Borrows from cdx2tsv
+
+Usage (as the body of a shell loop over i): cdx2sql.py infiledir $i | \
+  sqlite3 seg$i.db '.read ../cdx.sql' '.mode tabs' \
+   '.import /dev/stdin props' '.quit' 2> seg$i.log ; done &'''
+
+import sys, json, io
+from isal import igzip
+
+dir=sys.argv[1]
+index=int(sys.argv[2])
+
+with igzip.open("%s/cdx-%05.0f.gz"%(dir,index),'rt') as f:
+  for l in f:
+    (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2)
+    ja=json.loads(jj.replace('\\','\\\\')) # some values observed
+                                           # to include \t :-(
+    fnf=ja['filename'].split('/',maxsplit=5)
+    # Segment number
+    seg=int(fnf[3].split('.')[1])
+    # Record type (w for warc, r for robots.txt, c for crawl diagnostics)
+    wr=fnf[4][0]
+    # URI scheme
+    sch=int((ja['url'].split(':',maxsplit=1)[0])=='https')
+    # Content-type
+    m=ja['mime']
+    m=m.split('/',maxsplit=1)
+    m=(m[0],m[1] if len(m)>1 else '')
+    # Sniffed content-type
+    md=ja.get('mime-detected','/').split('/',maxsplit=1)
+    md=(md[0],md[1] if len(md)>1 else '')
+    # Language(s)
+    langs=ja.get('languages',None)
+    if langs is None:
+      langs=('',)
+      ll=0
+    else:
+      langs=langs.split(',')
+      ll=len(langs)   
+    print(seg,wr,sch,'\t'.join(m),
+          '\t'.join(md),ll,sep='\t',end='\t')
+    print('\t'.join(langs),'\t'*(3-len(langs)),sep='')