Mercurial > hg > cc > cirrus_home
changeset 145:a6d2b299ccdd
replace too-complex invocation of cdx2tsv
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 21 Oct 2021 19:18:47 +0000 |
parents | 3abcb61e0bd9 |
children | c8e41c543c0b |
files | bin/cdx2sql.py |
diffstat | 1 files changed, 46 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python3
r"""Tabulate one Common Crawl CDX index file as TSV rows for sqlite3.

Replicate part of Jingrui MSc: for a single index file, tabulate
segment, record type, http[s], mime, detected mime and language
info, one tab-separated row per index line, for feeding to sqlite3.

Borrows from cdx2tsv

Usage: cdx2sql.py infiledir i | \
           sqlite3 seg$i.db '.read ../cdx.sql' '.mode tabs' \
           '.import /dev/stdin props' '.quit' 2> seg$i.log ; done &
"""

import io
import json
import sys
from typing import Optional, Sequence, Tuple

try:
    from isal import igzip
except ImportError:
    # python-isal is only an accelerator; the stdlib module exposes the
    # same open() call, so fall back to it when isal is not installed.
    import gzip as igzip

# Number of language columns every output row is padded out to.
MAX_LANGS = 3


def _split_mime(value: str) -> Tuple[str, str]:
    """Split 'type/subtype' at the first '/'; subtype is '' if absent."""
    major, _, minor = value.partition('/')
    return major, minor


def cdx_line_to_row(line: str) -> str:
    """Convert one CDX index line into a tab-separated row (no newline).

    The line is expected to be ``key timestamp json``.  The row holds,
    in order: segment number, record type, https flag (1/0), mime type,
    mime subtype, detected mime type, detected mime subtype, number of
    languages, then the languages themselves padded with empty fields
    up to MAX_LANGS columns.

    Raises ValueError if the line has fewer than three space-separated
    fields (json.JSONDecodeError, a ValueError subclass, if the JSON part
    does not parse), KeyError if 'filename', 'url' or 'mime' is missing,
    and IndexError if the filename path has an unexpected shape.
    """
    _key, _stamp, payload = line.rstrip().split(' ', maxsplit=2)
    # Some values were observed to include \t.  Doubling every backslash
    # before decoding keeps such escapes as literal backslash sequences
    # instead of control characters that would break the TSV output.
    # NOTE(review): this also doubles the backslash of any \" escape,
    # which json.loads would then reject -- confirm such lines never occur.
    record = json.loads(payload.replace('\\', '\\\\'))

    path_parts = record['filename'].split('/', maxsplit=5)
    # Segment number: the part after the '.' in the 4th path component
    segment = int(path_parts[3].split('.')[1])
    # Record type (w for warc, r for robots.txt, c for crawl diagnostics)
    record_type = path_parts[4][0]
    # URI scheme: 1 for https, 0 for anything else
    is_https = int(record['url'].split(':', maxsplit=1)[0] == 'https')
    # Content-type
    mime = _split_mime(record['mime'])
    # Sniffed content-type; the '/' default yields two empty fields
    detected = _split_mime(record.get('mime-detected', '/'))
    # Language(s)
    languages = record.get('languages')
    if languages is None:
        langs = ['']
        lang_count = 0
    else:
        langs = languages.split(',')
        lang_count = len(langs)

    fields = [str(segment), record_type, str(is_https),
              *mime, *detected, str(lang_count), *langs]
    # Pad so that every row has MAX_LANGS language columns (a negative
    # count adds nothing, so longer language lists pass through as-is).
    fields.extend([''] * (MAX_LANGS - len(langs)))
    return '\t'.join(fields)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Read <infiledir>/cdx-<i as 5 digits>.gz and print one row per line.

    argv defaults to sys.argv[1:]; arguments beyond the first two are
    ignored.
    """
    args = sys.argv[1:] if argv is None else argv
    if len(args) < 2:
        sys.exit('Usage: cdx2sql.py infiledir i')
    directory = args[0]
    index = int(args[1])
    with igzip.open(f'{directory}/cdx-{index:05d}.gz', 'rt') as index_file:
        for line in index_file:
            print(cdx_line_to_row(line))


if __name__ == '__main__':
    main()