Mercurial > hg > cc > cirrus_home
changeset 155:58b90cd52c15
for 2022 exercise
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Fri, 01 Jul 2022 17:50:06 +0200 |
parents | 2643a6825f17 |
children | ace590c2fdfc |
files | bin/cdx2sql2.py |
diffstat | 1 files changed, 40 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python3
r'''Implement one file's worth of cdx exercise, 2022, i.e. cdxno seg type http/s langs

Borrows from cdx2sql

Reads cdx index lines ("key timestamp {json}") from stdin and writes one
tab-separated row per line to stdout, in the column order given above.
NOTE(review): the column order here follows what the code emits; confirm it
matches the props table defined in ../cdx.sql.

Usage (once per index file i):
  gunzip -c cdx_00{i}.gz | cdx2sql2.py i | \
    sqlite3 idx.db '.read ../cdx.sql' '.mode tabs' \
       '.import /dev/stdin props' '.quit' 2> idx$i.log'''

import io
import json
import sys


def process_mime(m):
    '''Split a MIME type string into a (type, subtype) pair.

    Not called by main() in this script; retained from cdx2sql.

    Surrounding whitespace is stripped first.  If the value contains a
    double quote, each quote is doubled and the whole value is wrapped in
    double quotes; if it contains a tab or newline it is wrapped in double
    quotes unchanged.  In both of those cases the subtype is returned
    as ''.  Otherwise the value is split at the first '/', with '' as the
    subtype when there is no '/'.'''
    m = m.strip()  # Should be handled by CC :-(
    if '"' in m:
        # Handle obscure "-escaping conventions of sqlite3
        m = m.replace('"', '""')
        return ('"%s"' % m, '')
    elif '\t' in m or '\n' in m:
        return ('"%s"' % m, '')
    else:
        m = m.split('/', maxsplit=1)
        return (m[0], m[1] if len(m) > 1 else '')


def cdx_row(cdxno, line):
    '''Convert one cdx index line into an output row.

    line has the form "key timestamp {json}"; the json object must have
    'filename' and 'url' entries and may have a 'languages' entry.

    Returns the tuple (cdxno, seg, wr, sch, langs) where
      seg   is the integer after the '.' in the 4th path component of filename
      wr    is the first character of the 5th path component of filename
      sch   is 1 if the url scheme is https (compared case-insensitively), else 0
      langs is the 'languages' value, or '' if absent or null

    Raises ValueError, KeyError or IndexError if the line does not have
    the expected shape.'''
    # Only the json part is needed; the key and timestamp are ignored
    (_key, _stamp, jj) = line.rstrip().split(' ', maxsplit=2)
    ja = json.loads(jj)
    fnf = ja['filename'].split('/', maxsplit=5)
    # Segment number
    seg = int(fnf[3].split('.')[1])
    # Record type (w for warc, r for robots.txt, c for crawl diagnostics)
    wr = fnf[4][0]
    # URI scheme: schemes are case-insensitive, so normalise before comparing
    sch = int(ja['url'].split(':', maxsplit=1)[0].lower() == 'https')
    # Language(s)
    langs = ja.get('languages')
    if langs is None:
        langs = ''
    return (cdxno, seg, wr, sch, langs)


def main(argv=None, infile=None, outfile=None):
    '''Write one tab-separated row per cdx line read from infile.

    argv defaults to sys.argv, infile to sys.stdin and outfile to
    sys.stdout.  argv[1] is the cdx file number, copied unchanged into the
    first column of every row.  Blank input lines are skipped.

    Returns 0 on success, or 2 (after a usage message on stderr) if the
    cdx file number argument is missing.'''
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print('Usage: cdx2sql2.py cdxno < cdx-index-lines', file=sys.stderr)
        return 2
    cdxno = argv[1]
    if infile is None:
        infile = sys.stdin
    for l in infile:
        if not l.strip():
            # Tolerate blank lines (e.g. a trailing one) rather than crash
            continue
        print(*cdx_row(cdxno, l), sep='\t', file=outfile)
    return 0


if __name__ == '__main__':
    sys.exit(main())