Mercurial > hg > cc > cirrus_home
comparison bin/cdx2sql2.py @ 155:58b90cd52c15
for 2022 exercise
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Fri, 01 Jul 2022 17:50:06 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
154:2643a6825f17 | 155:58b90cd52c15 |
---|---|
1 #!/usr/bin/env python3 | |
2 '''Implement one file's worth of cdx exercise, 2022, i.e. cdxno seg type http/s langs | |
3 | |
4 Borrows from cdx2sql | |
5 | |
6 Usage: gunzip -c cdx_00{i}.gz | cdx2sql.py i | \ | |
7 sqlite3 idx.db '.read ../cdx.sql' '.mode tabs' \ | |
8 '.import /dev/stdin props' '.quit' 2> idx$i.log ; done &''' | |
9 | |
10 import sys, json, io | |
11 | |
def process_mime(m):
    '''Split a MIME type string into a (type, subtype) pair.

    Surrounding whitespace is stripped first.  A value containing a
    double quote, a tab or a newline is not split: it is returned whole,
    wrapped in double quotes, as the first element with an empty second
    element.  Otherwise the value is split at the first '/'; the second
    element is empty when there is no '/'.
    '''
    mime = m.strip()  # Should be handled by CC :-(

    if '"' in mime:
        # sqlite3's "-escaping convention: double any embedded quote
        # and wrap the whole field in quotes
        return ('"%s"' % mime.replace('"', '""'), '')

    if '\t' in mime or '\n' in mime:
        # Embedded separators: quote the field, nothing to escape
        return ('"%s"' % mime, '')

    # partition gives an empty subtype when there is no '/'
    major, _sep, minor = mime.partition('/')
    return (major, minor)
23 | |
def cdx_line_to_row(cdxno, line):
    '''Convert one cdx index line into a tuple of output fields.

    line is expected to hold three space-separated parts: a key, a
    timestamp and a JSON object.  Only the JSON object is used; it must
    have 'filename' and 'url' entries and may have a 'languages' entry.

    Returns (cdxno, seg, wr, sch, langs) where
      cdxno is passed through unchanged,
      seg   is the integer after the first '.' in the 4th '/'-separated
            component of the filename,
      wr    is the first character of the 5th component of the filename
            (w for warc, r for robots.txt, c for crawl diagnostics),
      sch   is 1 if the URI scheme is exactly 'https', else 0,
      langs is the 'languages' value, or '' if absent or null.

    Raises ValueError if the line does not have three parts or the JSON
    is malformed, KeyError/IndexError if the JSON lacks the expected
    entries or the filename lacks the expected structure.
    '''
    # Unpacking (rather than indexing) keeps the check that all three
    # parts are present
    (_key, _stamp, jj) = line.rstrip().split(' ', maxsplit=2)
    ja = json.loads(jj)
    fnf = ja['filename'].split('/', maxsplit=5)
    # Segment number
    seg = int(fnf[3].split('.')[1])
    # Record type (w for warc, r for robots.txt, c for crawl diagnostics)
    wr = fnf[4][0]
    # URI scheme
    sch = int(ja['url'].split(':', maxsplit=1)[0] == 'https')
    # Language(s)
    langs = ja.get('languages')
    if langs is None:
        langs = ''
    return (cdxno, seg, wr, sch, langs)


def main(argv=None):
    '''Read cdx lines from stdin, write one tab-separated row per line.

    argv defaults to sys.argv; argv[1] is the cdx file number that is
    copied into the first column of every output row.
    '''
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        # Fail with a readable message rather than an IndexError traceback
        sys.exit('Usage: %s cdxno < cdx_lines' % argv[0])
    cdxno = argv[1]
    for l in sys.stdin:
        if not l.strip():
            # A blank line (e.g. a stray trailing newline) carries no record
            continue
        print(*cdx_line_to_row(cdxno, l), sep='\t')


if __name__ == '__main__':
    main()