Mercurial > hg > cc > cirrus_home
comparison bin/cdx2sql.py @ 145:a6d2b299ccdd
replace too-complex invocation of cdx2tsv
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 21 Oct 2021 19:18:47 +0000 |
parents | |
children | c8e41c543c0b |
comparison
equal
deleted
inserted
replaced
144:3abcb61e0bd9 | 145:a6d2b299ccdd |
---|---|
1 #!/usr/bin/env python3 | |
2 '''Replicate part of Jingrui's MSc: for a single index file, tabulate | |
3 segment, language, http[s] and mime info for feeding to sqlite3 | |
4 | |
5 Borrows from cdx2tsv | |
6 | |
7 Usage: for i in <indices>; do cdx2sql.py infiledir $i | \ | |
8 sqlite3 seg$i.db '.read ../cdx.sql' '.mode tabs' \ | |
9 '.import /dev/stdin props' '.quit' 2> seg$i.log ; done &''' | |
10 | |
11 import sys, json, io | |
12 from isal import igzip | |
13 | |
def _split_mime(value):
    '''Split a "type/subtype" string into a (type, subtype) pair.

    The subtype is '' when value contains no '/'.'''
    major, _, minor = value.partition('/')
    return (major, minor)


def cdx_line_to_row(line):
    '''Turn one CDX index line into a tab-separated row (no trailing newline).

    line is expected to be "<key> <timestamp> <json>".  The columns
    produced are: segment number, record type, https flag (1/0), mime
    type, mime subtype, detected mime type, detected mime subtype,
    number of languages, and then the languages themselves padded with
    empty columns up to three.

    Raises ValueError if the line has fewer than three space-separated
    fields or the JSON cannot be parsed, and KeyError if 'filename',
    'url' or 'mime' is missing from the JSON.'''
    # Only the JSON payload is used; the unpacking still checks that
    # the line has all three fields.
    _key, _stamp, payload = line.rstrip().split(' ', maxsplit=2)
    # Some values have been observed to include \t: doubling every
    # backslash keeps such escapes as literal backslash sequences
    # instead of letting the JSON parser turn them into real tabs,
    # which would corrupt the tab-separated output.
    # NOTE(review): this also means an escaped quote (\") in a value
    # makes json.loads fail -- confirm that never occurs in the input.
    record = json.loads(payload.replace('\\', '\\\\'))
    path_parts = record['filename'].split('/', maxsplit=5)
    # Segment number: the part after the '.' in the 4th path component
    segment = int(path_parts[3].split('.')[1])
    # Record type (w for warc, r for robots.txt, c for crawl diagnostics)
    record_type = path_parts[4][0]
    # URI scheme: 1 for https, 0 for anything else
    is_https = int(record['url'].split(':', maxsplit=1)[0] == 'https')
    # Content-type
    mime = _split_mime(record['mime'])
    # Sniffed content-type; the '/' default yields two empty columns
    detected = _split_mime(record.get('mime-detected', '/'))
    # Language(s)
    languages = record.get('languages')
    if languages is None:
        lang_columns = ['']
        lang_count = 0
    else:
        lang_columns = languages.split(',')
        lang_count = len(lang_columns)
    # Pad to three language columns.  A negative repeat count gives an
    # empty list, so more than three languages are passed through
    # unpadded and untruncated.
    padding = [''] * (3 - len(lang_columns))
    return '\t'.join([str(segment), record_type, str(is_https),
                      *mime, *detected, str(lang_count),
                      *lang_columns, *padding])


def main(argv=None):
    '''Tabulate index file cdx-NNNNN.gz from a directory to stdout.

    argv, if given, replaces sys.argv[1:]: the directory holding the
    index files, then the index number.  Exits with a usage message if
    either is missing.'''
    args = sys.argv[1:] if argv is None else argv
    if len(args) < 2:
        sys.exit("Usage: cdx2sql.py infiledir i")
    index_dir = args[0]
    index = int(args[1])
    with igzip.open("%s/cdx-%05d.gz" % (index_dir, index), 'rt') as f:
        for line in f:
            print(cdx_line_to_row(line))


if __name__ == '__main__':
    main()