155
|
1 #!/usr/bin/env python3
|
|
2 '''Implement one file's worth of cdx exercise, 2022, i.e. cdxno seg type langs http/s
|
|
3
|
|
4 Borrows from cdx2sql
|
|
5
|
|
6 Usage: gnuzip -c cdx_00{i}.gz | cdx2sql.py i | \
|
|
7 sqlite3 idx.db '.read ../cdx.sql' '.mode tabs' \
|
|
8 '.import /dev/stdin props' '.quit' 2> idx$i.log ; done &'''
|
|
9
|
|
10 import sys, json, io
|
|
11
|
|
def process_mime(m):
    '''Turn a MIME type string into a two-element tuple of column values.

    Surrounding whitespace is stripped first.  If the remaining text holds a
    double quote, a tab or a newline, the whole value is returned wrapped in
    double quotes as the first element (embedded double quotes doubled, per
    the quoting conventions of sqlite3) with an empty second element.
    Otherwise the text is divided at the first '/' and the two halves are
    returned; the second half is '' when there is no '/'.
    '''
    cleaned = m.strip()  # Stray whitespace should really be handled by CC
    has_quote = '"' in cleaned
    if has_quote or '\t' in cleaned or '\n' in cleaned:
        if has_quote:
            # sqlite3 expects an embedded " to be written as ""
            cleaned = cleaned.replace('"', '""')
        return ('"%s"' % cleaned, '')
    # Only the first '/' separates the two parts; later ones stay in the second
    major, _, minor = cleaned.partition('/')
    return (major, minor)
|
|
23
|
|
# Identifier of the cdx file being processed, taken verbatim from the
# first command-line argument and echoed as the first output column
cdxno = sys.argv[1]

# One tab-separated output row per input line read from standard input
for line in sys.stdin:
    # A line is three space-separated fields; only the third (JSON) is used.
    # Lines with fewer than three fields raise ValueError here.
    _key, _stamp, props_json = line.rstrip().split(' ', maxsplit=2)
    props = json.loads(props_json)
    path_parts = props['filename'].split('/', maxsplit=5)

    # Segment number: second dot-separated piece of the fourth path component
    segment_field = path_parts[3].split('.')[1]
    seg = int(segment_field)

    # Record type (w for warc, r for robots.txt, c for crawl diagnostics):
    # first character of the fifth path component
    wr = path_parts[4][0]

    # URI scheme flag: 1 when the text before the first ':' is 'https', else 0
    scheme = props['url'].split(':', maxsplit=1)[0]
    sch = int(scheme == 'https')

    # Language(s): an absent or null entry becomes an empty column
    langs = props.get('languages', None)
    langs = '' if langs is None else langs

    print(cdxno, seg, wr, sch, langs, sep='\t')
|