annotate bin/cdx2sql2.py @ 195:5f3c36e4fd6d default tip

add target test-core which (dangerously) avoids (we hope pointless) recompilation of all the plugins
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Thu, 26 Sep 2024 17:55:56 +0100
parents 58b90cd52c15
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
155
58b90cd52c15 for 2022 exercise
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
1 #!/usr/bin/env python3
58b90cd52c15 for 2022 exercise
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
2 '''Implement one file's worth of cdx exercise, 2022, i.e. cdxno seg type langs http/s
58b90cd52c15 for 2022 exercise
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
3
58b90cd52c15 for 2022 exercise
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
4 Borrows from cdx2sql
58b90cd52c15 for 2022 exercise
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
5
58b90cd52c15 for 2022 exercise
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
6 Usage: gunzip -c cdx_00{i}.gz | cdx2sql2.py i | \
58b90cd52c15 for 2022 exercise
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
7 sqlite3 idx.db '.read ../cdx.sql' '.mode tabs' \
58b90cd52c15 for 2022 exercise
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
8 '.import /dev/stdin props' '.quit' 2> idx$i.log ; done &'''
58b90cd52c15 for 2022 exercise
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
9
58b90cd52c15 for 2022 exercise
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
10 import sys, json, io
58b90cd52c15 for 2022 exercise
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
11
58b90cd52c15 for 2022 exercise
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
def process_mime(m):
    '''Turn a MIME type string into a two-element tuple of column values.

    Leading and trailing whitespace is stripped first.

    If the value contains a double quote, a tab or a newline, the whole
    value is returned as the first element, wrapped in double quotes with
    any embedded double quote doubled, and the second element is ''.
    (Presumably this is so the value survives a tab-separated sqlite3
    '.import' as a single field -- confirm against the consumer.)

    Otherwise the value is split at the first '/' and returned as
    (type, subtype); subtype is '' when there is no '/'.
    '''
    m = m.strip()  # Should be handled by CC :-(
    if '"' in m or '\t' in m or '\n' in m:
        # Handle obscure "-escaping conventions of sqlite3: embedded
        # quotes are doubled and the whole field is quoted.  replace()
        # is a no-op when only a tab/newline triggered this branch.
        return ('"%s"' % m.replace('"', '""'), '')
    # partition() always yields three parts, so a missing '/' gives ''
    # for the subtype without a length check.
    major, _, minor = m.partition('/')
    return (major, minor)
58b90cd52c15 for 2022 exercise
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
23
58b90cd52c15 for 2022 exercise
Henry Thompson <ht@markup.co.uk>
parents:
diff changeset
def cdx_fields(line):
    '''Extract (seg, wr, sch, langs) from one line of a cdx index file.

    The line is expected to hold three space-separated parts: a key, a
    timestamp and a JSON object.  Only the JSON object is used; it must
    have 'filename' and 'url' entries and may have a 'languages' entry.

    Returns a tuple of
      seg   -- int, taken from after the '.' in the 4th '/'-separated
               component of 'filename'
      wr    -- first character of the 5th component of 'filename'
               (w for warc, r for robots.txt, c for crawl diagnostics)
      sch   -- 1 if the 'url' scheme is https, else 0
      langs -- the 'languages' value, or '' if absent or null

    Raises ValueError, KeyError or IndexError on lines that do not have
    this shape.
    '''
    _key, _stamp, props_json = line.rstrip().split(' ', maxsplit=2)
    props = json.loads(props_json)
    fnf = props['filename'].split('/', maxsplit=5)
    # Segment number
    seg = int(fnf[3].split('.')[1])
    # Record type (w for warc, r for robots.txt, c for crawl diagnostics)
    wr = fnf[4][0]
    # URI scheme
    sch = int(props['url'].split(':', maxsplit=1)[0] == 'https')
    # Language(s)
    langs = props.get('languages')
    if langs is None:
        langs = ''
    return (seg, wr, sch, langs)


def main(argv=None, infile=None, outfile=None):
    '''Write one tab-separated row per input cdx line.

    argv    -- argument list, defaults to sys.argv; argv[1] is the cdx
               file number, echoed as the first column of every row
    infile  -- iterable of input lines, defaults to sys.stdin
    outfile -- file to print to, defaults to sys.stdout

    Each row is: cdxno, segment, record type, https flag, languages.
    '''
    if argv is None:
        argv = sys.argv
    if infile is None:
        infile = sys.stdin
    if len(argv) < 2:
        # Fail with a readable message rather than a bare IndexError
        sys.exit('Usage: %s cdxno < cdx_file' % argv[0])
    cdxno = argv[1]
    for line in infile:
        seg, wr, sch, langs = cdx_fields(line)
        # file=None makes print() fall back to sys.stdout
        print(cdxno, seg, wr, sch, langs, sep='\t', file=outfile)


if __name__ == '__main__':
    main()