annotate bin/cdx2sql.py @ 159:c3c3dd60b8a8

demo of slurm usage using cdx2tsv.py
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 06 Jul 2022 18:07:34 +0100
parents c8e41c543c0b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
145
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 #!/usr/bin/env python3
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 '''Replicate part of Jingrui MSc, tabulate a single index
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3 file info of segment, language, http[s], mime for feeding to sqlite3
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 Borrows from cdx2tsv
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 Usage: cdx2sql.py infiledir i | \
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 sqlite3 seg$i.db '.read ../cdx.sql' '.mode tabs' \
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9 '.import /dev/stdin props' '.quit' 2> seg$i.log'''
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 import sys, json, io
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12 from isal import igzip
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
13
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Positional command-line arguments: the directory containing the
# cdx-NNNNN.gz index files, then the number of the index file to process.
dir, index = sys.argv[1], int(sys.argv[2])
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
16
146
c8e41c543c0b works for 0--9
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 145
diff changeset
def process_mime(m):
    """Turn a MIME type string into a (type, subtype) pair of strings.

    Leading and trailing whitespace is removed first.  A value that
    contains a double quote, a tab or a newline is not split: it is
    returned whole, wrapped in double quotes (any embedded double quote
    is doubled), with '' as the subtype.  Any other value is split at
    its first '/'; the subtype is '' when there is no '/'.
    """
    cleaned = m.strip()  # Should be handled by CC :-(
    if '"' in cleaned:
        # Handle obscure "-escaping conventions of sqlite3
        return ('"%s"' % cleaned.replace('"', '""'), '')
    if '\t' in cleaned or '\n' in cleaned:
        # Field separators inside the value: keep it whole, quoted
        return ('"%s"' % cleaned, '')
    major, _, minor = cleaned.partition('/')
    return (major, minor)
c8e41c543c0b works for 0--9
Henry S. Thompson <ht@inf.ed.ac.uk>
parents: 145
diff changeset
28
145
a6d2b299ccdd replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Read the selected gzipped index file and write one tab-separated row per
# record to stdout: segment, record type, https flag, MIME type and subtype,
# sniffed MIME type and subtype, language count, then three language columns.
cdx_path = "%s/cdx-%05.0f.gz" % (dir, index)
with igzip.open(cdx_path, 'rt') as cdx:
    for line in cdx:
        # Each line is "<key> <timestamp> <json>"; only the JSON part is used
        _key, _stamp, payload = line.rstrip().split(' ', maxsplit=2)
        record = json.loads(payload)
        path_parts = record['filename'].split('/', maxsplit=5)
        # Segment number
        seg = int(path_parts[3].split('.')[1])
        # Record type (w for warc, r for robots.txt, c for crawl diagnostics)
        rec_type = path_parts[4][0]
        # URI scheme: 1 for https, 0 for anything else
        scheme = record['url'].split(':', maxsplit=1)[0]
        is_https = int(scheme == 'https')
        # Content-type
        mime = process_mime(record['mime'])
        # Sniffed content-type ('/' yields an empty type and subtype)
        sniffed = process_mime(record.get('mime-detected', '/'))
        # Language(s)
        lang_field = record.get('languages', None)
        if lang_field is None:
            langs, n_langs = ('',), 0
        else:
            langs = lang_field.split(',')
            n_langs = len(langs)
        # Pad with tabs so rows with fewer than three languages still have
        # three language columns
        padding = '\t' * (3 - len(langs))
        print(seg, rec_type, is_https, *mime, *sniffed, n_langs, *langs,
              sep='\t', end=padding + '\n')