Mercurial > hg > cc > cirrus_home
annotate bin/cdx2sql.py @ 159:c3c3dd60b8a8
demo of slurm usage using cdx2tsv.py
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 06 Jul 2022 18:07:34 +0100 |
parents | c8e41c543c0b |
children |
rev | line source |
---|---|
145
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/env python3 |
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
2 '''Replicate part of Jingrui MSc, tabulate a single index |
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
3 file info of segment, language, http[s], mime for feeding to sqlite3 |
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
4 |
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
5 Borrows from cdx2tsv |
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
6 |
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
7 Usage: cdx2sql.py infiledir i | \ |
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
8 sqlite3 seg$i.db '.read ../cdx.sql' '.mode tabs' \ |
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
9 '.import /dev/stdin props' '.quit' 2> seg$i.log ; done &''' |
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
10 |
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 import sys, json, io |
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 from isal import igzip |
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
13 |
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Command line: the directory to read from, then the number of the
#  cdx-NNNNN.gz file within it
(dir,index)=(sys.argv[1],int(sys.argv[2]))
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
16 |
def process_mime(m):
    '''Turn a mime-type string into a (type, subtype) pair of column values.

    Surrounding whitespace is stripped first.  A value containing a
    double quote, tab or newline is not split: it is returned whole as
    the first element, wrapped in double quotes with any embedded
    double quotes doubled, and the second element is ''.
    Otherwise the value is split at its first '/'; the subtype is ''
    when there is no '/'.'''
    m=m.strip() # Should be handled by CC :-(
    if any(special in m for special in ('"','\t','\n')):
        # Handle obscure "-escaping conventions of sqlite3: quote the
        #  whole field, doubling any quotes it contains
        return ('"%s"'%m.replace('"','""'),'')
    (major,_,minor)=m.partition('/')
    return (major,minor)
145
a6d2b299ccdd
replace too-complex invocation of cdx2tsv
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Read the index'th cdx file from dir and write one tab-separated line
#  per entry: segment, record type, https flag, mime type and subtype,
#  sniffed mime type and subtype, language count, then the languages
cdx_name="%s/cdx-%05.0f.gz"%(dir,index)
with igzip.open(cdx_name,'rt') as cdx:
    for entry in cdx:
        # An entry is three space-separated parts, the last one JSON;
        #  only the JSON properties are used below
        (_key,_stamp,props_json)=entry.rstrip().split(' ',maxsplit=2)
        props=json.loads(props_json)
        path_parts=props['filename'].split('/',maxsplit=5)
        # Segment number
        segment=int(path_parts[3].split('.')[1])
        # Record type (w for warc, r for robots.txt, c for crawl diagnostics)
        rec_type=path_parts[4][0]
        # URI scheme: 1 for https, 0 for anything else
        scheme=props['url'].split(':',maxsplit=1)[0]
        is_https=1 if scheme=='https' else 0
        # Content-type
        mime=process_mime(props['mime'])
        # Sniffed content-type
        sniffed=process_mime(props.get('mime-detected','/'))
        # Language(s)
        lang_spec=props.get('languages',None)
        if lang_spec is None:
            langs=('',)
            n_langs=0
        else:
            langs=lang_spec.split(',')
            n_langs=len(langs)
        columns=[str(segment),rec_type,str(is_https),
                 *mime,*sniffed,str(n_langs)]
        # Pad a short language list with tabs to fill three columns
        lang_columns='\t'.join(langs)+'\t'*(3-len(langs))
        print('\t'.join(columns),lang_columns,sep='\t')