Mercurial > hg > cc > cirrus_home
view bin/cdx2sql.py @ 195:5f3c36e4fd6d default tip
add target test-core which (dangerously) avoids (we hope pointless) recompilation of all the plugins
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Thu, 26 Sep 2024 17:55:56 +0100 |
parents | c8e41c543c0b |
children |
line wrap: on
line source
#!/usr/bin/env python3
r"""Tabulate one CDX index file as tab-separated rows for feeding to sqlite3.

Replicates part of Jingrui's MSc work; borrows from cdx2tsv.

Each input line of ``<infiledir>/cdx-<i as 5 digits>.gz`` has the form
``key timestamp {json}``.  One output row is written per input line with
these tab-separated columns:

    segment number, record type, https flag (0/1),
    mime major type, mime subtype,
    detected mime major type, detected mime subtype,
    number of languages, then language columns padded to three

Usage:
    cdx2sql.py infiledir i | \
        sqlite3 seg$i.db '.read ../cdx.sql' '.mode tabs' \
            '.import /dev/stdin props' '.quit' 2> seg$i.log

(The original usage note ended in ``; done &``, i.e. the command was run
inside a backgrounded shell loop over ``i``.)
"""

import io
import json
import sys

try:
    from isal import igzip
except ImportError:
    # igzip mirrors the stdlib gzip API, so fall back to gzip when the
    # optional python-isal accelerator is not installed.
    import gzip as igzip

# Number of language columns every output row is padded to.
LANG_COLUMNS = 3


def process_mime(m):
    """Split a mime type string into a ``(major, minor)`` pair of columns.

    The value is stripped of surrounding whitespace first.  If it contains
    a double quote, tab or newline it cannot be split safely, so the whole
    value is returned double-quoted in the first column (with embedded
    quotes doubled, as sqlite3's import expects) and the second column is
    left empty.  Otherwise the value is split at the first ``/``; a value
    with no ``/`` yields an empty minor part.
    """
    m = m.strip()  # Should be handled by CC :-(
    if '"' in m:
        # Handle obscure "-escaping conventions of sqlite3
        return ('"%s"' % m.replace('"', '""'), '')
    if '\t' in m or '\n' in m:
        return ('"%s"' % m, '')
    major, _, minor = m.partition('/')
    return (major, minor)


def format_row(line):
    """Convert one CDX index line into a tab-separated output row.

    ``line`` is ``key timestamp {json}``; only the JSON part is used.  The
    JSON object must have ``filename``, ``url`` and ``mime`` keys;
    ``mime-detected`` and ``languages`` are optional.  The returned string
    has no trailing newline.

    Raises ValueError, KeyError or IndexError on input that does not have
    the expected shape (same failure modes as the original inline loop).
    """
    _key, _stamp, payload = line.rstrip().split(' ', maxsplit=2)
    record = json.loads(payload)
    path_parts = record['filename'].split('/', maxsplit=5)
    # Segment number: the part after the '.' in the 4th path component
    segment = int(path_parts[3].split('.')[1])
    # Record type (w for warc, r for robots.txt, c for crawl diagnostics)
    record_type = path_parts[4][0]
    # URI scheme: 1 for https, 0 for anything else
    is_https = int(record['url'].split(':', maxsplit=1)[0] == 'https')
    # Content-type
    mime = process_mime(record['mime'])
    # Sniffed content-type; the '/' default yields two empty columns
    detected = process_mime(record.get('mime-detected', '/'))
    # Language(s)
    languages = record.get('languages')
    if languages is None:
        lang_list = ['']
        lang_count = 0
    else:
        lang_list = languages.split(',')
        lang_count = len(lang_list)
    fixed = [str(segment), record_type, str(is_https),
             *mime, *detected, str(lang_count)]
    # A negative repeat count yields '', so rows with more than
    # LANG_COLUMNS languages are emitted unpadded and untruncated.
    padding = '\t' * (LANG_COLUMNS - len(lang_list))
    return '\t'.join(fixed) + '\t' + '\t'.join(lang_list) + padding


def main(argv=None):
    """Read ``<dir>/cdx-<index>.gz`` and print one row per line to stdout.

    ``argv`` defaults to ``sys.argv``; ``argv[1]`` is the input directory
    and ``argv[2]`` the integer index of the file.  Returns a process exit
    status: 0 on success, 2 (after printing the usage text to stderr) when
    arguments are missing.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 3:
        print(__doc__, file=sys.stderr)
        return 2
    in_dir = args[1]
    index = int(args[2])
    with igzip.open("%s/cdx-%05d.gz" % (in_dir, index), 'rt') as f:
        for line in f:
            print(format_row(line))
    return 0


if __name__ == "__main__":
    sys.exit(main())