view bin/cdx2sql.py @ 178:e1bc9d8d688c

ec184 now, run w. unbuffered output
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 05 Jul 2023 15:02:53 +0100
parents c8e41c543c0b
children
line wrap: on
line source

#!/usr/bin/env python3
'''Replicate part of Jingrui's MSc: tabulate, for a single index file,
the segment, record type, http[s], mime and language info of each entry,
for feeding to sqlite3

Borrows from cdx2tsv

Usage: for i in ...; do cdx2sql.py infiledir $i | \
  sqlite3 seg$i.db '.read ../cdx.sql' '.mode tabs' \
   '.import /dev/stdin props' '.quit' 2> seg$i.log ; done &'''

import sys, json, io
from isal import igzip

# Command-line arguments: the directory to read from, and the (integer)
# number of the index file within it
dir,index=sys.argv[1],int(sys.argv[2])

def process_mime(m):
  '''Turn a content-type string into a pair of column values.

  Surrounding whitespace is removed first.  A value containing a
  double quote, tab or newline is returned whole in the first column,
  wrapped in double quotes with any embedded double quote doubled
  (the sqlite3 escaping convention), and the second column is empty.
  Any other value is divided at its first '/' into (type, subtype);
  the subtype is '' when there is no '/'.
  '''
  cleaned=m.strip() # Should be handled by CC :-(
  if any(special in cleaned for special in ('"','\t','\n')):
    # Doubling is a no-op when the only offenders are tabs/newlines
    return ('"%s"'%cleaned.replace('"','""'),'')
  major,_,minor=cleaned.partition('/')
  return (major,minor)

# Read the selected gzipped index file and write one tab-separated row
# per input line to stdout:
#   segment, record type, https flag, mime type, mime subtype,
#   detected mime type, detected mime subtype, language count,
#   then the languages, padded out to three columns
with igzip.open("%s/cdx-%05.0f.gz"%(dir,index),'rt') as f:
  for entry in f:
    # Each line is three space-separated parts, the last being JSON
    (_key,_stamp,props_json)=entry.rstrip().split(' ',maxsplit=2)
    props=json.loads(props_json)
    path_parts=props['filename'].split('/',maxsplit=5)
    # Segment number
    segment=int(path_parts[3].split('.')[1])
    # Record type (w for warc, r for robots.txt, c for crawl diagnostics)
    rec_type=path_parts[4][0]
    # URI scheme: 1 for https, 0 for anything else
    scheme,_,_=props['url'].partition(':')
    is_https=int(scheme=='https')
    # Content-type
    declared=process_mime(props['mime'])
    # Sniffed content-type ('/' gives two empty columns when absent)
    sniffed=process_mime(props.get('mime-detected','/'))
    # Language(s)
    lang_field=props.get('languages')
    if lang_field is None:
      langs=['']
      n_langs=0
    else:
      langs=lang_field.split(',')
      n_langs=len(langs)
    row=[str(segment),rec_type,str(is_https),*declared,*sniffed,
         str(n_langs),*langs]
    # Pad with empty fields so short language lists still fill 3 columns
    row.extend(['']*(3-len(langs)))
    print('\t'.join(row))