view bin/cdx2sql2.py @ 164:00b14a35280e

work-path bin dir
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 18 Jul 2022 18:30:56 +0100
parents 58b90cd52c15
children
line wrap: on
line source

#!/usr/bin/env python3
'''Implement one file's worth of cdx exercise, 2022, i.e. emit one
tab-separated row per cdx line: cdxno seg type http/s langs

Borrows from cdx2sql

Usage (body of a shell loop over i; the loop head is not shown):
  gunzip -c cdx_00{i}.gz | cdx2sql2.py i | \
  sqlite3 idx.db '.read ../cdx.sql' '.mode tabs' \
   '.import /dev/stdin props' '.quit' 2> idx$i.log ; done &'''

import sys, json, io

def process_mime(m):
  '''Split a MIME type string into a (type, subtype) pair for tab-mode output.

  Surrounding whitespace is stripped first.  A value containing a
  double quote, tab or newline is not split: it is returned whole,
  wrapped in double quotes (with any embedded double quote doubled),
  paired with an empty subtype.  Otherwise the value is split at its
  first '/'; the subtype is '' when there is no '/'.
  '''
  mime=m.strip() # Should be handled by CC :-(
  if '"' in mime:
    # sqlite3 expects an embedded " to be written as "" inside a quoted field
    return ('"'+mime.replace('"','""')+'"','')
  if any(ch in mime for ch in '\t\n'):
    # Quote the whole value so the separator characters stay inside one field
    return ('"'+mime+'"','')
  major,_,minor=mime.partition('/')
  return (major,minor)

# First command-line argument: echoed unchanged as the first column of every row
cdxno=sys.argv[1]

# One tab-separated output row per input line: cdxno, segment, type, https flag, langs
for line in sys.stdin:
  # Input lines have three space-separated parts; only the third (JSON) is used
  _key,_stamp,payload=line.rstrip().split(' ',maxsplit=2)
  props=json.loads(payload)
  path_parts=props['filename'].split('/',maxsplit=5)
  # Segment number: what follows the first '.' in the 4th path component
  seg_name=path_parts[3]
  seg=int(seg_name.split('.')[1])
  # Record type (w for warc, r for robots.txt, c for crawl diagnostics):
  #  first letter of the 5th path component
  rec_dir=path_parts[4]
  rec_type=rec_dir[0]
  # URI scheme, as a flag: 1 for https, 0 for anything else
  scheme=props['url'].partition(':')[0]
  is_https=1 if scheme=='https' else 0
  # Language(s): empty column when the property is absent or null
  langs=props.get('languages')
  langs='' if langs is None else langs
  print(cdxno,seg,rec_type,is_https,langs,sep='\t')