view bin/cdx2tsv.py @ 120:d0b544e53dda

for use in processing CC index files
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 28 Jun 2021 14:01:41 +0000
parents
children 863ea87be6bb
line wrap: on
line source

#!/usr/bin/env python3
'''Extract named fields, in order, from a Common Crawl index row'''
import json,sys

def extract_fields(line, fields):
  '''Return the requested fields of one index row as a tab-separated string.

  line   -- one index row: two space-separated leading tokens followed by
            a JSON object (which may itself contain spaces)
  fields -- names of the JSON properties to emit, in output order;
            the single-element list ["*"] selects every property, in the
            order in which they occur in the row

  A property missing from the row is rendered as 'NA'.  String values are
  emitted unchanged; any other JSON value (number, boolean, null, nested
  structure) is emitted in its JSON spelling so that the join cannot fail.

  Raises ValueError if the row does not have three space-separated parts
  or if the third part is not valid JSON (json.JSONDecodeError is a
  subclass of ValueError).
  '''
  # Only split twice: the JSON payload legitimately contains spaces
  (_key,_stamp,payload)=line.rstrip().split(' ',maxsplit=2)
  record=json.loads(payload)
  wanted=record.keys() if list(fields)==["*"] else fields
  values=(record.get(name,'NA') for name in wanted)
  return '\t'.join(v if isinstance(v,str) else json.dumps(v) for v in values)

def main():
  '''Filter stdin to stdout, one TSV line per non-blank index row.

  The field names to extract are taken from the command-line arguments.
  '''
  fields=sys.argv[1:]
  for line in sys.stdin:
    if not line.strip():
      # Tolerate blank lines (e.g. a trailing newline at end of input)
      continue
    print(extract_fields(line,fields))

if __name__=='__main__':
  main()