comparison bin/cdx2tsv.py @ 120:d0b544e53dda

for use in processing CC index files
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 28 Jun 2021 14:01:41 +0000
parents
children 863ea87be6bb
comparison
equal deleted inserted replaced
119:bc958b776fb8 120:d0b544e53dda
#!/usr/bin/env python3
'''Extract named fields, in order, from a Common Crawl index row

Usage: cdx2tsv.py FIELD... < index-rows

Each input row is expected to hold three space-separated parts: a key,
a timestamp and a JSON object.  For every row one line is written to
standard output containing the requested JSON properties, separated by
tabs.  A property missing from a row is written as NA.  Passing the
single argument * selects every property of the row, in the order it
appears in the JSON object.
'''
import json
import sys


def extract_fields(row, fields):
    '''Return the tab-separated values of fields taken from one index row.

    row is a single input line (a trailing newline is allowed) and
    fields is the list of property names to extract, or ["*"] for all of
    them.  Missing properties yield 'NA'.  Values are joined as-is, so
    they must be strings.  A row with fewer than three space-separated
    parts raises ValueError; an invalid JSON part raises
    json.JSONDecodeError.
    '''
    # Only the JSON part is used; maxsplit=2 keeps any spaces inside
    # the JSON text intact.
    _key, _stamp, payload = row.rstrip().split(' ', maxsplit=2)
    record = json.loads(payload)
    # Iterating a dict yields its keys, so the wildcard case needs no .keys()
    names = record if fields == ["*"] else fields
    return '\t'.join(record.get(name, 'NA') for name in names)


def main(fields=None, rows=None, out=None):
    '''Convert every row of rows to a TSV line written to out.

    fields defaults to the command-line arguments, rows to standard
    input and out to standard output.
    '''
    if fields is None:
        fields = sys.argv[1:]
    if rows is None:
        rows = sys.stdin
    for row in rows:
        # Only the extracted values are written: the earlier debugging
        # print of the row's keys corrupted the TSV output.
        print(extract_fields(row, fields), file=out)


if __name__ == '__main__':
    main()