#!/usr/bin/env python3
# Recovered from an HG changeset patch:
#   User:    Henry S. Thompson
#   Date:    1624888901 0  (Mon Jun 28 14:01:41 2021 +0000)
#   Node ID: d0b544e53ddaf4b5fe9ac113dc471b34460bb6a4
#   Parent:  bc958b776fb87e3a25b577bfcefbcb8a98b33196
#   Message: for use in processing CC index files
#   Path:    bin/cdx2tsv.py (new file)
'''Extract named fields, in order, from a Common Crawl index row

Usage: cdx2tsv.py FIELD... < index-rows

Each input row has the form "<key> <timestamp> <json-object>".  For every
row, one line is written to standard output holding the value of each
requested FIELD from the JSON object, separated by tabs.  A field that is
absent from the object is written as NA.  The single argument * selects
every field of the object, in the order it appears in the JSON.
'''
import json
import sys

# Placeholder written for a requested field that the row does not contain.
MISSING = 'NA'


def row_to_tsv(line, fields):
    '''Return the tab-separated values of *fields* for one index row.

    line   -- one index row, "<key> <timestamp> <json-object>", with or
              without trailing whitespace/newline
    fields -- sequence of field names to extract, in output order; the
              one-element sequence ["*"] means every key of the JSON object

    Raises ValueError if the row does not have three space-separated parts
    or if its third part is not valid JSON (json.JSONDecodeError is a
    subclass of ValueError).
    '''
    # The JSON part may itself contain spaces, so split at most twice.
    _key, _stamp, payload = line.rstrip().split(' ', maxsplit=2)
    record = json.loads(payload)
    wanted = record.keys() if list(fields) == ['*'] else fields
    # str() so that non-string JSON values (numbers, booleans, null) do not
    # make str.join raise TypeError.
    return '\t'.join(str(record.get(name, MISSING)) for name in wanted)


def convert(lines, fields):
    '''Yield one TSV line (without newline) per non-blank row in *lines*.

    lines  -- iterable of index rows
    fields -- as for row_to_tsv
    '''
    for line in lines:
        # A blank line (e.g. a stray trailing newline) carries no row.
        if not line.strip():
            continue
        yield row_to_tsv(line, fields)


def main(argv=None, stdin=None, stdout=None):
    '''Command-line entry point.

    argv   -- field names; defaults to sys.argv[1:]
    stdin  -- iterable of input rows; defaults to sys.stdin
    stdout -- file-like object to write to; defaults to sys.stdout
    '''
    fields = sys.argv[1:] if argv is None else list(argv)
    source = sys.stdin if stdin is None else stdin
    sink = sys.stdout if stdout is None else stdout
    # NOTE: an earlier revision also printed the row's keys before each
    # output line; that was debugging output and corrupted the TSV stream.
    for tsv in convert(source, fields):
        print(tsv, file=sink)


if __name__ == '__main__':
    main()