changeset 120:d0b544e53dda

for use in processing CC index files
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 28 Jun 2021 14:01:41 +0000
parents bc958b776fb8
children 863ea87be6bb
files bin/cdx2tsv.py
diffstat 1 files changed, 12 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/cdx2tsv.py	Mon Jun 28 14:01:41 2021 +0000
@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+'''Extract named fields, in order, from a Common Crawl index row
+Usage: cdx2tsv.py FIELD... < index (a lone '*' selects every field of each row)'''
+import json,sys
+
+fields=sys.argv[1:]
+
+for l in sys.stdin:
+  # Row is: key, stamp, JSON object; maxsplit=2 keeps spaces inside the JSON intact
+  (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2)
+  ja=json.loads(jj)
+  print('\t'.join(str(ja.get(f,'NA')) for f in (ja if fields==["*"] else fields)))