changeset 121:863ea87be6bb

support field edit
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 28 Jun 2021 15:40:10 +0000
parents d0b544e53dda
children 9de06ae73372
files bin/cdx2tsv.py
diffstat 1 files changed, 13 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/bin/cdx2tsv.py	Mon Jun 28 14:01:41 2021 +0000
+++ b/bin/cdx2tsv.py	Mon Jun 28 15:40:10 2021 +0000
@@ -1,12 +1,21 @@
 #!/usr/bin/env python3
-'''Extract named fields, in order, from a Common Crawl index row'''
+'''Extract named fields, with optional post-processing, in order,
+                         from a Common Crawl index row
+
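+   Each field spec on the command line is either a bare field name or
+   a parenthesised pair (name,expression), where expression is a Python
+   expression in the free variable f, which is bound to that field's value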
+   For example
+    > cdx2tsv.py mime '(url,f.split(":",maxsplit=1)[0])' < xyzzy.cdx
+   will output media type and URI scheme'''
 import json,sys
 
 fields=sys.argv[1:]
 
+fields=[((lambda x,y:(x,eval("lambda f:%s"%y)))(*(f[1:-1].split(',',maxsplit=1))) if f[0]=='(' else f) for f in fields]
 for l in sys.stdin:
   (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2)
   ja=json.loads(jj)
-  print(ja.keys())
-  print('\t'.join(ja.get(f,'NA') for f in (ja.keys() if fields==["*"] else fields))
-)
+  print('\t'.join((ja.get(f,'NA') if isinstance(f,str) else
+                   ((f[1](ja[f[0]]) if f[0] in ja else 'NA'))) for f in (ja.keys() if fields==["*"] else fields)))
+