# NOTE(review): Mercurial changeset patch ("support field edit") for
# bin/cdx2tsv.py.  The hunk header below declares 12 old / 21 new lines, but
# the whole patch sits on one physical line here, so the blank context lines
# and the Python indentation inside the hunk cannot be fully recovered from
# this text -- presumably lost in an export/extraction step (TODO confirm).
# Restore it from the repository (node 863ea87be6bb) rather than re-wrapping
# it by hand.
#
# What the hunk changes, as far as the flattened text shows:
#   * the one-line module docstring becomes a multi-line one describing field
#     specs: either a bare field name, or "(name,expr)" where expr is a Python
#     expression over the free variable f;
#   * a new statement rewrites the argument list, turning every argument that
#     starts with "(" into a (name, function) pair: the text between the
#     outer parentheses is split at the first comma and "lambda f:<expr>" is
#     passed to eval(); all other arguments stay plain strings;
#   * the print(ja.keys()) call is removed;
#   * each output row is still tab-joined: plain-string fields use
#     ja.get(f,'NA'); pair fields apply the function to ja[name], or yield
#     'NA' when name is not a key of the parsed JSON object.
#
# NOTE(review): eval() executes arbitrary Python taken from the command line,
# so the patched script should only ever be given trusted arguments.  An
# empty-string argument raises IndexError at f[0], and a post-processing
# expression that returns a non-string makes '\t'.join(...) raise TypeError.
# HG changeset patch # User Henry S. Thompson # Date 1624894810 0 # Node ID 863ea87be6bbcb6377c91ce64b973482a55ab923 # Parent d0b544e53ddaf4b5fe9ac113dc471b34460bb6a4 support field edit diff -r d0b544e53dda -r 863ea87be6bb bin/cdx2tsv.py --- a/bin/cdx2tsv.py Mon Jun 28 14:01:41 2021 +0000 +++ b/bin/cdx2tsv.py Mon Jun 28 15:40:10 2021 +0000 @@ -1,12 +1,21 @@ #!/usr/bin/env python3 -'''Extract named fields, in order, from a Common Crawl index row''' +'''Extract named fields, with optional post-processing, in order, + from a Common Crawl index row + + Field specs on command line are either an atom or + a tuple of atom and expression with + free variable f + For example + > cdx2tsv.py mime '(url,f.split(":",maxsplit=1)[0])' < xyzzy.cdx + will output media type and URI scheme''' import json,sys fields=sys.argv[1:] +fields=[((lambda x,y:(x,eval("lambda f:%s"%y)))(*(f[1:-1].split(',',maxsplit=1))) if f[0]=='(' else f) for f in fields] for l in sys.stdin: (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2) ja=json.loads(jj) - print(ja.keys()) - print('\t'.join(ja.get(f,'NA') for f in (ja.keys() if fields==["*"] else fields)) -) + print('\t'.join((ja.get(f,'NA') if isinstance(f,str) else + ((f[1](ja[f[0]]) if f[0] in ja else 'NA'))) for f in (ja.keys() if fields==["*"] else fields))) +