Mercurial > hg > cc > cirrus_home
view bin/cdx2tsv.py @ 161:df56132ef84a
x
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 18 Jul 2022 17:59:43 +0100 |
parents | ace590c2fdfc |
children |
line wrap: on
line source
#!/usr/bin/env python3
'''Extract named fields, with optional post-processing, in order, from a Common Crawl index row

Field specs on command line are either an atom or a tuple of atom and
expression with free variable f
For example
 > cdx2tsv.py mime '(url,f.split(":",maxsplit=1)[0])' < xyzzy.cdx
will output media type and URI scheme'''
import json
import sys

USAGE = """Reads index lines from stdin and extracts values from json dict part
Usage: cdx2tsv.py fieldspecs...
  fieldspec is either a name or
  a quoted python tuple of a name and an expression with free variable f
  which will be evaluated with f having the field value.
For example
 cdx2tsv.py mime '(url,f.split(":",maxsplit=1)[0])' < xyzzy.cdx
will output media type and URI scheme"""

stash = {}
# This can be used to save an expensive field computation for re-use:
# E.g. '(filename,(lambda g:(stash.__setitem__("T1",g) or g)[3].split(".")[1])(f.split("/",maxsplit=5)))' '(filename,(stash["T1"])[4][0])'


def _compile_fieldspec(spec):
    '''Turn one command-line fieldspec into its internal form.

    A plain name is returned unchanged.  A spec of the form "(name,expr)"
    becomes a pair (name, function), where the function evaluates expr with
    the free variable f bound to the field value.

    Raises ValueError if a parenthesised spec contains no comma.'''
    # startswith() rather than spec[0] so that an empty argument does not
    # raise IndexError.
    if not spec.startswith('('):
        return spec
    name, comma, expr = spec[1:-1].partition(',')
    if not comma:
        raise ValueError("fieldspec %r is not of the form '(name,expression)'" % spec)
    # NOTE: eval of command-line text is the documented purpose of this tool;
    # the expression comes from whoever invokes the script.  Do not feed it
    # fieldspecs from an untrusted source.
    # Module globals are passed so that expressions can refer to `stash`.
    return (name, eval("lambda f:%s" % expr, globals()))


def main(argv=None, instream=None, outstream=None):
    '''Convert index lines to tab-separated field values.

    argv      -- list of fieldspecs (without the program name);
                 defaults to sys.argv[1:]
    instream  -- iterable of input lines; defaults to sys.stdin
    outstream -- file-like object for the output; defaults to sys.stdout

    Each non-blank input line is split on its first two spaces into three
    parts, and the third part is parsed as a JSON dict.  One output line is
    written per input line, holding the requested values separated by tabs.
    A field missing from the dict is given the value 'NA' (which is also
    what a post-processing expression receives as f in that case).  The
    single fieldspec * outputs every value of the dict, in the dict's order.

    Returns the process exit status: 1 if the usage message was printed
    (no arguments, or a first argument starting with '-'), otherwise 0.'''
    args = sys.argv[1:] if argv is None else list(argv)
    if instream is None:
        instream = sys.stdin
    if outstream is None:
        outstream = sys.stdout

    # The original test was args[0][1]=='-', which raised IndexError for a
    # one-character argument such as '*' and did not recognise '-h'.
    if not args or args[0].startswith('-'):
        print(USAGE, file=sys.stderr)
        return 1

    fields = [_compile_fieldspec(spec) for spec in args]
    dump_all = fields == ["*"]

    for line in instream:
        line = line.rstrip()
        if not line:
            # A blank line (e.g. at the end of the input) has no fields.
            continue
        _key, _stamp, payload = line.split(' ', maxsplit=2)
        row = json.loads(payload)
        cells = []
        for spec in (list(row) if dump_all else fields):
            if isinstance(spec, str):
                value = row.get(spec, 'NA')
            else:
                name, postprocess = spec
                value = postprocess(row.get(name, 'NA'))
            # str() is a no-op for strings and lets expressions such as
            # len(f) produce output instead of breaking the join.
            cells.append(str(value))
        print('\t'.join(cells), file=outstream)
    return 0


if __name__ == '__main__':
    sys.exit(main())