comparison bin/cdx2tsv.py @ 121:863ea87be6bb

support field edit
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 28 Jun 2021 15:40:10 +0000
parents d0b544e53dda
children a76cc0df2754
comparison
equal deleted inserted replaced
120:d0b544e53dda 121:863ea87be6bb
1 #!/usr/bin/env python3 1 #!/usr/bin/env python3
2 '''Extract named fields, in order, from a Common Crawl index row''' 2 '''Extract named fields, with optional post-processing, in order,
3 from a Common Crawl index row
4
5 Field specs on command line are either an atom or
6 a tuple of atom and expression with
7 free variable f
8 For example
9 > cdx2tsv.py mime '(url,f.split(":",maxsplit=1)[0])' < xyzzy.cdx
10 will output media type and URI scheme'''
3 import json,sys 11 import json,sys
4 12
5 fields=sys.argv[1:] 13 fields=sys.argv[1:]
6 14
15 fields=[((lambda x,y:(x,eval("lambda f:%s"%y)))(*(f[1:-1].split(',',maxsplit=1))) if f[0]=='(' else f) for f in fields]
7 for l in sys.stdin: 16 for l in sys.stdin:
8 (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2) 17 (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2)
9 ja=json.loads(jj) 18 ja=json.loads(jj)
10 print(ja.keys()) 19 print('\t'.join((ja.get(f,'NA') if isinstance(f,str) else
11 print('\t'.join(ja.get(f,'NA') for f in (ja.keys() if fields==["*"] else fields)) 20 ((f[1](ja[f[0]]) if f[0] in ja else 'NA'))) for f in (ja.keys() if fields==["*"] else fields)))
12 ) 21