view bin/cdx2tsv.py @ 125:cd927e5c133f

extract Last Modified via cdx
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 28 Jun 2021 21:50:30 +0000
parents 863ea87be6bb
children a76cc0df2754
line wrap: on
line source

#!/usr/bin/env python3
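'''Extract named fields, with optional post-processing, in order,
   from each row of a Common Crawl index (CDX) file read from stdin

   Field specs on the command line are either an atom (a field name) or
   a parenthesised pair of an atom and a Python expression with
   free variable f, which is bound to the value of that field.
   A single field spec of '*' outputs every field of the row.
   A named field which is missing from a row is output as NA.
   For example
    > cdx2tsv.py mime '(url,f.split(":",maxsplit=1)[0])' < xyzzy.cdx
   will output media type and URI scheme, separated by a tab'''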
import json,sys

def parse_field_spec(spec):
  '''Turn one command-line field spec into its internal form.

     A plain atom (e.g. mime) is returned unchanged, as a string.
     A parenthesised spec (e.g. '(url,f.split(":")[0])') is returned as
     a tuple (name, function), where function evaluates the expression
     with its free variable f bound to the field value.

     Raises ValueError if a parenthesised spec has no comma.'''
  # startswith, unlike spec[0], is safe for an empty argument
  if not spec.startswith('('):
    return spec
  # Drop the enclosing parentheses, then split on the first comma only,
  #  so the expression itself may contain commas
  name,comma,expr=spec[1:-1].partition(',')
  if not comma:
    raise ValueError("field spec %r is not of the form (name,expression)"%spec)
  # SECURITY: eval runs arbitrary code taken from the command line.  That
  #  is the purpose of this tool, but it must never be given specs from
  #  an untrusted source.  Module globals are passed so that expressions
  #  can use whatever the script itself has imported.
  return (name,eval("lambda f:%s"%expr,globals()))

def format_row(line,fields):
  '''Return the tab-separated output for one index row.

     line is one row: key, timestamp and a JSON object, separated by
     single spaces (the JSON may itself contain spaces).
     fields is a list of parsed field specs (see parse_field_spec), or
     ["*"] to output every value of the JSON object in its own order.

     A named field which is absent from the row yields NA.  Every value
     is passed through str, so that non-string JSON values and
     non-string post-processing results do not break the join.

     Raises ValueError if the row does not have three parts or the
     third part is not valid JSON.'''
  # Only the JSON part is used, but unpacking all three checks the shape
  (_key,_stamp,jj)=line.rstrip().split(' ',maxsplit=2)
  ja=json.loads(jj)
  if fields==["*"]:
    return '\t'.join(str(v) for v in ja.values())
  res=[]
  for f in fields:
    if isinstance(f,str):
      v=ja.get(f,'NA')
    elif f[0] in ja:
      v=f[1](ja[f[0]])
    else:
      # Don't call the post-processing function on a missing field
      v='NA'
    res.append(str(v))
  return '\t'.join(res)

def main():
  '''Read index rows from stdin, write the fields named on the
     command line to stdout, one tab-separated line per row.'''
  fields=[parse_field_spec(a) for a in sys.argv[1:]]
  for l in sys.stdin:
    # Skip blank lines (e.g. a trailing one) rather than crash on them
    if l.strip():
      print(format_row(l,fields))

if __name__=='__main__':
  main()