view bin/cdx2tsv.py @ 159:c3c3dd60b8a8

demo of slurm usage using cdx2tsv.py
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Wed, 06 Jul 2022 18:07:34 +0100
parents ace590c2fdfc
children
line wrap: on
line source

#!/usr/bin/env python3
'''Extract named fields, with optional post-processing, in order,
   from a Common Crawl index row

   Field specs on command line are either an atom or a tuple of atom
   and expression with free variable f
   For example
    > cdx2tsv.py mime '(url,f.split(":",maxsplit=1)[0])' < xyzzy.cdx
   will output media type and URI scheme'''
import json,sys

def usage_requested(argv):
  '''Return True if the usage message should be shown for command line argv

     argv is the full argument list, including the program name.
     Usage is wanted when no field specs are given at all, or when the
     first argument starts with '-' (e.g. -h or --help).'''
  # startswith is safe for an argument of any length: the previous test,
  # argv[1][1]=='-', raised IndexError for a one-character argument
  # such as '*'
  return len(argv)==1 or argv[1].startswith('-')

if __name__=='__main__' and usage_requested(sys.argv):
  print("""Reads index lines from stdin and extracts values from json dict part

  Usage: cdx2tsv.py fieldspecs...

  fieldspec is either a name or a quoted python tuple of a name and
    an expression with free variable f which will be evaluated with f
    having the field value.

  For example 
    cdx2tsv.py mime '(url,f.split(":",maxsplit=1)[0])' < xyzzy.cdx
  will output media type and URI scheme""",
        file=sys.stderr)
  # sys.exit rather than the exit() helper, which only exists when the
  # site module has been loaded
  sys.exit(1)

stash={} # This can be used to save an expensive field computation for re-use:
         # E.g. '(filename,(lambda g:(stash.__setitem__("T1",g) or g)[3].split(".")[1])(f.split("/",maxsplit=5)))'  '(filename,(stash["T1"])[4][0])'

def parse_fieldspec(spec):
  '''Convert one command-line field spec into the form used for each row

     A spec which does not start with '(' is a plain field name and is
     returned unchanged.  Otherwise it is taken to have the form
     (name,expression): the result is a pair of the name (with any
     surrounding whitespace removed) and a one-argument function of f
     which evaluates the expression.

     Raises ValueError if a parenthesised spec contains no comma.'''
  if not spec.startswith('('):
    # startswith also copes with an empty argument, for which spec[0]
    # would raise IndexError
    return spec
  # Drop the first and last characters (the parentheses) and divide at
  # the first comma only, since the expression may itself contain commas
  name,comma,expr=spec[1:-1].partition(',')
  if not comma:
    raise ValueError("field spec %s should have the form (name,expression)"%spec)
  # NOTE: eval executes arbitrary code taken from the command line, so
  # field specs must only ever come from a trusted user.  The module
  # globals are supplied so that expressions can refer to stash.
  return (name.strip(),eval("lambda f:%s"%expr,globals()))

fields=[parse_fieldspec(a) for a in sys.argv[1:]]
def row_values(ja,fields):
  '''Return the list of output columns, as strings, for one index row

     ja is the dict decoded from the JSON part of the row.
     fields is a list whose items are either a field name or a
     (name, function) pair; the list ["*"] selects every key of ja, in
     the order they occur there.
     A field which is missing from ja is output as 'NA'; for a pair, the
     function is applied to 'NA' instead of to the field value.'''
  if fields==["*"]:
    fields=list(ja)
  # str() allows expressions which do not return a string (e.g. len(f))
  # and non-string JSON values to be output: join accepts only strings,
  # and str() leaves a value which is already a string unchanged
  return [str(ja.get(f,'NA') if isinstance(f,str) else f[1](ja.get(f[0],'NA')))
          for f in fields]

if __name__=='__main__':
  # The loop stays at module level, with its original variable names, so
  # that l, key, stamp, jj and ja are still module globals while each
  # row's fields are being computed
  for l in sys.stdin:
    # Each line is two space-separated tokens followed by a JSON dict,
    # which may itself contain spaces, hence maxsplit
    (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2)
    ja=json.loads(jj)
    print('\t'.join(row_values(ja,fields)))