Mercurial > hg > cc > cirrus_home
annotate bin/cdx2tsv.py @ 143:ddff993994be
too clever by half, keys won't work in parallel for e.g. media types
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Wed, 20 Oct 2021 15:47:55 +0000 |
parents | a76cc0df2754 |
children | 66d17f7410f2 |
rev | line source |
---|---|
120
d0b544e53dda
for use in processing CC index files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
1 #!/usr/bin/env python3 |
121 | 2 '''Extract named fields, with optional post-processing, in order, |
3 from a Common Crawl index row | |
4 | |
5 Field specs on command line are either an atom or | |
6 a tuple of atom and expression with | |
7 free variable f | |
8 For example | |
9 > cdx2tsv.py mime '(url,f.split(":",maxsplit=1)[0])' < xyzzy.cdx | |
10 will output media type and URI scheme''' | |
120
d0b544e53dda
for use in processing CC index files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
11 import json,sys |
d0b544e53dda
for use in processing CC index files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
12 |
# Print usage on stderr and stop when no field specs were given or the
# first argument looks like an option (-h, --help, ...).
# startswith() is used instead of indexing the second character so that a
# one-character first argument such as '*' cannot raise IndexError.
if len(sys.argv)==1 or sys.argv[1].startswith('-'):
    print("""Reads index lines from stdin and extracts values from json dict part

Usage: cdx2tsv.py fieldspecs...

fieldspec is either a name or a quoted python tuple of a name and
an expression with free variable f which will be evaluated with f
having the field value.

For example
cdx2tsv.py mime '(url,f.split(":",maxsplit=1)[0])' < xyzzy.cdx
will output media type and URI scheme""",
          file=sys.stderr)
    # sys.exit rather than the bare exit(), which only exists when the
    # site module has been loaded.
    sys.exit(1)
27 | |
120
d0b544e53dda
for use in processing CC index files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
def parse_fieldspec(spec):
    '''Convert one command-line field spec to the form used when printing rows.

    A spec that does not start with '(' is a plain field name and is
    returned unchanged (this includes the empty string and '*').

    A spec of the form '(name,expression)' is returned as a pair
    (name, function), where function is "lambda f: expression", i.e.
    the expression is evaluated with f bound to the field's value.
    The first and last characters of the spec are dropped and the rest is
    split at the first comma only, so the expression may itself contain commas.

    Raises ValueError if a parenthesised spec contains no comma.'''
    if not spec.startswith('('):
        return spec
    parts=spec[1:-1].split(',',maxsplit=1)
    if len(parts)!=2:
        raise ValueError("field spec %r is not of the form (name,expression)"%spec)
    name,expr=parts
    # NOTE(security): eval runs arbitrary Python taken from the command line.
    # That is the point of this tool, but it must never be fed specs from
    # an untrusted source.
    return (name,eval("lambda f:%s"%expr))

# One entry per command-line argument: a str, or a (name, function) pair.
fields=[parse_fieldspec(spec) for spec in sys.argv[1:]]
120
d0b544e53dda
for use in processing CC index files
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff
changeset
|
# Output one tab-separated row per input line.
#
# Each non-blank input line is split at its first two spaces into three
# parts; only the third is used and it is parsed as a JSON dict.
#
# For a plain field name the dict value is output, or 'NA' if the key is
# absent.  For a (name, function) pair the function is applied to the value,
# or 'NA' is output if the key is absent.
#
# If the only field spec is '*', the values of all keys of the row's own
# dict are output in that dict's order, so columns are only aligned
# across rows when every row has the same keys in the same order.
all_keys=fields==["*"]   # loop-invariant, so tested once
for l in sys.stdin:
    l=l.rstrip()
    if not l:
        # A blank line cannot be unpacked into three parts: skip it
        continue
    (key,stamp,jj)=l.split(' ',maxsplit=2)
    ja=json.loads(jj)
    # str() because join() only accepts strings, whereas a JSON value or the
    # result of a user expression (e.g. len(f)) may be of any type.
    print('\t'.join(str(ja.get(f,'NA') if isinstance(f,str) else
                        (f[1](ja[f[0]]) if f[0] in ja else 'NA'))
                    for f in (ja.keys() if all_keys else fields)))
36 |