Mercurial > hg > cc > cirrus_home
changeset 162:e82981075b4a
moved to shared/bin
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 18 Jul 2022 18:11:46 +0100 |
parents | df56132ef84a |
children | ef961d91eea5 |
files | bin/cdx2tsv.py bin/uniq_merge.py |
diffstat | 2 files changed, 0 insertions(+), 53 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python3
# bin/cdx2tsv.py
"""Extract named fields, with optional post-processing, in order,
 from a Common Crawl index row

 Field specs on command line are either an atom or
  a tuple of atom and expression with
  free variable f
 For example
 > cdx2tsv.py mime '(url,f.split(":",maxsplit=1)[0])' < xyzzy.cdx
 will output media type and URI scheme

 The single field spec '*' outputs every value of the row's json dict.
"""
import json
import sys

USAGE = """Reads index lines from stdin and extracts values from json dict part

 Usage: cdx2tsv.py fieldspecs...

 fieldspec is either a name or a quoted python tuple of a name and
  an expression with free variable f which will be evaluated with f
  having the field value.

 For example
   cdx2tsv.py mime '(url,f.split(":",maxsplit=1)[0])' < xyzzy.cdx
 will output media type and URI scheme"""

# Must stay a module-level global: field expressions are evaluated against
# globals() and may use it to save an expensive field computation for re-use:
# E.g. '(filename,(lambda g:(stash.__setitem__("T1",g) or g)[3].split(".")[1])(f.split("/",maxsplit=5)))' '(filename,(stash["T1"])[4][0])'
stash = {}


def parse_fieldspec(spec):
    """Turn one command-line field spec into its internal form.

    A plain name is returned unchanged.  A spec of the form '(name,expr)'
    becomes a (name, function) pair where the function evaluates expr with
    the free variable f bound to the field value.

    Raises ValueError if a parenthesised spec contains no comma.
    """
    if not spec.startswith('('):
        return spec
    # Drop the surrounding parentheses; only the first comma separates the
    # field name from the expression, which may itself contain commas.
    name, comma, expr = spec[1:-1].partition(',')
    if not comma:
        raise ValueError("field spec %r is not of the form '(name,expr)'" % spec)
    # NOTE: eval of command-line text is the documented purpose of this tool;
    # never feed it field specs from an untrusted source.
    return (name, eval("lambda f:%s" % expr, globals()))


def extract_row(line, fields):
    """Return the tab-separated output line for one index row.

    line is 'key stamp json' separated by single spaces; only the json dict
    part is used.  fields is a list of parsed field specs, or ["*"] to output
    every value of the dict in its own order.  A field missing from the dict
    is treated as the string 'NA' (and passed as such to any expression).

    Raises ValueError if the line does not have three space-separated parts
    or the json part is malformed.
    """
    _key, _stamp, payload = line.rstrip().split(' ', maxsplit=2)
    record = json.loads(payload)
    specs = list(record) if fields == ["*"] else fields
    values = []
    for spec in specs:
        if isinstance(spec, str):
            value = record.get(spec, 'NA')
        else:
            name, func = spec
            value = func(record.get(name, 'NA'))
        # Values need not be strings (json numbers, or an expression such as
        # len(f)); str.join would reject them.
        values.append(value if isinstance(value, str) else str(value))
    return '\t'.join(values)


def main(argv=None):
    """Run the filter from stdin to stdout; return the process exit status.

    argv is the list of field specs (defaults to sys.argv[1:]).  With no
    arguments, or a first argument that looks like an option, the usage
    message is written to stderr and 1 is returned.
    """
    args = sys.argv[1:] if argv is None else argv
    # startswith is safe for one-character arguments such as '*', where
    # indexing the second character would raise IndexError.
    if not args or args[0].startswith('-'):
        print(USAGE, file=sys.stderr)
        return 1
    fields = [parse_fieldspec(arg) for arg in args]
    for line in sys.stdin:
        print(extract_row(line, fields))
    return 0


if __name__ == '__main__':
    sys.exit(main())
#!/usr/bin/env python3
# bin/uniq_merge.py
# Merge counts by key from the output of "uniq -c" (or sus) and sort in descending order
# An alternative to sus when the scale is too big for the initial sort, or if uniq -c already does a lot
#  of the work
# Usage: ... | uniq -c | uniq_merge.py
import sys
from collections import defaultdict
from operator import itemgetter


def merge_counts(lines):
    """Sum the counts of identical keys and rank the keys by total.

    Each line is '<count> <key>' as written by "uniq -c": optional leading
    whitespace, an integer, whitespace, then the key (which may itself
    contain whitespace).  Returns a list of (key, total) pairs in descending
    order of total; keys with equal totals keep their first-seen order.

    Raises ValueError if a non-blank line does not start with an integer.
    """
    totals = defaultdict(int)
    for line in lines:
        # Strip the terminator so a final line without a newline merges with
        # earlier occurrences of the same key.
        parts = line.rstrip('\n').split(maxsplit=1)
        if not parts:
            continue  # entirely blank line: nothing to count
        # "uniq -c" emits a count with an empty key for blank input lines.
        key = parts[1] if len(parts) == 2 else ''
        totals[key] += int(parts[0])
    return sorted(totals.items(), key=itemgetter(1), reverse=True)


def main():
    """Read "uniq -c" style lines from stdin, write merged totals to stdout."""
    for key, total in merge_counts(sys.stdin):
        sys.stdout.write('%5d\t%s\n' % (total, key))


if __name__ == '__main__':
    main()