changeset 162:e82981075b4a

moved to shared/bin
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 18 Jul 2022 18:11:46 +0100
parents df56132ef84a
children ef961d91eea5
files bin/cdx2tsv.py bin/uniq_merge.py
diffstat 2 files changed, 0 insertions(+), 53 deletions(-) [+]
line wrap: on
line diff
--- a/bin/cdx2tsv.py	Mon Jul 18 17:59:43 2022 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,40 +0,0 @@
-#!/usr/bin/env python3
-'''Extract named fields, with optional post-processing, in order,
-                         from a Common Crawl index row
-
-   Field specs on command line are either an atom or
-                                             a tuple of atom and expression with
-                                             free variable f
-   For example
-    > cdx2tsv.py mime '(url,f.split(":",maxsplit=1)[0])' < xyzzy.cdx
-   will output media type and URI scheme'''
-import json,sys
-
-if len(sys.argv)==1 or sys.argv[1][1]=='-':
-  print("""Reads index lines from stdin and extracts values from json dict part
-
-  Usage: cdx2tsv.py fieldspecs...
-
-  fieldspec is either a name or a quoted python tuple of a name and
-    an expression with free variable f which will be evaluated with f
-    having the field value.
-
-  For example 
-    cdx2tsv.py mime '(url,f.split(":",maxsplit=1)[0])' < xyzzy.cdx
-  will output media type and URI scheme""",
-        file=sys.stderr)
-  exit(1)
-
-fields=sys.argv[1:]
-
-stash={} # This can be used to save an expensive field computation for re-use:
-         # E.g. '(filename,(lambda g:(stash.__setitem__("T1",g) or g)[3].split(".")[1])(f.split("/",maxsplit=5)))'  '(filename,(stash["T1"])[4][0])
-
-fields=[((lambda x,y:(x,eval("lambda f:%s"%y,globals())))(*(f[1:-1].split(',',maxsplit=1))) if f[0]=='(' else f) for f in fields]
-for l in sys.stdin:
-  (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2)
-  ja=json.loads(jj)
-  #print(ja)
-  print('\t'.join((ja.get(f,'NA') if isinstance(f,str) else
-                   (f[1](ja[f[0]] if f[0] in ja else 'NA'))) for f in (ja.keys() if fields==["*"] else fields)))
-
--- a/bin/uniq_merge.py	Mon Jul 18 17:59:43 2022 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-#!/usr/bin/env python3
-# Merge counts by key from the output of "uniq -c" (or sus) and sort in descending order
-# An alternative to sus when the scale is too big for the initial sort, or if uniq -c already does a lot
-#  of the work
-# Usage: ... | uniq -c | uniq-merge.py
-import sys
-from collections import defaultdict
-s=defaultdict(int)
-for l in sys.stdin:
- (i,d)=l.split(maxsplit=1)
- s[d]+=int(i)
-for (d,n) in sorted(s.items(),key=lambda j:j[1],reverse=True):
- sys.stdout.write('%5d\t%s'%(n,d))