Mercurial > hg > cc > cirrus_home
view bin/lang_by_seg.py @ 153:2b59f3ef2294
add -c switch to btot
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Mon, 01 Nov 2021 21:23:13 +0000 |
parents | 8af4e9937799 |
children | 2643a6825f17 |
line wrap: on
line source
#!/usr/bin/env python3
'''Replicate part of Jingrui MSc, tabulate a single index file
 info of language vs segment

 Borrows from cdx2tsv

 Usage: uz cdx... | lang_by_seg.py outfilename

 Each stdin line has the form "<key> <timestamp> <json>".  The pickle
 written to outfilename is a list of N_SEGMENTS defaultdict(int), indexed
 by segment number, each mapping the record's 'languages' value ('NA'
 when the key is absent) to the number of records seen.'''
import sys
import json
import pickle
from collections import defaultdict

# Number of per-segment tables allocated; a segment number outside
# range(N_SEGMENTS) raises IndexError.
N_SEGMENTS = 100

USAGE = 'Usage: uz cdx... | lang_by_seg.py outfilename'


def _segment_of(filename):
    '''Return the segment number embedded in a record's filename.

    The fourth '/'-separated component is split on '.', and the piece
    after the first '.' is converted to int.
    Raises IndexError or ValueError if the path does not have that shape.'''
    return int(filename.split('/')[3].split('.')[1])


def _tabulate(lines, n_segments=N_SEGMENTS):
    '''Count records per (segment, languages value) over an iterable of lines.

    Returns a list of n_segments defaultdict(int).
    Raises ValueError for a line with fewer than three space-separated
    fields or an invalid JSON payload, KeyError if 'filename' is missing,
    and IndexError for a segment number >= n_segments.'''
    segs = [defaultdict(int) for _ in range(n_segments)]
    for line in lines:
        # Split at most twice: everything after the second space is the
        # JSON payload, which may itself contain spaces.
        _key, _stamp, payload = line.rstrip().split(' ', maxsplit=2)
        record = json.loads(payload)
        lang = record.get('languages', 'NA')
        segs[_segment_of(record['filename'])][lang] += 1
    return segs


def main(argv=None, stdin=None):
    '''Tabulate index lines from stdin and pickle the result to argv[1].

    argv defaults to sys.argv and stdin to sys.stdin; both are parameters
    so the script can be driven without touching process state.
    Exits with the usage message if no output filename is given.'''
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit(USAGE)
    stdin = sys.stdin if stdin is None else stdin
    # Build the whole table before opening the output, so that a failure
    # while reading input never truncates an existing output file.
    segs = _tabulate(stdin)
    with open(argv[1], 'wb') as f:
        pickle.dump(segs, f)


if __name__ == '__main__':
    main()