Mercurial > hg > cc > cirrus_home
changeset 142:8af4e9937799
working, w. pickle
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 19 Oct 2021 12:57:50 +0000 |
parents | 1e5f15a1e9fa |
children | ddff993994be |
files | bin/lang_by_seg.py |
diffstat | 1 files changed, 29 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python3
'''Replicate part of Jingrui MSc, tabulate a single index
file info of language vs segment

Borrows from cdx2tsv

Usage: uz cdx... | lang_by_seg.py outfilename'''

import json
import pickle
import sys
from collections import defaultdict

# Number of per-segment tables to allocate; segment numbers index into them.
N_SEGS = 100


def segment_of(filename):
    '''Return the segment number encoded in an index entry's filename.

    The number is the integer following the first '.' in the fourth
    '/'-separated component, e.g. 'a/b/c/1234.56/d.gz' gives 56.
    (Presumably a Common Crawl WARC path -- confirm against the index.)

    Raises IndexError or ValueError if the filename lacks that shape.
    '''
    return int(filename.split('/')[3].split('.')[1])


def tabulate(lines, n_segs=N_SEGS):
    '''Count index entries per (segment, language value).

    lines:  iterable of text lines, each "key stamp json" separated by
            single spaces; only the first two spaces split, so the JSON
            part may itself contain spaces.
    n_segs: how many segment tables to allocate (default 100).

    Returns a list of n_segs defaultdict(int) objects; element s maps
    the entry's 'languages' value ('NA' when the key is absent) to the
    number of entries seen for segment s.

    Blank lines are skipped.  A segment number >= n_segs raises
    IndexError, and malformed JSON raises json.JSONDecodeError.
    '''
    segs = [defaultdict(int) for _ in range(n_segs)]
    for raw in lines:
        line = raw.rstrip()
        if not line:
            # e.g. a trailing empty line in the stream: nothing to count
            continue
        _key, _stamp, payload = line.split(' ', maxsplit=2)
        entry = json.loads(payload)
        lang = entry.get('languages', 'NA')
        segs[segment_of(entry['filename'])][lang] += 1
    return segs


def main(argv=None, stream=None):
    '''Read index lines from stdin and pickle the table to argv[1].

    argv:   argument vector, defaults to sys.argv.
    stream: input line iterable, defaults to sys.stdin.

    Exits with the usage text on stderr when no output filename is given.
    '''
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit(__doc__)
    # Tabulate before opening the output, so that a failure part-way
    # through the input cannot leave a truncated, unloadable pickle file.
    segs = tabulate(sys.stdin if stream is None else stream)
    with open(argv[1], 'wb') as f:
        pickle.dump(segs, f)


if __name__ == '__main__':
    main()