#!/usr/bin/env python3
# Provenance: recovered from a Mercurial changeset export.
#   User:    Henry S. Thompson
#   Date:    1634648270 0
#   Node ID: 8af4e9937799b82febcb2a1bc9901f09ea5647aa
#   Parent:  1e5f15a1e9fa98d58964c027886575a6ead07e48
#   Message: working, w. pickle
#   File:    bin/lang_by_seg.py (new file in that changeset)
'''Replicate part of Jingrui MSc, tabulate a single index
file info of language vs segment

Borrows from cdx2tsv

Usage: uz cdx... | lang_by_seg.py outfilename'''

import json
import pickle
import sys
from collections import defaultdict

# Number of per-segment tables written to the pickle.  The original script
# hard-coded 100; it is kept as the default so existing readers of the
# output see the same shape.
N_SEGMENTS = 100

USAGE = 'Usage: uz cdx... | lang_by_seg.py outfilename'


def segment_of(filename):
    '''Return the integer segment number encoded in an index `filename`.

    The number is taken from the fourth '/'-separated path component,
    as the text after that component's first '.'.

    Raises IndexError if the path has fewer than four components or the
    fourth one contains no '.', and ValueError if the extracted text is
    not an integer.
    '''
    return int(filename.split('/')[3].split('.')[1])


def tabulate(lines, n_segments=N_SEGMENTS):
    '''Count language values per segment over an iterable of index lines.

    Each non-blank line must be "key timestamp json", separated by single
    spaces; only the JSON object is used.  Its 'filename' entry selects the
    segment (see segment_of) and its 'languages' entry, or 'NA' when that
    entry is absent, is the value being counted.

    Returns a list of `n_segments` defaultdict(int), indexed by segment
    number, each mapping a languages value to its number of occurrences.

    Raises ValueError for a line with fewer than three fields or invalid
    JSON, KeyError if 'filename' is missing, and IndexError if the segment
    number falls outside range(n_segments).
    '''
    segs = [defaultdict(int) for _ in range(n_segments)]
    for line in lines:
        # Tolerate blank lines (e.g. a trailing newline in the stream)
        # rather than failing on the unpack below.
        if not line.strip():
            continue
        _key, _stamp, payload = line.rstrip().split(' ', maxsplit=2)
        record = json.loads(payload)
        lang = record.get('languages', 'NA')
        seg = segment_of(record['filename'])
        # Explicit range check: a negative number would otherwise be
        # accepted by list indexing and silently counted from the end.
        if not 0 <= seg < n_segments:
            raise IndexError('segment %d out of range 0..%d'
                             % (seg, n_segments - 1))
        segs[seg][lang] += 1
    return segs


def main(argv=None, stdin=None):
    '''Tabulate index lines from `stdin` and pickle the result.

    `argv` defaults to sys.argv and `stdin` to sys.stdin.  argv[1] names
    the output file, which receives a pickle of the list returned by
    tabulate.  Returns the process exit status: 0 on success, 2 if the
    output file name is missing.
    '''
    argv = sys.argv if argv is None else argv
    stdin = sys.stdin if stdin is None else stdin
    if len(argv) < 2:
        print(USAGE, file=sys.stderr)
        return 2
    # Consume and validate all input before touching the output file, so
    # a bad line cannot leave a truncated pickle behind.
    segs = tabulate(stdin)
    with open(argv[1], 'bw') as f:
        pickle.dump(segs, f)
    return 0


if __name__ == '__main__':
    sys.exit(main())