Mercurial > hg > cc > cirrus_home
comparison bin/lang_by_seg.py @ 142:8af4e9937799
working, w. pickle
author | Henry S. Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Tue, 19 Oct 2021 12:57:50 +0000 |
parents | |
children | 2643a6825f17 |
comparison
equal
deleted
inserted
replaced
141:1e5f15a1e9fa | 142:8af4e9937799 |
---|---|
1 #!/usr/bin/env python3 | |
2 '''Replicate part of Jingrui MSc, tabulate a single index | |
3 file info of language vs segment | |
4 | |
5 Borrows from cdx2tsv | |
6 | |
7 Usage: uz cdx... | lang_by_seg.py outfilename''' | |
8 | |
9 import sys, json, pickle | |
10 from collections import defaultdict | |
11 | |
# Number of per-segment tables built; segment indices must fall in
# range(N_SEGMENTS) or tabulate() raises IndexError.
N_SEGMENTS = 100


def tabulate(lines, n_segments=N_SEGMENTS):
    '''Count records per (segment, languages value) from index lines.

    Each line is split on its first two spaces into key, timestamp and a
    JSON object.  The JSON object must have a 'filename' entry; its
    'languages' entry is optional and counted as 'NA' when absent.

    The segment index is taken from the fourth '/'-separated component
    of 'filename': the text between that component's first and second
    '.' (or up to its end) is converted with int().
    NOTE(review): this presumably matches Common Crawl paths of the form
    crawl-data/<crawl>/segments/<id>.<n>/... -- confirm against the data.

    lines: iterable of str, one index record per item
    n_segments: number of per-segment tables to allocate

    Returns a list of n_segments defaultdict(int), where entry [seg][lang]
    is the number of records seen for that segment and languages value.

    Raises ValueError (malformed line or JSON, non-numeric segment),
    KeyError (no 'filename') or IndexError (unexpected path shape, or
    segment index outside the table).
    '''
    segs = [defaultdict(int) for _ in range(n_segments)]
    for line in lines:
        # maxsplit=2 keeps the JSON payload intact even though it
        # contains spaces of its own
        (_key, _stamp, payload) = line.rstrip().split(' ', maxsplit=2)
        record = json.loads(payload)
        lang = record.get('languages', 'NA')
        seg = int(record['filename'].split('/')[3].split('.')[1])
        segs[seg][lang] += 1
    return segs


def main(argv=None):
    '''Tabulate stdin and pickle the result to the file named in argv[1].

    argv: argument vector, defaults to sys.argv; arguments after the
          output filename are ignored

    Returns a process exit status: 0 on success, 2 if no output filename
    was supplied.
    '''
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print('Usage: uz cdx... | lang_by_seg.py outfilename',
              file=sys.stderr)
        return 2
    # Tabulate first: the output file is only created/truncated once all
    # of the input has been parsed, so a bad record cannot leave an empty
    # pickle behind or clobber an earlier good one.
    segs = tabulate(sys.stdin)
    with open(argv[1], 'bw') as f:
        pickle.dump(segs, f)
    return 0


if __name__ == '__main__':
    sys.exit(main())
29 |