142
|
1 #!/usr/bin/env python3
|
|
2 '''Replicate part of Jingrui MSc, tabulate a single index
|
|
3 file info of language vs segment
|
|
4
|
|
5 Borrows from cdx2tsv
|
|
6
|
|
7 Usage: uz cdx... | lang_by_seg.py outfilename'''
|
|
8
|
|
9 import sys, json, pickle
|
|
10 from collections import defaultdict
|
|
11
|
|
def tabulate_languages(lines, n_segments=100):
    """Count index records per language value for each segment.

    Each input line is split on its first two spaces into three fields;
    the third field is parsed as a JSON object. The count is keyed by that
    object's 'languages' value ('NA' when the key is absent) and stored
    under the segment number taken from its 'filename' value.

    Args:
        lines: Iterable of index lines (strings); trailing whitespace is
            stripped before parsing.
        n_segments: Number of per-segment tables to allocate.

    Returns:
        A list of ``n_segments`` ``defaultdict(int)`` objects, where entry
        ``i`` maps each language value to its record count in segment ``i``.

    Raises:
        ValueError: If a line has fewer than three space-separated fields
            or its third field is not valid JSON.
        KeyError: If a record has no 'filename' key.
        IndexError: If the filename has too few components or the segment
            number is not below ``n_segments``.
    """
    segs = [defaultdict(int) for _ in range(n_segments)]
    for line in lines:
        # Only the JSON payload is used; the first two fields are ignored.
        _key, _stamp, payload = line.rstrip().split(' ', maxsplit=2)
        record = json.loads(payload)
        lang = record.get('languages', 'NA')
        # The segment number is the text after the first '.' in the 4th
        # '/'-separated component of the filename.
        # NOTE(review): presumably a path such as
        # '<root>/<crawl>/segments/<timestamp>.<seg>/...' -- confirm.
        seg = int(record['filename'].split('/')[3].split('.')[1])
        segs[seg][lang] += 1
    return segs


def main(argv=None, lines=None):
    """Tabulate language by segment from index lines and pickle the result.

    Args:
        argv: Argument vector; defaults to ``sys.argv``. ``argv[1]`` is the
            output file name.
        lines: Iterable of index lines; defaults to ``sys.stdin``.

    Returns:
        0 on success, 2 when the output file name is missing.
    """
    if argv is None:
        argv = sys.argv
    if lines is None:
        lines = sys.stdin
    if len(argv) < 2:
        print('Usage: uz cdx... | lang_by_seg.py outfilename',
              file=sys.stderr)
        return 2

    # Build the whole table before touching the output file, so that bad
    # input cannot truncate a previously written result.
    segs = tabulate_languages(lines)

    with open(argv[1], 'wb') as f:
        pickle.dump(segs, f)
    return 0


if __name__ == '__main__':
    sys.exit(main())
|
|
29
|