annotate bin/lang_by_seg.py @ 142:8af4e9937799

working, w. pickle
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 19 Oct 2021 12:57:50 +0000
parents
children 2643a6825f17
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
142
8af4e9937799 working, w. pickle
Henry S. Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
#!/usr/bin/env python3
'''Replicate part of Jingrui MSc, tabulate a single index
file info of language vs segment

Borrows from cdx2tsv

Reads index lines from stdin, counts how often each 'languages' value
occurs per segment, and pickles the resulting table to outfilename.

Usage: uz cdx... | lang_by_seg.py outfilename'''

import sys
import json
import pickle
from collections import defaultdict

# Number of per-segment tables allocated; a segment number outside
# 0..N_SEGMENTS-1 raises IndexError when it is tallied.
N_SEGMENTS = 100

USAGE = 'Usage: uz cdx... | lang_by_seg.py outfilename'


def segment_of(filename):
    '''Return the segment number embedded in an index entry's filename.

    The number is the second '.'-separated field of the fourth
    '/'-separated path component.
    NOTE(review): presumably paths look like
    <a>/<b>/segments/<stamp>.<seg>/... -- confirm against real index data.

    Raises IndexError if the path has too few components and ValueError
    if the field is not an integer.'''
    return int(filename.split('/')[3].split('.')[1])


def tabulate(lines, n_segments=N_SEGMENTS):
    '''Count 'languages' values per segment over an iterable of index lines.

    Each line must hold three space-separated fields; only the third,
    a JSON object, is used.  Its 'filename' entry selects the segment
    (see segment_of) and its 'languages' entry, or 'NA' when absent,
    is the key that gets counted.

    Returns a list of n_segments defaultdict(int), indexed by segment
    number, mapping languages value -> number of lines seen.'''
    segs = [defaultdict(int) for _ in range(n_segments)]
    for line in lines:
        # maxsplit=2 keeps any spaces inside the JSON payload intact;
        # the first two fields are not needed for the tally.
        _key, _stamp, props_json = line.rstrip().split(' ', maxsplit=2)
        props = json.loads(props_json)
        lang = props.get('languages', 'NA')
        segs[segment_of(props['filename'])][lang] += 1
    return segs


def main(argv=None):
    '''Tabulate stdin and pickle the table to the file named by argv[0].

    argv defaults to sys.argv[1:]; only its first element is used.
    Exits with a usage message on stderr if no output filename is given.'''
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit(USAGE)
    fn = args[0]
    # The output file is opened before stdin is consumed, so an
    # unwritable path fails immediately rather than after a long read.
    with open(fn, 'bw') as f:
        segs = tabulate(sys.stdin)
        # Debugging aids:
        #for i in range(100):
        #    print(i,sorted(segs[i].items(),reverse=True,key=lambda x:x[1]))
        #print([dict(seg) for seg in segs])
        pickle.dump(segs, f)


if __name__ == '__main__':
    main()