changeset 142:8af4e9937799

working, w. pickle
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Tue, 19 Oct 2021 12:57:50 +0000
parents 1e5f15a1e9fa
children ddff993994be
files bin/lang_by_seg.py
diffstat 1 files changed, 29 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/lang_by_seg.py	Tue Oct 19 12:57:50 2021 +0000
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+'''Replicate part of Jingrui MSc: tabulate, for a single index
+file, counts of language vs segment
+
+Borrows from cdx2tsv
+
+Usage: uz cdx... | lang_by_seg.py outfilename
+Pickles a list of 100 defaultdict(int), indexed by segment, to outfilename'''
+
+import sys, json, pickle
+from collections import defaultdict
+
+if len(sys.argv)<2:
+  sys.exit(__doc__)
+fn=sys.argv[1]
+segs=[defaultdict(int) for i in range(100)]
+
+for l in sys.stdin:
+  (key,stamp,jj)=l.rstrip().split(' ',maxsplit=2)
+  ja=json.loads(jj)
+  lang=ja.get('languages','NA')
+  # Segment number: 2nd '.'-separated field of the 4th '/'-separated step
+  seg=int(ja['filename'].split('/')[3].split('.')[1])
+  segs[seg][lang]+=1
+
+# Open (so truncate) outfilename only after all input parsed without error
+with open(fn,'bw') as f:
+  pickle.dump(segs,f)
+