view bin/lang_by_seg.py @ 146:c8e41c543c0b

works for 0--9
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Fri, 22 Oct 2021 12:36:15 +0000
parents 8af4e9937799
children 2643a6825f17
line wrap: on
line source

#!/usr/bin/env python3
'''Replicate part of Jingrui's MSc: tabulate, for a single index
file, the counts of language vs. segment

Borrows from cdx2tsv

Usage: uz cdx... | lang_by_seg.py outfilename'''

import sys, json, pickle
from collections import defaultdict

def tabulate(lines,n_segs=100):
  '''Count index lines per (segment, language) pair.

  Each line is expected to have the form "key timestamp json", where the
  JSON object has a 'filename' entry and, optionally, a 'languages' entry.

  lines  -- iterable of index lines (e.g. sys.stdin)
  n_segs -- number of segment slots to allocate (segments 0..n_segs-1)

  Returns a list of n_segs defaultdict(int), indexed by segment number,
  each mapping the 'languages' value (or 'NA' when that entry is absent)
  to the number of lines seen for it.

  Raises ValueError (incl. json.JSONDecodeError), KeyError or IndexError
  on a malformed line or a segment number outside 0..n_segs-1.
  '''
  segs=[defaultdict(int) for _ in range(n_segs)]

  for l in lines:
    # Split at most twice: the JSON part may itself contain spaces
    (_key,_stamp,jj)=l.rstrip().split(' ',maxsplit=2)
    ja=json.loads(jj)
    # The whole 'languages' value is used as the key, unsplit
    lang=ja.get('languages','NA')
    # Segment number is taken from the 4th '/'-separated component of the
    #  filename, the part after its first '.'
    # (assumes paths like .../segments/<digits>.<seg>/... -- TODO confirm)
    seg=int(ja['filename'].split('/')[3].split('.')[1])
    segs[seg][lang]+=1

  return segs

def main(argv):
  '''Read index lines from stdin, pickle the per-segment language counts
  to the file named by argv[1].'''
  if len(argv)<2:
    # Give a usage message rather than a bare IndexError traceback
    sys.exit('Usage: uz cdx... | lang_by_seg.py outfilename')

  # Open the output first so an unwritable path fails before any input is read
  with open(argv[1],'wb') as f:
    # The pickle holds a list of defaultdicts; use dict(seg) when printing
    pickle.dump(tabulate(sys.stdin),f)

if __name__=='__main__':
  main(sys.argv)