comparison twt.py @ 69:157f012ffab7 default tip

from local
author Henry S Thompson <ht@inf.ed.ac.uk>
date Fri, 17 Jan 2025 15:45:26 +0000
parents
children
comparison
equal deleted inserted replaced
68:eb91fd5d49b3 69:157f012ffab7
1 from twitter.twitter import *
2 import cld2full
3
# Sentences from one file of the xtwc corpus; each sentence is consumed
# below as an iterable of string tokens.
tt = xtwc.sents('20100128.txt')

# Build the working corpus: for every sentence keep only the purely
# alphabetic tokens (lower-cased), discard sentences left with fewer than
# five of them, and store the rest as space-joined UTF-8 byte strings.
corp = []
for sent in tt:
    words = [tok.lower() for tok in sent if tok.isalpha()]
    if len(words) < 5:
        continue
    corp.append(' '.join(words).encode('utf8'))
# Bare expression: the count is only echoed in an interactive session.
len(corp)
12
# Run the language detector over every corpus entry and keep the
# (detection, text) pairs whose detection is flagged as reliable.
rcorp = [
    pair
    for pair in zip(map(cld2full.detect, corp), corp)
    if pair[0].is_reliable
]
# Bare expression: the count is only echoed in an interactive session.
len(rcorp)
15
# Reliable pairs whose first detail entry carries the language code 'en'
# (presumably the detector's top-ranked guess -- confirm against cld2full).
mecorp = list(
    filter(lambda pair: pair[0].details[0].language_code == 'en', rcorp)
)
# Bare expression: the count is only echoed in an interactive session.
len(mecorp)
18
# Narrow mecorp to the pairs whose second detail entry has the code 'un'
# (presumably the detector's "unknown" placeholder, i.e. no second
# language was found -- confirm against cld2full).
eecorp = []
for pair in mecorp:
    second_entry = pair[0].details[1]
    if second_entry.language_code == 'un':
        eecorp.append(pair)
# Bare expression: the count is only echoed in an interactive session.
len(eecorp)
21
# Reliable pairs in which none of the first three detail entries has the
# language code 'en'.  The entries are checked in order 0, 1, 2 and the
# check stops at the first 'en' hit.
necorp = [
    pair
    for pair in rcorp
    if not any(
        pair[0].details[slot].language_code == 'en' for slot in (0, 1, 2)
    )
]
# Bare expression: the count is only echoed in an interactive session.
len(necorp)
24