Mercurial > hg > python
comparison twt.py @ 69:157f012ffab7 default tip
from local
author | Henry S Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 17 Jan 2025 15:45:26 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
68:eb91fd5d49b3 | 69:157f012ffab7 |
---|---|
from twitter.twitter import *
import cld2full

# Load the sentences of one tweet file; `xtwc` is supplied by the star import
# above (presumably a corpus reader -- confirm against twitter.twitter).
tt = xtwc.sents('20100128.txt')
corp = []

# Build the working corpus: keep only purely alphabetic tokens, lowercased,
# and skip any sentence that has fewer than five of them.  Each survivor is
# stored as a single space-joined, UTF-8 encoded string.
for t in tt:
    at = [tok.lower() for tok in t if tok.isalpha()]
    if len(at) < 5:
        continue
    corp.append(' '.join(at).encode('utf8'))
len(corp)

# Run language detection on every corpus entry and keep (detection, text)
# pairs only where the detector flags its own result as reliable.
rcorp = [
    pair
    for pair in ((cld2full.detect(text), text) for text in corp)
    if pair[0].is_reliable
]
len(rcorp)

# Reliable entries whose first detail is English.
mecorp = [
    pair for pair in rcorp
    if pair[0].details[0].language_code == 'en'
]
len(mecorp)

# Of those, entries whose second detail has code 'un'
# (presumably the detector's "unknown" placeholder -- TODO confirm).
eecorp = [
    pair for pair in mecorp
    if pair[0].details[1].language_code == 'un'
]
len(eecorp)

# Reliable entries where none of the first three details is English.
necorp = [
    pair for pair in rcorp
    if all(pair[0].details[i].language_code != 'en' for i in range(3))
]
len(necorp)
