69
|
1 from twitter.twitter import *
|
|
2 import cld2full
|
|
3
|
|
# Read the sentences of one corpus file through the `xtwc` reader
# (presumably each item is a sequence of string tokens -- confirm against
# the reader's definition in twitter.twitter).
tt = xtwc.sents('20100128.txt')

# Build the working corpus: for every sentence keep only the purely
# alphabetic tokens (lower-cased), discard sentences with fewer than five
# such tokens, and store the remainder as one space-joined, UTF-8 encoded
# string.
corp = []
for t in tt:
    at = [w.lower() for w in t if w.isalpha()]
    if len(at) < 5:
        continue
    corp.append(' '.join(at).encode('utf8'))
len(corp)

# Run language detection on every corpus entry and keep the
# (detection, text) pairs whose detection is flagged as reliable.
rcorp = []
for t in corp:
    r = (cld2full.detect(t), t)
    if r[0].is_reliable:
        rcorp.append(r)
len(rcorp)

# "Mostly English": reliable entries whose first reported language is 'en'.
mecorp = []
for r in rcorp:
    if r[0].details[0].language_code == 'en':
        mecorp.append(r)
len(mecorp)

# "English only": mostly-English entries whose second reported language is
# 'un' (presumably CLD2's code for "unknown", i.e. no second language was
# found -- confirm against the cld2 documentation).
eecorp = []
for r in mecorp:
    if r[0].details[1].language_code == 'un':
        eecorp.append(r)
len(eecorp)

# "No English": reliable entries where none of the first three reported
# languages is 'en'.  The checks run in the order first, second, third and
# stop at the first match.
necorp = []
for r in rcorp:
    details = r[0].details
    if not (details[0].language_code == 'en'
            or details[1].language_code == 'en'
            or details[2].language_code == 'en'):
        necorp.append(r)
len(necorp)
|
|
24
|