Mercurial > hg > python
diff twt.py @ 69:157f012ffab7 default tip
from local
author | Henry S Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 17 Jan 2025 15:45:26 +0000 |
parents | |
children |
line wrap: on
line diff
"""Explore which sentences of one tweet file CLD2 does or does not tag as English.

Laid out like an interactive session: each bare ``len(...)`` expression shows
a count when typed at a REPL and has no visible effect when run as a script.
"""
from twitter.twitter import *
import cld2full


def _code(pair, rank):
    """Return the language code of detail entry ``rank`` for a (detection, text) pair."""
    return pair[0].details[rank].language_code


# ``xtwc`` is not defined in this file; presumably it is supplied by the star
# import from twitter.twitter -- confirm.
tt = xtwc.sents('20100128.txt')

# For every sentence keep only the tokens for which isalpha() is true,
# lower-cased; drop sentences left with fewer than five such tokens; store the
# survivors as space-joined, UTF-8 encoded strings.
alpha_sents = ([w.lower() for w in sent if w.isalpha()] for sent in tt)
corp = [' '.join(words).encode('utf8')
        for words in alpha_sents
        if len(words) >= 5]
len(corp)

# Pair each text with its detection result, keeping only the pairs whose
# result is flagged as reliable.
rcorp = []
for text in corp:
    detection = cld2full.detect(text)
    if detection.is_reliable:
        rcorp.append((detection, text))
len(rcorp)

# Reliable results whose first detail entry is 'en'.
mecorp = [pair for pair in rcorp if _code(pair, 0) == 'en']
len(mecorp)

# Of those, the ones whose second detail entry is 'un'
# (presumably CLD2's "unknown" code, i.e. no second language -- confirm).
eecorp = [pair for pair in mecorp if _code(pair, 1) == 'un']
len(eecorp)

# Reliable results where none of the first three detail entries is 'en'.
# Entries 0-2 are indexed directly, so this assumes every result carries at
# least three detail entries.
necorp = [pair for pair in rcorp
          if all(_code(pair, rank) != 'en' for rank in range(3))]
len(necorp)