diff twt.py @ 69:157f012ffab7 default tip

from local
author Henry S Thompson <ht@inf.ed.ac.uk>
date Fri, 17 Jan 2025 15:45:26 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/twt.py	Fri Jan 17 15:45:26 2025 +0000
@@ -0,0 +1,24 @@
+from twitter.twitter import *
+import cld2full
+
+tt=xtwc.sents('20100128.txt')  # xtwc is not defined here; presumably supplied by the star import -- confirm
+corp=[]
+# Build corp: per item of tt keep only alphabetic tokens, lower-cased; skip items with fewer than 5 such tokens.
+for t in tt:
+ at=[w.lower() for w in t if w.isalpha()]
+ if len(at) >= 5:
+  corp.append((' '.join(at)).encode('utf8'))  # stored space-joined and UTF-8 encoded
+len(corp)  # bare expression: shows the count only when evaluated interactively
+# Pair every text with its cld2full.detect() result as (result, text); keep pairs whose result has is_reliable set.
+rcorp=[r for r in ((cld2full.detect(t),t) for t in corp) if r[0].is_reliable]
+len(rcorp)
+# Subset of rcorp whose first details entry has language code 'en'.
+mecorp=[r for r in rcorp if r[0].details[0].language_code=='en']
+len(mecorp)
+# Subset of mecorp whose second details entry has code 'un' (presumably "unknown", i.e. no second language -- confirm).
+eecorp=[r for r in mecorp if r[0].details[1].language_code=='un']
+len(eecorp)
+# Subset of rcorp where none of the first three details entries has code 'en'; assumes details has >= 3 entries.
+necorp=[r for r in rcorp if (r[0].details[0].language_code!='en') and (r[0].details[1].language_code!='en') and (r[0].details[2].language_code!='en')]
+len(necorp)
+