69
|
1 from collections import defaultdict, Counter
|
|
2 from typing import Tuple, List, Any, Set, Dict, Callable
|
|
3
|
|
4 import numpy as np # for np.mean() and np.std()
|
|
5 import nltk, sys, inspect
|
|
6 import nltk.corpus.util
|
|
7 from nltk import MaxentClassifier
|
|
8 from nltk.corpus import brown, ppattach # import corpora
|
|
9
|
|
10 # Import LgramModel
|
|
11 from nltk_model import *
|
|
12
|
|
13 # Import the Twitter corpus
|
|
14 from twitter.twitter import *
|
|
15
|
|
# Single Twitter data file used throughout this script.
twitter_file_ids = "20100128.txt"
# Sanity check: the file must be present in the xtwc Twitter corpus.
# NOTE(review): assert is stripped under `python -O`; fine for a lab script.
assert twitter_file_ids in xtwc.fileids()
|
|
18
|
|
19
|
|
def train_LM(corpus: nltk.corpus.CorpusReader) -> LgramModel:
    """
    Build a bigram letter language model using LgramModel,
    trained on the lower-cased all-alpha subset of the entire corpus.

    :param corpus: An NLTK corpus
    :return: A padded letter bigram model based on nltk.model.NgramModel
    """
    # Keep only purely alphabetic tokens, lower-casing each one
    # (the isalpha check is applied to the original-case form).
    alpha_words = filter(str.isalpha, corpus.words())
    training_tokens = list(map(str.lower, alpha_words))

    # Smoothed (default estimator), padded bigram letter model.
    return LgramModel(2, training_tokens, True, True)
|
|
37
|
|
38
|
|
39 lm = train_LM(brown)
|
|
40
|
|
def rtc(x):
    """
    Reduce runs of three or more identical consecutive characters in x
    to exactly two ("looool" -> "lool"), normalising elongated spellings.

    :param x: input string
    :return: string with each character run truncated to length <= 2
    """
    run_length = 0
    last_char = None
    out = []
    # Iterate characters directly: the original enumerate() index was unused.
    for c in x:
        if c == last_char:
            run_length += 1
        else:
            run_length = 0
        # Keep only the first two characters of any run.
        if run_length < 2:
            out.append(c)
        last_char = c
    return "".join(out)
|
|
54
|
|
55
|
|
56 import re
|
|
def scoring_f(bigram_model: LgramModel, tweet: List[str]) -> float:
    """
    Score a tweet by the average per-letter cross-entropy of its tokens
    under the letter bigram model, after filtering common non-lexical
    Twitter tokens. Lower scores are more English-like; is_English()
    thresholds this value.

    (Fixed: the original annotated/documented a bool return, but the
    function returns the numeric score, not a classification.)

    :param bigram_model: the bigram letter model trained on the Brown corpus
    :param tweet: the tweet, as a list of tokens
    :return: average per-letter entropy over the retained tokens;
        float('inf') if every token was filtered out
    """
    # Abbreviations that carry no useful letter-bigram signal.
    blacklist = {"rt", "smh", "hbu", "idk", "afaik", "imho", "irl"}
    # Elongated interjections, matched after run truncation; compile once
    # per call rather than once per token.
    blacklist_patterns = [re.compile(p)
                          for p in ("lo+l+", "a+w+", "ya+y+", "o+m+g+", "lma+o+")]
    kept = []
    for token in tweet:
        token = rtc(token)  # collapse runs of 3+ identical chars to 2
        if token in blacklist:
            continue
        if any(p.fullmatch(token) for p in blacklist_patterns):
            continue
        kept.append(token)
    if not kept:
        # Avoid ZeroDivisionError when filtering removes every token;
        # +inf means "maximally un-English", so is_English() returns False.
        return float("inf")
    return sum(bigram_model.entropy(t, perItem=True) for t in kept) / len(kept)
|
|
78
|
|
def boring_scoring(bigram_model, tweet):
    """Average per-letter entropy of the tweet's tokens under bigram_model
    (no preprocessing — the 'boring' baseline scorer)."""
    entropies = [bigram_model.entropy(tok, perItem=True) for tok in tweet]
    return sum(entropies) / len(entropies)
|
|
81
|
|
def is_English(tweet, thresh=None):
    """
    Classify a tokenized tweet as English or not by thresholding its
    average per-letter entropy under the Brown-trained letter LM.

    :param tweet: the tweet, as a list of tokens
    :param thresh: entropy threshold; None uses the well-tuned default 3.85
        (earlier candidates from dev-data tuning were 3.31 and 6.182)
    :return: True if the tweet scores below the threshold
    """
    # 'is None' rather than '== None' (identity test for the sentinel).
    threshold = 3.85 if thresh is None else thresh
    return scoring_f(lm, tweet) < threshold
|
|
87
|
|
def get_tweets(fn: str, n=0):
    """
    Yield whitespace-tokenized tweets, one per line of file fn.

    (Fixed: the original docstring claimed (tweet, label) tuples were
    returned; only token lists are yielded. Labels come from which file
    is read.)

    :param fn: path to a file with one tweet per line
    :param n: maximum number of tweets to yield; 0 (or negative) means all
    :rtype: generator(list(str))
    """
    # 'with' guarantees the file is closed even if the consumer abandons
    # the generator early (the original leaked the handle in that case).
    with open(fn) as f:
        for i, line in enumerate(f):
            if n > 0 and i == n:
                break
            yield line.split()
|
|
102
|
|
def eval(n1=0, n2=0, thresh=None):
    """
    Evaluate is_English() on labelled tweet files and print a confusion
    table plus overall accuracy.

    NOTE(review): the name shadows the builtin eval(); kept unchanged so
    existing callers still work.

    :param n1: number of English tweets to test on (0 = all)
    :param n2: number of non-English tweets to test on (0 = all)
    :param thresh: threshold forwarded to is_English (None = its default)
    """
    fp = tp = fn = tn = 0
    # Renamed from np/nn: the originals shadowed the numpy import 'np'.
    n_pos = n_neg = 0
    for tweet in get_tweets("twitter/etweets.txt", n1):
        n_pos += 1
        if is_English(tweet, thresh):
            tp += 1
        else:
            fp += 1  # English tweet judged non-English
    for tweet in get_tweets("twitter/netweets.txt", n2):
        n_neg += 1
        if is_English(tweet, thresh):
            fn += 1  # non-English tweet judged English
        else:
            tn += 1
    # "threshold" typo fixed in the report line.
    print("Testing on %s/%s tweets, threshold %s" % ('all' if n1 == 0 else n1,
                                                     'all' if n2 == 0 else n2,
                                                     'default' if thresh is None else thresh))
    print("%6s %6s %6s %6s" % ('', 'right', 'wrong', 'total'))
    print("%6s %6d %6d %6d" % ('pos', tp, fp, n_pos))
    print("%6s %6d %6d %6d" % ('neg', tn, fn, n_neg))
    print("%6s %6d %6d %6d" % ('tot', tp + tn, fp + fn, n_pos + n_neg))
    print("Accuracy: %g" % (float(tp + tn) / float(n_pos + n_neg)))
|
|
126
|
|
127 eval(100)
|
|
128
|
|
129
|
|
130
|