annotate classify_tweets.py @ 69:157f012ffab7 default tip

from local
author Henry S Thompson <ht@inf.ed.ac.uk>
date Fri, 17 Jan 2025 15:45:26 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
69
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
1 from collections import defaultdict, Counter
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
2 from typing import Tuple, List, Any, Set, Dict, Callable
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
3
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
4 import numpy as np # for np.mean() and np.std()
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
5 import nltk, sys, inspect
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
6 import nltk.corpus.util
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
7 from nltk import MaxentClassifier
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
8 from nltk.corpus import brown, ppattach # import corpora
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
9
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
10 # Import LgramModel
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
11 from nltk_model import *
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
12
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
13 # Import the Twitter corpus
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
14 from twitter.twitter import *
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
15
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# File id of the tweet sample this script expects to be available.
# NOTE(review): xtwc is presumably the Twitter corpus reader brought in by
# "from twitter.twitter import *" -- confirm against that module.
twitter_file_ids = "20100128.txt"
assert twitter_file_ids in xtwc.fileids()
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
18
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
19
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def train_LM(corpus: nltk.corpus.CorpusReader) -> LgramModel:
    """
    Build a bigram letter language model using LgramModel
    based on the lower-cased all-alpha subset of the entire corpus

    :param corpus: An NLTK corpus

    :return: A padded letter bigram model based on nltk.model.NgramModel
    """
    # Subset the corpus to all-alpha tokens only; the lower-casing is
    # applied _after_ the all-alpha check.
    corpus_tokens = [word.lower() for word in corpus.words() if word.isalpha()]

    # Only the model is returned (not the tokens). It is an order-2 model
    # and no estimator is passed, so LgramModel's default is used.
    # NOTE(review): the two positional True flags are presumably
    # pad_left / pad_right -- confirm against LgramModel's signature.
    return LgramModel(2, corpus_tokens, True, True)
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
37
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
38
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Module-level letter bigram model, trained once at import time on Brown.
lm = train_LM(brown)
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
40
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def rtc(x):
    """
    Truncate every run of three or more identical characters to two.

    :param x: the string (e.g. a tweet token) to normalise
    :return: x with each run of a repeated character cut down to at most
        two occurrences, e.g. "loooool" -> "lool"
    """
    kept = []
    repeats = 0  # how many times the current character has repeated so far
    last_char = None
    for c in x:
        repeats = repeats + 1 if c == last_char else 0
        # repeats is 0 for the first occurrence and 1 for the second;
        # anything beyond that is a third-or-later copy and is dropped.
        if repeats < 2:
            kept.append(c)
        last_char = c
    return "".join(kept)
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
54
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
55
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
56 import re
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Tokens that are dropped before scoring: fixed chat abbreviations ...
_SLANG_TOKENS = frozenset({"rt", "smh", "hbu", "idk", "afaik", "imho", "irl"})
# ... and stretchable interjections ("lol", "aww", "yay", "omg", "lmao").
# A single compiled alternation used with fullmatch() accepts exactly the
# tokens that fully match any one of the individual patterns.
_SLANG_PATTERN = re.compile(r"lo+l+|a+w+|ya+y+|o+m+g+|lma+o+")


def scoring_f(bigram_model: LgramModel, tweet: List[str]) -> float:
    """
    Score a tweet by the mean per-item entropy of its tokens under the
    given letter model. The score itself is returned; the English /
    non-English decision is made by the caller comparing it to a threshold.

    Each token first has runs of repeated characters shortened by rtc();
    tokens that are then in the slang blacklist (or fully match one of the
    slang patterns) are left out of the average.

    :param bigram_model: the bigram letter model trained on the Brown corpus
    :param tweet: the tweet, as a list of tokens
    :return: the mean of bigram_model.entropy(token, perItem=True) over the
        remaining tokens, or float("inf") when no token remains (empty
        tweet or everything filtered) so that the tweet never falls below
        a finite threshold instead of raising ZeroDivisionError
    """
    scored_tokens = []
    for token in tweet:
        token = rtc(token)
        if token in _SLANG_TOKENS or _SLANG_PATTERN.fullmatch(token):
            continue
        scored_tokens.append(token)

    if not scored_tokens:
        return float("inf")

    total = sum(bigram_model.entropy(token, perItem=True)
                for token in scored_tokens)
    return total / len(scored_tokens)
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
78
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def boring_scoring(bigram_model, tweet):
    """
    Baseline score: mean per-item entropy of the tweet's tokens, with no
    token normalisation or filtering.

    :param bigram_model: object providing entropy(token, perItem=True)
    :param tweet: the tweet, as a list of tokens
    :return: the mean entropy over all tokens, or float("inf") for an
        empty tweet (instead of raising ZeroDivisionError)
    """
    if not tweet:
        return float("inf")
    total = sum(bigram_model.entropy(token, perItem=True) for token in tweet)
    return total / len(tweet)
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
81
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Default entropy cutoff, described in the original code as well-tuned.
_DEFAULT_ENGLISH_THRESHOLD = 3.85


def is_English(tweet, thresh=None):
    """
    Decide whether a tweet is English by comparing its scoring_f() score
    under the module-level model lm against a threshold.

    Tuning history recorded in the original code, all for boring_scoring:
    a cutoff of <= 3.85; <= 3.31 ("lower threshold needed to get dev data
    right"); < 6.182 ("upper threshold needed to get dev data right").

    :param tweet: the tweet, as a list of tokens
    :param thresh: score cutoff; None selects the default of 3.85
    :return: True if the score is strictly below the cutoff
    """
    cutoff = _DEFAULT_ENGLISH_THRESHOLD if thresh is None else thresh
    return scoring_f(lm, tweet) < cutoff
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
87
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def get_tweets(fn: str, n):
    """
    Lazily read tweets from a text file, one tweet per line.

    :param fn: path of the file to read
    :param n: maximum number of tweets to yield; a value <= 0 means
        "no limit" (read the whole file)
    :return: a generator yielding each line as a list of its
        whitespace-separated tokens (a blank line yields an empty list)
    """
    # "with" guarantees the file is closed even when the consumer stops
    # iterating early and the generator is closed or garbage-collected.
    with open(fn) as f:
        for i, line in enumerate(f):
            if n > 0 and i == n:
                break
            yield line.split()
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
102
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
def eval(n1=0, n2=0, thresh=None):
    """
    Run is_English() over the English and non-English tweet files and
    print a right/wrong table plus the overall accuracy.

    NOTE: this name shadows the builtin eval() for the rest of the module;
    it is kept unchanged because existing callers use it.

    :param n1: number of tweets to read from "twitter/etweets.txt"
        (0 = all)
    :param n2: number of tweets to read from "twitter/netweets.txt"
        (0 = all)
    :param thresh: threshold passed through to is_English(); None selects
        its default
    :return: None; results are printed
    """
    # English tweets: "right" means classified as English.
    english_right = english_wrong = 0
    for tweet in get_tweets("twitter/etweets.txt", n1):
        if is_English(tweet, thresh):
            english_right += 1
        else:
            english_wrong += 1

    # Non-English tweets: "right" means classified as not English.
    other_right = other_wrong = 0
    for tweet in get_tweets("twitter/netweets.txt", n2):
        if is_English(tweet, thresh):
            other_wrong += 1
        else:
            other_right += 1

    english_total = english_right + english_wrong
    other_total = other_right + other_wrong
    right = english_right + other_right
    wrong = english_wrong + other_wrong
    total = english_total + other_total

    print("Testing on %s/%s tweets, threshhold %s" % (
        'all' if n1 == 0 else n1,
        'all' if n2 == 0 else n2,
        'default' if thresh is None else thresh))
    print("%6s %6s %6s %6s" % ('', 'right', 'wrong', 'total'))
    print("%6s %6d %6d %6d" % ('pos', english_right, english_wrong, english_total))
    print("%6s %6d %6d %6d" % ('neg', other_right, other_wrong, other_total))
    print("%6s %6d %6d %6d" % ('tot', right, wrong, total))
    # With no tweets read, report nan rather than raising ZeroDivisionError.
    accuracy = float(right) / float(total) if total else float("nan")
    print("Accuracy: %g" % accuracy)
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
126
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
# Runs at import time: the first 100 English tweets, all non-English tweets.
eval(100)
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
128
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
129
157f012ffab7 from local
Henry S Thompson <ht@inf.ed.ac.uk>
parents:
diff changeset
130