Mercurial > hg > python
comparison classify_tweets.py @ 69:157f012ffab7 default tip
from local
author | Henry S Thompson <ht@inf.ed.ac.uk> |
---|---|
date | Fri, 17 Jan 2025 15:45:26 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
68:eb91fd5d49b3 | 69:157f012ffab7 |
---|---|
1 from collections import defaultdict, Counter | |
2 from typing import Tuple, List, Any, Set, Dict, Callable | |
3 | |
4 import numpy as np # for np.mean() and np.std() | |
5 import nltk, sys, inspect | |
6 import nltk.corpus.util | |
7 from nltk import MaxentClassifier | |
8 from nltk.corpus import brown, ppattach # import corpora | |
9 | |
10 # Import LgramModel | |
11 from nltk_model import * | |
12 | |
13 # Import the Twitter corpus | |
14 from twitter.twitter import * | |
15 | |
# Restrict attention to a single day's tweet file and fail fast if the
# Twitter corpus reader (xtwc, imported from twitter.twitter) can't see it.
twitter_file_ids = "20100128.txt"
assert twitter_file_ids in xtwc.fileids()
18 | |
19 | |
def train_LM(corpus: nltk.corpus.CorpusReader) -> LgramModel:
    """
    Train a padded bigram letter language model (LgramModel) on the
    lower-cased, all-alphabetic subset of the given corpus.

    :param corpus: An NLTK corpus
    :return: A padded letter bigram model based on nltk.model.NgramModel,
        smoothed with the default estimator
    """
    # Keep only purely alphabetic tokens, lower-casing each one
    # _after_ the isalpha() check.
    tokens = []
    for word in corpus.words():
        if word.isalpha():
            tokens.append(word.lower())
    # Order 2 (letter bigrams), padded at both ends.
    return LgramModel(2, tokens, True, True)
37 | |
38 | |
39 lm = train_LM(brown) | |
40 | |
def rtc(x):
    """
    Collapse runs of a repeated character in x to at most two,
    e.g. "looool" -> "lool".  Used to normalise exaggerated tweet
    spellings before entropy scoring.

    :param x: input string
    :return: x with every run of 3+ identical characters truncated to 2
    """
    out = []
    run_len = 0        # position within the current run (0 = first char of run)
    last_char = None
    for c in x:        # fix: the enumerate() index in the original was unused
        if c == last_char:
            run_len += 1
        else:
            run_len = 0
        if run_len < 2:  # keep only the first two characters of each run
            out.append(c)
        last_char = c
    return "".join(out)
54 | |
55 | |
56 import re | |
def scoring_f(bigram_model: LgramModel, tweet: List[str]) -> float:
    """
    Score a tweet by its average per-letter entropy under the bigram
    letter model (lower = more English-like).

    Each token is first normalised with rtc() (character runs collapsed
    to length 2); tokens matching a blacklist of common tweetisms
    (exact strings and regex patterns) are then discarded.

    NOTE: despite the original docstring, this returns a *score*, not a
    boolean -- is_English() applies the threshold.

    :param bigram_model: the bigram letter model trained on the Brown corpus
    :param tweet: the tokenised tweet (list of strings)
    :return: average per-letter entropy of the surviving tokens;
        float('inf') if no tokens survive filtering, so such tweets are
        classified non-English instead of raising ZeroDivisionError
    """
    blacklist = {"rt", "smh", "hbu", "idk", "afaik", "imho", "irl"}
    blacklist_regex = ["lo+l+", "a+w+", "ya+y+", "o+m+g+", "lma+o+"]
    prepro = []
    for token in tweet:
        token = rtc(token)
        if token in blacklist:
            continue
        if any(re.fullmatch(regex, token) for regex in blacklist_regex):
            continue
        prepro.append(token)
    if not prepro:
        # fix: the original divided by len(tweet)==0 here and crashed
        return float('inf')
    return sum(bigram_model.entropy(token, perItem=True) for token in prepro) / len(prepro)
78 | |
def boring_scoring(bigram_model, tweet):
    """Mean per-letter entropy of the tweet's tokens under the model
    (no preprocessing -- the 'boring' baseline scorer)."""
    per_token = (bigram_model.entropy(tok, perItem=True) for tok in tweet)
    return sum(per_token) / len(tweet)
81 | |
def is_English(tweet, thresh=None):
    """
    Decide whether a tweet is English by thresholding its average
    per-letter entropy under the Brown-trained letter bigram model.

    Dev-data bounds explored earlier: boring_scoring needed <= 3.31
    (lower) / < 6.182 (upper); 3.85 is the well-tuned default for the
    preprocessing scorer scoring_f.

    :param tweet: tokenised tweet (list of strings)
    :param thresh: entropy threshold; None selects the tuned default 3.85
    :return: True iff the tweet's score is below the threshold
    """
    # fix: `thresh == None` -> `thresh is None` (PEP 8 E711)
    return scoring_f(lm, tweet) < (3.85 if thresh is None else thresh)
87 | |
def get_tweets(fn: str, n: int):
    """
    Yield whitespace-tokenised tweets from file fn, one list of strings
    per line.

    :param fn: path to a file with one tweet per line
    :param n: maximum number of tweets to yield; 0 (or negative) = all
    :rtype: generator(list(str))
    """
    # fix: the original opened the file without a context manager, so the
    # handle leaked whenever the caller abandoned the generator early
    # (the trailing f.close() is never reached in that case).
    with open(fn) as f:
        for i, line in enumerate(f):
            if n > 0 and i == n:
                break
            yield line.split()
102 | |
def eval(n1=0, n2=0, thresh=None):
    """
    Evaluate is_English() on the hand-labelled tweet files and print a
    small confusion matrix plus accuracy.

    NOTE: this shadows the builtin eval(); the name is kept for
    backward compatibility with existing callers.

    :param n1: number of English tweets to test (0 = all)
    :param n2: number of non-English tweets to test (0 = all)
    :param thresh: threshold passed through to is_English (None = default)
    """
    # fix: the original counted misclassified English tweets in `fp` and
    # misclassified non-English tweets in `fn` -- the labels were swapped.
    # The printed numbers were nevertheless correct (each row is
    # right/wrong/total for its own class); only the local names change.
    tp = fn = fp = tn = 0
    n_pos = n_neg = 0  # renamed from np/nn: `np` shadowed the numpy import
    for tweet in get_tweets("twitter/etweets.txt", n1):
        n_pos += 1
        if is_English(tweet, thresh):
            tp += 1
        else:
            fn += 1  # English tweet judged non-English
    for tweet in get_tweets("twitter/netweets.txt", n2):
        n_neg += 1
        if is_English(tweet, thresh):
            fp += 1  # non-English tweet judged English
        else:
            tn += 1
    # fix: "threshhold" -> "threshold" in the report header
    print("Testing on %s/%s tweets, threshold %s" % ('all' if n1 == 0 else n1,
                                                     'all' if n2 == 0 else n2,
                                                     'default' if thresh is None else thresh))
    print("%6s %6s %6s %6s" % ('', 'right', 'wrong', 'total'))
    print("%6s %6d %6d %6d" % ('pos', tp, fn, n_pos))
    print("%6s %6d %6d %6d" % ('neg', tn, fp, n_neg))
    print("%6s %6d %6d %6d" % ('tot', tp + tn, fn + fp, n_pos + n_neg))
    print("Accuracy: %g" % (float(tp + tn) / float(n_pos + n_neg)))
126 | |
127 eval(100) | |
128 | |
129 | |
130 |