python: classify_tweets.py comparison

comparison classify_tweets.py @ 69:157f012ffab7 default tip

from local

author	Henry S Thompson <ht@inf.ed.ac.uk>
date	Fri, 17 Jan 2025 15:45:26 +0000
parents
children

comparison

equal deleted inserted replaced

-:eb91fd5d49b3
+:157f012ffab7
+from collections import defaultdict, Counter
+from typing import Tuple, List, Any, Set, Dict, Callable
+import numpy as np  # for np.mean() and np.std()
+import nltk, sys, inspect
+import nltk.corpus.util
+from nltk import MaxentClassifier
+from nltk.corpus import brown, ppattach  # import corpora
+# Import LgramModel
+from nltk_model import *
+# Import the Twitter corpus
+from twitter.twitter import *
+twitter_file_ids = "20100128.txt"  # the single Twitter corpus file this script expects to be available
+assert twitter_file_ids in xtwc.fileids()  # fail at import time if it is missing; xtwc presumably comes from the twitter.twitter star import -- TODO confirm
def train_LM(corpus: nltk.corpus.CorpusReader) -> LgramModel:
    """
    Train a letter-level bigram model on the alphabetic words of a corpus.

    Only tokens for which ``isalpha()`` holds are used; each one is
    lower-cased after that check, so the filter sees the original casing.

    :param corpus: An NLTK corpus whose ``words()`` supply the training tokens
    :return: The LgramModel of order 2 built from those tokens
    """
    training_tokens = []
    for word in corpus.words():
        # Test on the raw token first, normalise the case afterwards.
        if word.isalpha():
            training_tokens.append(word.lower())

    order = 2  # bigram model over letters
    # The two positional True flags presumably request left and right
    # padding (the original notes call this a "padded" model) -- confirm
    # against the LgramModel signature.
    return LgramModel(order, training_tokens, True, True)
+lm = train_LM(brown)  # module-level model built once from the Brown corpus; is_English() reads this global
def rtc(x):
    """
    Squeeze runs of a repeated character down to at most two copies.

    For example ``"loooool"`` becomes ``"lool"``; strings without a
    character repeated three or more times in a row come back unchanged.

    :param x: the string (iterable of characters) to squeeze
    :return: a new string built from the characters that were kept
    """
    kept = []
    previous = None
    repeats = 0  # how many times `previous` has been repeated so far
    for ch in x:
        repeats = repeats + 1 if ch == previous else 0
        previous = ch
        # First occurrence (0) and first repeat (1) are kept, the rest dropped.
        if repeats <= 1:
            kept.append(ch)
    return "".join(kept)
+import re
# Exact tokens that are dropped before scoring.
_SLANG_TOKENS = frozenset({"rt", "smh", "hbu", "idk", "afaik", "imho", "irl"})
# Elongated variants ("looool", "awww", ...) that are dropped as well.  The
# alternatives are joined and compiled once; a token is dropped when one
# alternative matches the *whole* token.
_SLANG_PATTERNS = ("lo+l+", "a+w+", "ya+y+", "o+m+g+", "lma+o+")
_SLANG_REGEX = re.compile("|".join(_SLANG_PATTERNS))


def scoring_f(bigram_model: "LgramModel", tweet: List[str]) -> float:
    """
    Score a tweet with the letter bigram model; lower means more model-like.

    Each token is first squeezed with ``rtc`` (runs of a repeated character
    are cut to two), then tokens that are in the slang blacklist or fully
    match one of the slang patterns are discarded.  The score is the mean of
    ``bigram_model.entropy(token, perItem=True)`` over the remaining tokens.

    Note that this returns the score, not a decision; ``is_English`` compares
    it against a threshold.

    :param bigram_model: the bigram letter model trained on the Brown corpus
    :param tweet: the tweet, as a list of tokens
    :return: the mean per-token entropy, or ``float("inf")`` when no token is
        left to score (empty tweet, or every token was filtered out), so that
        such tweets never fall below a finite threshold
    """
    scorable = []
    for token in tweet:
        token = rtc(token)
        if token in _SLANG_TOKENS or _SLANG_REGEX.fullmatch(token):
            continue
        scorable.append(token)

    # Previously this divided by zero when nothing was left to score.
    if not scorable:
        return float("inf")

    total = sum(bigram_model.entropy(token, perItem=True) for token in scorable)
    return total / len(scorable)
def boring_scoring(bigram_model, tweet: List[str]) -> float:
    """
    Baseline score: mean model entropy over the tweet's tokens, unfiltered.

    Unlike ``scoring_f`` no token is squeezed or discarded; every token is
    passed to ``bigram_model.entropy(token, perItem=True)`` as it is.

    :param bigram_model: a model exposing ``entropy(token, perItem=True)``
    :param tweet: the tweet, as a list of tokens
    :return: the mean per-token entropy, or ``float("inf")`` for an empty
        tweet (previously a ZeroDivisionError)
    """
    if not tweet:
        return float("inf")
    total = sum(bigram_model.entropy(token, perItem=True) for token in tweet)
    return total / len(tweet)
def is_English(tweet, thresh=None):
    """
    Decide whether a tweet is English by thresholding its ``scoring_f`` score.

    The tweet is scored with the module-level model ``lm`` and is called
    English when the score is strictly below the threshold.

    :param tweet: the tweet, as a list of tokens
    :param thresh: score cut-off; ``None`` selects the tuned default of 3.85
    :return: True if the tweet is classified as English, False otherwise
    """
    # Tuning notes carried over from earlier experiments with the unfiltered
    # boring_scoring baseline:
    #   score <= 3.85
    #   score <= 3.31   (lower threshold needed to get dev data right)
    #   score <  6.182  (upper threshold needed to get dev data right)
    default_thresh = 3.85  # well-tuned threshold for scoring_f
    cutoff = default_thresh if thresh is None else thresh
    return scoring_f(lm, tweet) < cutoff
def get_tweets(fn: str, n: int):
    """
    Lazily read tweets from a text file, one tweet per line.

    Each line is split on whitespace, so every yielded tweet is a list of
    token strings (an empty list for a blank line).  No label is attached;
    the caller knows from the file which class the tweets belong to.

    The file is opened with the platform default encoding, as before.

    :param fn: path of the file to read
    :param n: maximum number of tweets to yield; zero or a negative value
        means "read the whole file"
    :return: a generator of token lists
    """
    # The with-block closes the file even when the consumer stops iterating
    # early or an exception interrupts the generator.
    with open(fn) as tweet_file:
        for count, line in enumerate(tweet_file):
            if n > 0 and count == n:
                break
            yield line.split()
def eval(n1=0, n2=0, thresh=None):
    """
    Run the classifier over the labelled tweet files and print a summary.

    Tweets from "twitter/etweets.txt" are treated as the positive (English)
    class and tweets from "twitter/netweets.txt" as the negative class.  Each
    one is passed to ``is_English``; a right/wrong/total table per class and
    the overall accuracy are printed.  Nothing is returned.

    NOTE: the name shadows the builtin ``eval``; it is kept because callers
    use it.

    :param n1: number of positive tweets to test; 0 (or less) means all
    :param n2: number of negative tweets to test; 0 (or less) means all
    :param thresh: threshold forwarded to ``is_English``; None for its default
    """
    # Local names avoid `np`, which is this module's alias for numpy.
    true_pos = false_neg = 0
    for tweet in get_tweets("twitter/etweets.txt", n1):
        if is_English(tweet, thresh):
            true_pos += 1
        else:
            false_neg += 1  # English tweet rejected

    true_neg = false_pos = 0
    for tweet in get_tweets("twitter/netweets.txt", n2):
        if is_English(tweet, thresh):
            false_pos += 1  # non-English tweet accepted
        else:
            true_neg += 1

    n_pos = true_pos + false_neg
    n_neg = true_neg + false_pos
    right = true_pos + true_neg
    wrong = false_neg + false_pos
    total = n_pos + n_neg

    # get_tweets reads everything for any n <= 0, so report those as 'all'.
    print("Testing on %s/%s tweets, threshhold %s" % (
        'all' if n1 <= 0 else n1,
        'all' if n2 <= 0 else n2,
        'default' if thresh is None else thresh))
    print("%6s %6s %6s %6s" % ('', 'right', 'wrong', 'total'))
    print("%6s %6d %6d %6d" % ('pos', true_pos, false_neg, n_pos))
    print("%6s %6d %6d %6d" % ('neg', true_neg, false_pos, n_neg))
    print("%6s %6d %6d %6d" % ('tot', right, wrong, total))
    if total:
        print("Accuracy: %g" % (float(right) / float(total)))
    else:
        # Both files were empty: accuracy is undefined rather than an error.
        print("Accuracy: n/a (no tweets)")
+eval(100)  # runs at import time: tests the first 100 tweets of twitter/etweets.txt and all of twitter/netweets.txt

Mercurial > hg > python

comparison classify_tweets.py @ 69:157f012ffab7 default tip