# classify_tweets.py @ 69:157f012ffab7 (default tip)
# author: Henry S Thompson <ht@inf.ed.ac.uk>
# date:   Fri, 17 Jan 2025 15:45:26 +0000
import re
from typing import List, Optional

import nltk
from nltk.corpus import brown

# Import LgramModel
from nltk_model import *

# Import the Twitter corpus
from twitter.twitter import *

twitter_file_ids = "20100128.txt"
assert twitter_file_ids in xtwc.fileids()


def train_LM(corpus: nltk.corpus.CorpusReader) -> LgramModel:
    """
    Build a bigram letter language model using LgramModel
    based on the lower-cased all-alpha subset of the entire corpus.

    :param corpus: an NLTK corpus
    :return: a padded letter bigram model based on nltk.model.NgramModel
    """
    # Subset the corpus to only the all-alpha tokens,
    # converted to lower-case (_after_ the all-alpha check)
    corpus_tokens = [word.lower() for word in corpus.words() if word.isalpha()]
    # Return a smoothed (using the default estimator), padded
    # bigram letter language model over those tokens
    return LgramModel(2, corpus_tokens, True, True)


lm = train_LM(brown)


def rtc(x: str) -> str:
    """Collapse any run of three or more repeated characters down to
    two, e.g. 'looool' -> 'lool'."""
    counter = 0
    last_char = None
    out = []
    for c in x:
        if c == last_char:
            counter += 1
        else:
            counter = 0
        if counter < 2:
            out.append(c)
        last_char = c
    return "".join(out)


def scoring_f(bigram_model: LgramModel, tweet: List[str]) -> float:
    """
    Score a tweet by its average per-letter entropy under the bigram
    letter model, after filtering out common non-English 'noise' tokens.

    :param bigram_model: the bigram letter model trained on the Brown corpus
    :param tweet: the tweet, as a list of tokens
    :return: the mean per-letter cross-entropy of the surviving tokens
    """
    blacklist = {"rt", "smh", "hbu", "idk", "afaik", "imho", "irl"}
    blacklist_regex = ["lo+l+", "a+w+", "ya+y+", "o+m+g+", "lma+o+"]
    prepro = []
    for token in tweet:
        token = rtc(token)
        if token in blacklist:
            continue
        elif any(re.fullmatch(regex, token) for regex in blacklist_regex):
            continue
        else:
            prepro.append(token)
    tweet = prepro
    if not tweet:
        # Everything was filtered out: treat as maximally surprising
        # rather than dividing by zero
        return float("inf")
    return sum(bigram_model.entropy(token, perItem=True)
               for token in tweet) / len(tweet)


def boring_scoring(bigram_model: LgramModel, tweet: List[str]) -> float:
    """Baseline scorer: average per-letter entropy, no preprocessing."""
    return sum(bigram_model.entropy(token, perItem=True)
               for token in tweet) / len(tweet)


def is_English(tweet: List[str], thresh: Optional[float] = None) -> bool:
    # return boring_scoring(lm, tweet) <= 3.85
    # return boring_scoring(lm, tweet) <= 3.31   # lower threshold needed to get dev data right
    # return boring_scoring(lm, tweet) < 6.182   # upper threshold needed to get dev data right
    return scoring_f(lm, tweet) < (3.85 if thresh is None else thresh)  # well-tuned threshold


def get_tweets(fn: str, n: int):
    """
    Yield up to n tweets from the file fn (all of them if n == 0),
    each tokenised by whitespace into a list of strings.
    """
    with open(fn) as f:
        for i, l in enumerate(f):
            if n > 0 and i == n:
                break
            yield l.split()


def evaluate(n1: int = 0, n2: int = 0, thresh: Optional[float] = None) -> None:
    """Score the classifier on the English (etweets) and non-English
    (netweets) test files and print a small confusion table."""
    tp = fn = tn = fp = 0
    n_pos = n_neg = 0
    for tweet in get_tweets("twitter/etweets.txt", n1):
        n_pos += 1
        if is_English(tweet, thresh):
            tp += 1   # English tweet accepted
        else:
            fn += 1   # English tweet rejected
    for tweet in get_tweets("twitter/netweets.txt", n2):
        n_neg += 1
        if is_English(tweet, thresh):
            fp += 1   # non-English tweet accepted
        else:
            tn += 1   # non-English tweet rejected
    print("Testing on %s/%s tweets, threshold %s" % ('all' if n1 == 0 else n1,
                                                     'all' if n2 == 0 else n2,
                                                     'default' if thresh is None else thresh))
    print("%6s %6s %6s %6s" % ('', 'right', 'wrong', 'total'))
    print("%6s %6d %6d %6d" % ('pos', tp, fn, n_pos))
    print("%6s %6d %6d %6d" % ('neg', tn, fp, n_neg))
    print("%6s %6d %6d %6d" % ('tot', tp + tn, fn + fp, n_pos + n_neg))
    print("Accuracy: %g" % ((tp + tn) / (n_pos + n_neg)))


evaluate(100)
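

# A hypothetical helper, sketched here to show how sensitive the result
# is to the threshold: the values 3.31, 3.85 and 6.182 come from the
# commented-out lines in is_English(); 4.5 is an arbitrary midpoint, and
# the sample sizes are illustrative.
def sweep_thresholds(thresholds=(3.31, 3.85, 4.5, 6.182),
                     n1: int = 100, n2: int = 100) -> None:
    """Run evaluate() at each candidate threshold so the accuracy
    trade-off around the well-tuned 3.85 value is visible."""
    for t in thresholds:
        evaluate(n1, n2, t)

# sweep_thresholds()  # uncomment to compare thresholds on 100+100 tweets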