view classify_tweets.py @ 69:157f012ffab7 default tip

from local
author Henry S Thompson <ht@inf.ed.ac.uk>
date Fri, 17 Jan 2025 15:45:26 +0000
parents
children
line wrap: on
line source

from collections import defaultdict, Counter
from typing import Tuple, List, Any, Set, Dict, Callable

import numpy as np  # for np.mean() and np.std()
import nltk, sys, inspect
import nltk.corpus.util
from nltk import MaxentClassifier
from nltk.corpus import brown, ppattach  # import corpora

# Import LgramModel
from nltk_model import *

# Import the Twitter corpus
from twitter.twitter import *

# The single day of Twitter data this script classifies; the assert
# confirms the file is actually available in the xtwc corpus reader.
twitter_file_ids = "20100128.txt"
assert twitter_file_ids in xtwc.fileids()


def train_LM(corpus: nltk.corpus.CorpusReader) -> LgramModel:
    """
    Build a padded, smoothed bigram letter language model from a corpus.

    Only tokens made up entirely of alphabetic characters are kept,
    and each surviving token is lower-cased after that check.

    :param corpus: An NLTK corpus

    :return: A padded letter bigram model based on nltk.model.NgramModel
    """
    # Keep just the all-alpha tokens, lower-casing each one
    # only after the isalpha() test has passed
    training_tokens = []
    for token in corpus.words():
        if token.isalpha():
            training_tokens.append(token.lower())

    # n=2 (letter bigrams), padded left and right, default estimator
    return LgramModel(2, training_tokens, True, True)


# Module-global letter bigram model, trained on the Brown corpus at
# import time; used by is_English() below via scoring_f().
lm = train_LM(brown)

def rtc(x):
    """
    Cap runs of repeated characters at two: every run longer than two
    identical characters is truncated (e.g. "looool" -> "lool").

    :param x: input string
    :return: x with each character run limited to length 2
    """
    run_length = 0          # how many times the current char has repeated
    last_char = None
    out = []
    # NOTE: the original used enumerate() but never used the index.
    for c in x:
        if c == last_char:
            run_length += 1
        else:
            run_length = 0
        # keep only the first two characters of any run
        if run_length < 2:
            out.append(c)
        last_char = c
    return "".join(out)


import re
def scoring_f(bigram_model: LgramModel, tweet: List[str]) -> float:
    """
    Score a tweet by the mean per-letter bigram entropy of its tokens,
    after dropping tokens that carry little language signal (Twitter
    shorthand and elastic interjections) and capping repeated letters.

    NOTE(review): the original annotated the return type as bool, but a
    float score is returned; the threshold is applied in is_English().

    :param bigram_model: the bigram letter model trained on the Brown corpus
    :param tweet: the tweet, as a list of tokens
    :return: mean per-letter cross-entropy of the retained tokens, or
             float('inf') when no token survives filtering (the original
             raised ZeroDivisionError in that case)
    """
    blacklist = {"rt", "smh", "hbu", "idk", "afaik", "imho", "irl"}
    # Interjections with an elastic number of repeated letters;
    # compiled once here instead of re-matching raw strings per token.
    blacklist_patterns = [re.compile(p) for p in
                          ("lo+l+", "a+w+", "ya+y+", "o+m+g+", "lma+o+")]
    kept = []
    for token in tweet:
        token = rtc(token)  # cap letter runs at 2 before matching/scoring
        if token in blacklist:
            continue
        if any(p.fullmatch(token) for p in blacklist_patterns):
            continue
        kept.append(token)
    if not kept:
        # Nothing scoreable: treat as maximally surprising (non-English).
        return float('inf')
    return sum(bigram_model.entropy(token, perItem=True) for token in kept) / len(kept)

def boring_scoring(bigram_model, tweet):
    """Mean per-letter entropy over the tweet's tokens, with no filtering."""
    total = 0.0
    for token in tweet:
        total += bigram_model.entropy(token, perItem=True)
    return total / len(tweet)

def is_English(tweet, thresh=None):
    """
    Classify a tweet as English iff its filtered mean letter-bigram
    entropy (scoring_f) is below the threshold.

    :param tweet: the tweet, as a list of tokens
    :param thresh: decision threshold; defaults to the tuned 3.85
    :return: True if the tweet is classified as English
    """
    # Earlier tuning attempts, kept for the record:
    #   boring_scoring(lm, tweet) <= 3.85
    #   boring_scoring(lm, tweet) <= 3.31  # lower threshold needed to get dev data right
    #   boring_scoring(lm, tweet) < 6.182  # upper threshold needed to get dev data right
    # Use 'is None' (identity), not '== None', per PEP 8.
    return scoring_f(lm, tweet) < (3.85 if thresh is None else thresh)  # well-tuned threshold

def get_tweets(fn: str, n):
    """
    Yield tweets from a whitespace-tokenised, one-tweet-per-line file.

    NOTE(review): the original docstring claimed (tweet, bool) tuples;
    the function actually yields plain token lists.

    :param fn: path of the tweet file to read
    :param n: maximum number of tweets to yield (0 or negative = all)
    :rtype: generator(list(str))
    """
    # 'with' guarantees the file is closed even when the consumer
    # abandons the generator early; the original f.close() only ran
    # if the loop finished or hit the break.
    with open(fn) as f:
        for i, line in enumerate(f):
            if n > 0 and i == n:
                break
            yield line.split()

def eval(n1=0, n2=0, thresh=None):
    """
    Evaluate is_English() on the hand-labelled tweet files and print a
    per-class right/wrong table plus overall accuracy.

    NOTE(review): this shadows the builtin eval(); the name is kept for
    caller compatibility.

    :param n1: number of English tweets to test (0 = all)
    :param n2: number of non-English tweets to test (0 = all)
    :param thresh: optional threshold passed through to is_English
    """
    # Per-class right/wrong counters.  n_pos/n_neg renamed from the
    # original np/nn, which shadowed the module-level numpy import.
    fp = tp = fn = tn = 0
    n_pos = n_neg = 0
    for tweet in get_tweets("twitter/etweets.txt", n1):
        n_pos += 1
        if is_English(tweet, thresh):
            tp += 1       # English tweet, correctly accepted
        else:
            fp += 1       # English tweet, wrongly rejected
    for tweet in get_tweets("twitter/netweets.txt", n2):
        n_neg += 1
        if is_English(tweet, thresh):
            fn += 1       # non-English tweet, wrongly accepted
        else:
            tn += 1       # non-English tweet, correctly rejected
    # Output strings kept byte-identical to the original (including the
    # 'threshhold' spelling) so downstream log consumers still match.
    print("Testing on %s/%s tweets, threshhold %s"%('all' if n1==0 else n1,
                                                    'all' if n2==0 else n2,
                                              'default' if thresh is None else thresh))
    print("%6s %6s %6s %6s"%('','right','wrong','total'))
    print("%6s %6d %6d %6d"%('pos',tp,fp,n_pos))
    print("%6s %6d %6d %6d"%('neg',tn,fn,n_neg))
    print("%6s %6d %6d %6d"%('tot',tp+tn,fp+fn,n_pos+n_neg))
    print("Accuracy: %g"%(float(tp+tn)/float(n_pos+n_neg)))

# Script entry: evaluate on the first 100 English tweets and all
# non-English tweets, using the default threshold.
eval(100)