69
|
1 from collections import defaultdict, Counter
|
|
2 from typing import Tuple, List, Any, Set, Dict, Callable
|
|
3
|
|
4 import numpy as np # for np.mean() and np.std()
|
|
5 import nltk, sys, inspect
|
|
6 import nltk.corpus.util
|
|
7 from nltk import MaxentClassifier
|
|
8 from nltk.corpus import brown, ppattach # import corpora
|
|
9
|
|
10 # Import LgramModel
|
|
11 from nltk_model import *
|
|
12
|
|
13 # Import the Twitter corpus
|
|
14 from twitter.twitter import *
|
|
15
|
|
# Single Twitter data file used throughout this script.
twitter_file_ids = "20100128.txt"
# Sanity check: the file must be present in the xtwc Twitter corpus.
# NOTE(review): assert is stripped under `python -O`; fine for a lab script.
assert twitter_file_ids in xtwc.fileids()
|
|
18
|
|
19
|
|
def train_LM(corpus: nltk.corpus.CorpusReader) -> LgramModel:
    """
    Build a bigram letter language model using LgramModel,
    trained on the lower-cased all-alpha subset of the entire corpus.

    :param corpus: An NLTK corpus
    :return: A padded letter bigram model based on nltk.model.NgramModel
    """
    # Keep only purely alphabetic tokens, lower-casing each one
    # (the isalpha check is applied to the original-case form).
    alpha_words = filter(str.isalpha, corpus.words())
    training_tokens = list(map(str.lower, alpha_words))

    # Smoothed (default estimator), padded bigram letter model.
    return LgramModel(2, training_tokens, True, True)
|
|
37
|
|
38
|
|
39 lm = train_LM(brown)
|
|
40
|
|
def rtc(x):
    """
    Reduce runs of three or more identical consecutive characters in x
    to exactly two ("looool" -> "lool"), normalising elongated spellings.

    :param x: input string
    :return: string with each character run truncated to length <= 2
    """
    run_length = 0
    last_char = None
    out = []
    # Iterate characters directly: the original enumerate() index was unused.
    for c in x:
        if c == last_char:
            run_length += 1
        else:
            run_length = 0
        # Keep only the first two characters of any run.
        if run_length < 2:
            out.append(c)
        last_char = c
    return "".join(out)
|
|
54
|
|
55
|
|
56 import re
|
|
def scoring_f(bigram_model: LgramModel, tweet: List[str]) -> float:
    """
    Score a tweet by the average per-letter cross-entropy of its tokens
    under the letter bigram model, after filtering common non-lexical
    Twitter tokens. Lower scores are more English-like; is_English()
    thresholds this value.

    (Fixed: the original annotated/documented a bool return, but the
    function returns the numeric score, not a classification.)

    :param bigram_model: the bigram letter model trained on the Brown corpus
    :param tweet: the tweet, as a list of tokens
    :return: average per-letter entropy over the retained tokens;
        float('inf') if every token was filtered out
    """
    # Abbreviations that carry no useful letter-bigram signal.
    blacklist = {"rt", "smh", "hbu", "idk", "afaik", "imho", "irl"}
    # Elongated interjections, matched after run truncation; compile once
    # per call rather than once per token.
    blacklist_patterns = [re.compile(p)
                          for p in ("lo+l+", "a+w+", "ya+y+", "o+m+g+", "lma+o+")]
    kept = []
    for token in tweet:
        token = rtc(token)  # collapse runs of 3+ identical chars to 2
        if token in blacklist:
            continue
        if any(p.fullmatch(token) for p in blacklist_patterns):
            continue
        kept.append(token)
    if not kept:
        # Avoid ZeroDivisionError when filtering removes every token;
        # +inf means "maximally un-English", so is_English() returns False.
        return float("inf")
    return sum(bigram_model.entropy(t, perItem=True) for t in kept) / len(kept)
|
|
78
|
|
def boring_scoring(bigram_model, tweet):
    """Average per-letter entropy of the tweet's tokens under bigram_model
    (no preprocessing — the 'boring' baseline scorer)."""
    entropies = [bigram_model.entropy(tok, perItem=True) for tok in tweet]
    return sum(entropies) / len(entropies)
|
|
81
|
|
def is_English(tweet, thresh=None):
    """
    Classify a tokenized tweet as English or not by thresholding its
    average per-letter entropy under the Brown-trained letter LM.

    :param tweet: the tweet, as a list of tokens
    :param thresh: entropy threshold; None uses the well-tuned default 3.85
        (earlier candidates from dev-data tuning were 3.31 and 6.182)
    :return: True if the tweet scores below the threshold
    """
    # 'is None' rather than '== None' (identity test for the sentinel).
    threshold = 3.85 if thresh is None else thresh
    return scoring_f(lm, tweet) < threshold
|
|
87
|
|
def get_tweets(fn: str, n=0):
    """
    Yield whitespace-tokenized tweets, one per line of file fn.

    (Fixed: the original docstring claimed (tweet, label) tuples were
    returned; only token lists are yielded. Labels come from which file
    is read.)

    :param fn: path to a file with one tweet per line
    :param n: maximum number of tweets to yield; 0 (or negative) means all
    :rtype: generator(list(str))
    """
    # 'with' guarantees the file is closed even if the consumer abandons
    # the generator early (the original leaked the handle in that case).
    with open(fn) as f:
        for i, line in enumerate(f):
            if n > 0 and i == n:
                break
            yield line.split()
|
|
102
|
|
def eval(n1=0, n2=0, thresh=None):
    """
    Evaluate is_English() on labelled tweet files and print a confusion
    table plus overall accuracy.

    NOTE(review): the name shadows the builtin eval(); kept unchanged so
    existing callers still work.

    :param n1: number of English tweets to test on (0 = all)
    :param n2: number of non-English tweets to test on (0 = all)
    :param thresh: threshold forwarded to is_English (None = its default)
    """
    fp = tp = fn = tn = 0
    # Renamed from np/nn: the originals shadowed the numpy import 'np'.
    n_pos = n_neg = 0
    for tweet in get_tweets("twitter/etweets.txt", n1):
        n_pos += 1
        if is_English(tweet, thresh):
            tp += 1
        else:
            fp += 1  # English tweet judged non-English
    for tweet in get_tweets("twitter/netweets.txt", n2):
        n_neg += 1
        if is_English(tweet, thresh):
            fn += 1  # non-English tweet judged English
        else:
            tn += 1
    # "threshold" typo fixed in the report line.
    print("Testing on %s/%s tweets, threshold %s" % ('all' if n1 == 0 else n1,
                                                     'all' if n2 == 0 else n2,
                                                     'default' if thresh is None else thresh))
    print("%6s %6s %6s %6s" % ('', 'right', 'wrong', 'total'))
    print("%6s %6d %6d %6d" % ('pos', tp, fp, n_pos))
    print("%6s %6d %6d %6d" % ('neg', tn, fn, n_neg))
    print("%6s %6d %6d %6d" % ('tot', tp + tn, fp + fn, n_pos + n_neg))
    print("Accuracy: %g" % (float(tp + tn) / float(n_pos + n_neg)))
|
|
126
|
|
127 eval(100)
|
|
128
|
|
129
|
|
130
|