changeset 69:157f012ffab7 default tip
| author | Henry S Thompson <ht@inf.ed.ac.uk> |
| --- | --- |
| date | Fri, 17 Jan 2025 15:45:26 +0000 |
| parents | eb91fd5d49b3 |
| children | |
| files | classify_tweets.py plinks_jto.py twt.py |
| diffstat | 3 files changed, 225 insertions(+), 0 deletions(-) |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/classify_tweets.py	Fri Jan 17 15:45:26 2025 +0000
@@ -0,0 +1,130 @@
+from collections import defaultdict, Counter
+from typing import Tuple, List, Any, Set, Dict, Callable
+
+import numpy as np  # for np.mean() and np.std()
+import nltk, sys, inspect
+import nltk.corpus.util
+from nltk import MaxentClassifier
+from nltk.corpus import brown, ppattach  # import corpora
+
+# Import LgramModel
+from nltk_model import *
+
+# Import the Twitter corpus
+from twitter.twitter import *
+
+twitter_file_ids = "20100128.txt"
+assert twitter_file_ids in xtwc.fileids()
+
+
+def train_LM(corpus: nltk.corpus.CorpusReader) -> LgramModel:
+    """
+    Build a bigram letter language model using LgramModel
+    based on the lower-cased all-alpha subset of the entire corpus
+
+    :param corpus: An NLTK corpus
+
+    :return: A padded letter bigram model based on nltk.model.NgramModel
+    """
+
+    # subset the corpus to only include all-alpha tokens,
+    # converted to lower-case (_after_ the all-alpha check)
+    corpus_tokens = [word.lower() for word in corpus.words() if word.isalpha()]
+
+    # Return a smoothed (using the default estimator)
+    # padded bigram letter language model
+    return LgramModel(2, corpus_tokens, True, True)
+
+
+lm = train_LM(brown)
+
+def rtc(x):
+    # collapse runs of repeated characters to at most two, e.g. "coooool" -> "cool"
+    counter = 0
+    last_char = None
+    o = []
+    for i, c in enumerate(x):
+        if c == last_char:
+            counter += 1
+        else:
+            counter = 0
+        if counter < 2:
+            o.append(c)
+        last_char = c
+    return "".join(o)
+
+
+import re
+def scoring_f(bigram_model: LgramModel, tweet: List[str]) -> float:
+    """
+    Score the given tweet: average per-letter bigram entropy after filtering Twitter-specific tokens.
+
+    :param bigram_model: the bigram letter model trained on the Brown corpus
+    :param tweet: the tweet
+    :return: the average per-letter entropy of the filtered tweet (lower means more English-like)
+    """
+    blacklist = {"rt", "smh", "hbu", "idk", "afaik", "imho", "irl"}
+    blacklist_regex = ["lo+l+", "a+w+", "ya+y+", "o+m+g+", "lma+o+"]
+    prepro = []
+    for token in tweet:
+        token = rtc(token)
+        if token in blacklist:
+            continue
+        elif any(re.fullmatch(regex, token) for regex in blacklist_regex):
+            continue
+        else:
+            prepro.append(token)
+    tweet = prepro
+    return sum([bigram_model.entropy(token, perItem=True) for token in tweet]) / len(tweet)
+
+def boring_scoring(bigram_model, tweet):
+    return sum([bigram_model.entropy(token, perItem=True) for token in tweet]) / len(tweet)
+
+def is_English(tweet, thresh=None):
+    #return boring_scoring(lm, tweet) <= 3.85
+    #return boring_scoring(lm, tweet) <= 3.31  # lower threshold needed to get dev data right
+    #return boring_scoring(lm, tweet) < 6.182  # upper threshold needed to get dev data right
+    return scoring_f(lm, tweet) < (3.85 if thresh is None else thresh)  # well-tuned threshold
+
+def get_tweets(fn: str, n):
+    """
+    :rtype: generator(list(str))
+    :return: yields the first n (all, if n==0) whitespace-tokenised tweets
+             from file fn, one list of tokens per tweet
+    """
+    f=open(fn)
+    i=0
+    for l in f:
+        if n>0 and i==n:
+            break
+        yield l.split()
+        i+=1
+    f.close()
+
+def eval(n1=0, n2=0, thresh=None):
+    fp=tp=fn=tn=0
+    np=nn=0
+    for tweet in get_tweets("twitter/etweets.txt", n1):
+        np+=1
+        if is_English(tweet, thresh):
+            tp+=1
+        else:
+            fp+=1
+    for tweet in get_tweets("twitter/netweets.txt", n2):
+        nn+=1
+        if is_English(tweet, thresh):
+            fn+=1
+        else:
+            tn+=1
+    print("Testing on %s/%s tweets, threshold %s"%('all' if n1==0 else n1,
+                                                   'all' if n2==0 else n2,
+                                                   'default' if thresh is None else thresh))
+    print("%6s %6s %6s %6s"%('','right','wrong','total'))
+    print("%6s %6d %6d %6d"%('pos',tp,fp,np))
+    print("%6s %6d %6d %6d"%('neg',tn,fn,nn))
+    print("%6s %6d %6d %6d"%('tot',tp+tn,fp+fn,np+nn))
+    print("Accuracy: %g"%(float(tp+tn)/float(np+nn)))
+
+eval(100)
+
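classify_tweets.py calls a tweet English when its average per-letter bigram entropy, under a letter model trained on the Brown corpus, falls below a threshold (3.85 bits by default). The following is a minimal, self-contained sketch of that idea, not the LgramModel pipeline above: it uses plain add-one-smoothed bigram counts, and the toy training words, vocabulary constant and example tokens are illustrative assumptions only.

import math
from collections import Counter

def train_char_bigrams(words):
    # Count padded character bigrams (and their left contexts) over lower-cased words.
    bigrams, contexts = Counter(), Counter()
    for w in words:
        padded = "<" + w + ">"                    # simple start/end padding
        for a, b in zip(padded, padded[1:]):
            bigrams[(a, b)] += 1
            contexts[a] += 1
    return bigrams, contexts

def per_letter_entropy(token, bigrams, contexts, vocab_size=28):
    # Average -log2 P(next letter | previous letter), add-one smoothed;
    # vocab_size is a rough constant (26 letters plus the two pad symbols).
    padded = "<" + token.lower() + ">"
    cost = 0.0
    for a, b in zip(padded, padded[1:]):
        p = (bigrams[(a, b)] + 1) / (contexts[a] + vocab_size)
        cost += -math.log2(p)
    return cost / (len(padded) - 1)

toy_english = ["the", "cat", "sat", "on", "the", "mat", "and", "then", "ran"]
bi, ctx = train_char_bigrams(toy_english)

def looks_english(tweet_tokens, thresh=3.85):     # same thresholding shape as is_English above
    scores = [per_letter_entropy(t, bi, ctx) for t in tweet_tokens]
    return sum(scores) / len(scores) < thresh

print(looks_english(["that", "cat", "ran"]))      # low entropy under the toy model
print(looks_english(["zzqx", "wvvk"]))            # high entropy: unseen letter pairs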
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/plinks_jto.py	Fri Jan 17 15:45:26 2025 +0000
@@ -0,0 +1,71 @@
+#!/usr/bin/python3
+# Needs PYTHONPATH=/group/ltg/projects/lcontrib_sl7/usr/lib/python3.4/site-packages
+import sys,pdfx,traceback
+
+import types
+if not isinstance(getattr(pdfx.backends.Reference,'__lt__'),types.FunctionType):
+    def __lt__(self,other):
+        assert isinstance(other, pdfx.backends.Reference)
+        return self.ref < other.ref
+
+    pdfx.backends.Reference.__lt__=__lt__
+
+E=None
+
+def run():
+    global pdf, limited
+    if sys.argv[1]=='-f':
+        # flatten
+        flatten=True
+        sys.argv.pop(1)
+    else:
+        flatten=False
+    try:
+        pdf=pdfx.PDFx(sys.argv[1],**limited)
+        if flatten:
+            links=pdf.get_references(sort=True)
+        else:
+            links=pdf.get_references_as_dict(sort=True)
+    except:
+        traceback.print_exc()
+        print("\nFailed: %s"%sys.argv[1],E,file=sys.stderr)
+        exit(1)
+    if pdf.limited:
+        print("Timed out, no text or scraping",file=sys.stderr)
+    if flatten:
+        for l in links:
+            print(l)
+    else:
+        for k in links.keys():
+            for l in links[k]:
+                print("%s\t%s"%(k,l))
+
+limited={}
+if sys.argv[1]=='-v':
+    # verbose: log level debug
+    sys.argv.pop(1)
+    import logging
+    logging.basicConfig(level=logging.DEBUG,format='%(asctime)s %(message)s',
+                        datefmt='%m/%d/%Y %I:%M:%S %p')
+    logging.getLogger(name='stopit').addHandler(logging.StreamHandler(sys.stderr))
+    logging.getLogger(name='pdfx').addHandler(logging.StreamHandler(sys.stderr))
+    logging.getLogger(name='pdfminer').setLevel(logging.WARN)
+
+if sys.argv[1]=='-r':
+    # timeout for reading
+    sys.argv.pop(1)
+    limited['readTimeout']=float(sys.argv.pop(1))
+if sys.argv[1]=='-t':
+    # timeout for text recovery
+    sys.argv.pop(1)
+    limited['textTimeout']=float(sys.argv.pop(1))
+
+if sys.argv[1]=='-x':
+    import timeit
+    sys.argv.pop(1)
+    n=sys.argv[1]
+    sys.argv.pop(1)
+    print(timeit.timeit("run()",number=int(n),
+                        setup="from __main__ import run"),file=sys.stderr)
+else:
+    run()
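For orientation, plinks_jto.py is a command-line wrapper around pdfx that prints the references/links found in a PDF. The flags are consumed positionally, in the order the script tests them: -v first, then -r/-t, then -x, with -f handled inside run(). The invocations below are illustrative only; the PDF file name and timeout values are hypothetical.

python3 plinks_jto.py paper.pdf                   # grouped output, one "key<TAB>link" line per reference
python3 plinks_jto.py -f paper.pdf                # flat, sorted list of references
python3 plinks_jto.py -v -r 30 -t 60 paper.pdf    # debug logging, 30s read timeout, 60s text-recovery timeout
python3 plinks_jto.py -x 5 paper.pdf              # time 5 calls of run() with timeit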
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/twt.py	Fri Jan 17 15:45:26 2025 +0000
@@ -0,0 +1,24 @@
+from twitter.twitter import *
+import cld2full
+
+tt=xtwc.sents('20100128.txt')
+corp=[]
+
+for t in tt:
+    at=[w.lower() for w in t if w.isalpha()]
+    if len(at) >= 5:
+        corp.append((' '.join(at)).encode('utf8'))
+len(corp)
+
+rcorp=[r for r in ((cld2full.detect(t),t) for t in corp) if r[0].is_reliable]
+len(rcorp)
+
+mecorp=[r for r in rcorp if r[0].details[0].language_code=='en']
+len(mecorp)
+
+eecorp=[r for r in mecorp if r[0].details[1].language_code=='un']
+len(eecorp)
+
+necorp=[r for r in rcorp if (r[0].details[0].language_code!='en') and (r[0].details[1].language_code!='en') and (r[0].details[2].language_code!='en')]
+len(necorp)
+
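twt.py reads like an interactive-session transcript: it builds a corpus of lower-cased, all-alpha tweets of at least five tokens, runs cld2 language identification over them, and splits the reliable detections into buckets (top language English; English with no second language; no English among the top three guesses). Below is a minimal, self-contained sketch of the same bucketing with the detector stubbed out; the detect stand-in, its namedtuple shapes and the two sample sentences are assumptions for illustration, not the cld2full API.

from collections import namedtuple

# Hypothetical stand-ins for the shape of a cld2-style result: an object with
# .is_reliable and .details, where each detail carries a .language_code.
Detection = namedtuple("Detection", "is_reliable details")
Detail = namedtuple("Detail", "language_code")

def detect(text):
    # Toy detector (NOT cld2full.detect): call pure-ASCII byte strings English,
    # anything else French, and always report the result as reliable.
    is_english = all(b < 128 for b in text)
    top = Detail("en") if is_english else Detail("fr")
    return Detection(True, (top, Detail("un"), Detail("un")))

corp = ["the cat sat on the mat".encode("utf8"),
        "le chat était sur le tapis".encode("utf8")]

rcorp = [r for r in ((detect(t), t) for t in corp) if r[0].is_reliable]  # reliable detections only
mecorp = [r for r in rcorp if r[0].details[0].language_code == "en"]     # top guess is English
eecorp = [r for r in mecorp if r[0].details[1].language_code == "un"]    # English with no second language
necorp = [r for r in rcorp                                               # no English in the top three guesses
          if all(d.language_code != "en" for d in r[0].details[:3])]
print(len(rcorp), len(mecorp), len(eecorp), len(necorp))                 # 2 1 1 1 with these toy inputs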