changeset 69:157f012ffab7 default tip

author Henry S Thompson <ht@inf.ed.ac.uk>
date Fri, 17 Jan 2025 15:45:26 +0000
parents eb91fd5d49b3
children
files classify_tweets.py plinks_jto.py twt.py
diffstat 3 files changed, 225 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/classify_tweets.py	Fri Jan 17 15:45:26 2025 +0000
@@ -0,0 +1,130 @@
+from collections import defaultdict, Counter
+from typing import Tuple, List, Any, Set, Dict, Callable
+
+import numpy as np  # for np.mean() and np.std()
+import nltk, sys, inspect
+import nltk.corpus.util
+from nltk import MaxentClassifier
+from nltk.corpus import brown, ppattach  # import corpora
+
+# Import LgramModel
+from nltk_model import *
+
+# Import the Twitter corpus
+from twitter.twitter import *
+
+twitter_file_ids = "20100128.txt"
+assert twitter_file_ids in xtwc.fileids()
+
+
+def train_LM(corpus: nltk.corpus.CorpusReader) -> LgramModel:
+    """
+    Build a bigram letter language model using LgramModel
+    based on the lower-cased all-alpha subset of the entire corpus
+
+    :param corpus: An NLTK corpus
+
+    :return: A padded letter bigram model based on nltk.model.NgramModel
+    """
+
+    # subset the corpus to only include all-alpha tokens,
+    # converted to lower-case (_after_ the all-alpha check)
+    corpus_tokens = [word.lower() for word in corpus.words() if word.isalpha()]
+
+    # Return a smoothed (using the default estimator), padded
+    #   bigram letter language model over those tokens
+    return LgramModel(2, corpus_tokens, True, True)
+
+
+lm = train_LM(brown)
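+
+# Minimal usage sketch (illustrative; not part of the original pipeline):
+# the model's per-letter cross-entropy for a single lower-cased all-alpha word,
+# the quantity that scoring_f and boring_scoring average over a tweet below.
+# e.g. lm.entropy("hello", perItem=True)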
+
+def rtc(x):
+    """Collapse any run of a repeated character in x to at most two occurrences."""
+    counter = 0
+    last_char = None
+    o = []
+    for c in x:
+        if c == last_char:
+            counter += 1
+        else:
+            counter = 0
+        if counter < 2:
+            o.append(c)
+        last_char = c
+    return "".join(o)
+
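+# Sanity check (illustrative): rtc collapses any run of a repeated character
+# down to at most two occurrences.
+assert rtc("yaaaay") == "yaay"
+assert rtc("loooool") == "lool"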
+
+import re
+def scoring_f(bigram_model: LgramModel, tweet: List[str]) -> float:
+    """
+    Score a tweet by its average per-letter entropy under the bigram letter
+    model, after dropping common non-English social-media tokens; lower
+    scores mean more English-like text (is_English thresholds this value).
+
+    :param bigram_model: the bigram letter model trained on the Brown corpus
+    :param tweet: the tweet, as a list of tokens
+    :return: the mean per-letter cross-entropy of the surviving tokens
+    """
+    blacklist = {"rt", "smh", "hbu", "idk", "afaik", "imho", "irl"}
+    blacklist_regex = ["lo+l+", "a+w+", "ya+y+", "o+m+g+", "lma+o+"]
+    prepro = []
+    for token in tweet:
+        # collapse character runs, then drop blacklisted tokens
+        token = rtc(token)
+        if token in blacklist:
+            continue
+        elif any(re.fullmatch(regex, token) for regex in blacklist_regex):
+            continue
+        else:
+            prepro.append(token)
+    tweet = prepro
+    if not tweet:
+        return float('inf')  # nothing left to score: treat as maximally non-English
+    return sum(bigram_model.entropy(token, perItem=True) for token in tweet) / len(tweet)
+
+def boring_scoring(bigram_model, tweet):
+    """Baseline: average per-letter entropy of the tweet, with no preprocessing."""
+    return sum(bigram_model.entropy(token, perItem=True) for token in tweet) / len(tweet)
+
+def is_English(tweet, thresh=None):
+    #return boring_scoring(lm, tweet) <= 3.85
+    #return boring_scoring(lm, tweet) <= 3.31 # lower threshold needed to get dev data right
+    #return boring_scoring(lm, tweet) < 6.182 # upper threshold needed to get dev data right
+    return scoring_f(lm, tweet) < (3.85 if thresh is None else thresh)  # well-tuned threshold
+
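+# Illustrative call (hypothetical tweet text, not drawn from the data files):
+# is_English("the cat sat on the mat".split()) should come out True, since the
+# per-letter entropy of ordinary English words falls below the 3.85 threshold.
+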
+def get_tweets(fn: str, n: int):
+    """
+    Yield up to n tweets (all of them if n == 0) from the file fn,
+    each as a list of whitespace-separated tokens.
+
+    :rtype: generator(list(str))
+    """
+    with open(fn) as f:
+        for i, l in enumerate(f):
+            if n > 0 and i == n:
+                break
+            yield l.split()
+
+def eval(n1=0, n2=0, thresh=None):
+    """Evaluate is_English on the positive (etweets) and negative (netweets) files."""
+    tp = fn = tn = fp = 0
+    n_pos = n_neg = 0
+    for tweet in get_tweets("twitter/etweets.txt", n1):
+        n_pos += 1
+        if is_English(tweet, thresh):
+            tp += 1
+        else:
+            fn += 1
+    for tweet in get_tweets("twitter/netweets.txt", n2):
+        n_neg += 1
+        if is_English(tweet, thresh):
+            fp += 1
+        else:
+            tn += 1
+    print("Testing on %s/%s tweets, threshold %s" % ('all' if n1 == 0 else n1,
+                                                     'all' if n2 == 0 else n2,
+                                                     'default' if thresh is None else thresh))
+    print("%6s %6s %6s %6s" % ('', 'right', 'wrong', 'total'))
+    print("%6s %6d %6d %6d" % ('pos', tp, fn, n_pos))
+    print("%6s %6d %6d %6d" % ('neg', tn, fp, n_neg))
+    print("%6s %6d %6d %6d" % ('tot', tp + tn, fn + fp, n_pos + n_neg))
+    print("Accuracy: %g" % (float(tp + tn) / float(n_pos + n_neg)))
+
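+# Illustrative alternative calls (beyond the single run below):
+#   eval()                # all positive and all negative tweets, default threshold
+#   eval(100, 100, 3.31)  # 100 tweets of each kind, the stricter threshold noted above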
+eval(100)
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/plinks_jto.py	Fri Jan 17 15:45:26 2025 +0000
@@ -0,0 +1,71 @@
+#!/usr/bin/python3
+# Needs PYTHONPATH=/group/ltg/projects/lcontrib_sl7/usr/lib/python3.4/site-packages
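+# Usage sketch (inferred from the flag handling below; flags are read in this order):
+#   plinks_jto.py [-v] [-r READ_SECS] [-t TEXT_SECS] [-x N] [-f] FILE.pdf
+# -v: debug logging; -r/-t: read / text-recovery timeouts in seconds;
+# -x N: time N runs with timeit; -f: print a flat, sorted list of links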
+import sys,pdfx,traceback
+
+import types
+# Sorting references (get_references(sort=True) below) compares Reference objects,
+# so give the class a Python-level __lt__ if it doesn't already have one.
+if not isinstance(getattr(pdfx.backends.Reference,'__lt__'),types.FunctionType):
+  def __lt__(self,other):
+    assert isinstance(other, pdfx.backends.Reference)
+    return self.ref < other.ref
+
+  pdfx.backends.Reference.__lt__=__lt__
+
+
+def run():
+  global pdf, limited
+  if sys.argv[1]=='-f':
+    # flatten
+    flatten=True
+    sys.argv.pop(1)
+  else:
+    flatten=False
+  try:
+    pdf=pdfx.PDFx(sys.argv[1],**limited)
+    if flatten:
+      links=pdf.get_references(sort=True)
+    else:
+      links=pdf.get_references_as_dict(sort=True)
+  except Exception:
+    traceback.print_exc()
+    print("\nFailed: %s"%sys.argv[1],file=sys.stderr)
+    exit(1)
+  if pdf.limited:
+    print("Timed out, no text or scraping",file=sys.stderr)
+  if flatten:
+    for l in links:
+      print(l)
+  else:
+    for k,ls in links.items():
+      for l in ls:
+        print("%s\t%s"%(k,l))
+
+limited={}
+if sys.argv[1]=='-v':
+  # verbose: log level debug
+  sys.argv.pop(1)
+  import logging
+  logging.basicConfig(level=logging.DEBUG,format='%(asctime)s %(message)s',
+                      datefmt='%m/%d/%Y %I:%M:%S %p')
+  logging.getLogger(name='stopit').addHandler(logging.StreamHandler(sys.stderr))
+  logging.getLogger(name='pdfx').addHandler(logging.StreamHandler(sys.stderr))
+  logging.getLogger(name='pdfminer').setLevel(logging.WARN)
+  
+if sys.argv[1]=='-r':
+  # timeout for reading
+  sys.argv.pop(1)
+  limited['readTimeout']=float(sys.argv.pop(1))
+if sys.argv[1]=='-t':
+  # timeout for text recovery
+  sys.argv.pop(1)
+  limited['textTimeout']=float(sys.argv.pop(1))
+
+if sys.argv[1]=='-x':
+  import timeit
+  sys.argv.pop(1)
+  n=sys.argv.pop(1)
+  print(timeit.timeit("run()",number=int(n),
+                      setup="from __main__ import run"),file=sys.stderr)
+else:
+  run()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/twt.py	Fri Jan 17 15:45:26 2025 +0000
@@ -0,0 +1,24 @@
+from twitter.twitter import *
+import cld2full
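+# cld2full.detect(utf-8 bytes) returns a result exposing .is_reliable and
+# .details[i].language_code for the top three guesses, as used below.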
+
+tt=xtwc.sents('20100128.txt')
+corp=[]
+
+# Keep only tweets with at least five lower-cased, all-alpha tokens
+for t in tt:
+    at=[w.lower() for w in t if w.isalpha()]
+    if len(at) >= 5:
+        corp.append((' '.join(at)).encode('utf8'))
+print(len(corp))
+
+# Tweets whose language detection is reliable
+rcorp=[r for r in ((cld2full.detect(t),t) for t in corp) if r[0].is_reliable]
+print(len(rcorp))
+
+# ... and whose top-ranked language is English
+mecorp=[r for r in rcorp if r[0].details[0].language_code=='en']
+print(len(mecorp))
+
+# ... with the second guess 'un' (unknown), i.e. confidently English only
+eecorp=[r for r in mecorp if r[0].details[1].language_code=='un']
+print(len(eecorp))
+
+# Reliably-detected tweets with English in none of the top three guesses
+necorp=[r for r in rcorp
+        if (r[0].details[0].language_code!='en')
+        and (r[0].details[1].language_code!='en')
+        and (r[0].details[2].language_code!='en')]
+print(len(necorp))
+