changeset 0:fee51ab07d09
blanket publication of all existing python files in lib/python on maritain
author    Henry S. Thompson <ht@inf.ed.ac.uk>
date      Mon, 09 Mar 2020 14:58:04 +0000
parents
children  0a3abe59e364
files     bobi.py boxi.py decrypt.py modify.py nag.py ngram.py ngram_3.0.py nono.py pdfCrawl.py ptrace.py signif.py strace_summarise.py twitter.py update.py withdraw.py
diffstat  15 files changed, 1684 insertions(+), 0 deletions(-)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bobi.py Mon Mar 09 14:58:04 2020 +0000 @@ -0,0 +1,74 @@ +#!/bin/python +from sys import stdin +from urllib2 import Request,urlopen, HTTPError + +l='' +year='2015' +uuns={} + +def cc(names): + return ' '.join(map(lambda n:n[0]+n[1:].lower(),names.split())) + +while l=='': + l=stdin.readline().rstrip() +oldf=l.find("ILCC PGR live applications: ")==0 +if ((not oldf) and + l.find("EUCLID Admissions core applicant data - applications for next session")!=0 and + l.find("HST PGR applicants incl Beihang")!=0): + print "Not what I was expecting: %s"%l + exit(1) +while l.find('UUN\t')<0: + l=stdin.readline().rstrip() +for l in stdin: + l=l.rstrip().decode('latin-1'); + if l=='': + continue + + ff=l.split('\t') + if oldf: + if len(ff)==9: + (uun,surname,forenames,cat,stat,dec,entry,email,country)=ff + pgrm='PRPHDLNGCC1F' + nat='Unknown' + else: + print "Bad old-style input: %s"%('|'.join(l.split('\t'))) + continue + else: + if len(ff)==11: + (nat,stat,pgrm,dec,surname,forenames,entry,cat,uun,email,country)=ff + else: + print "Bad new-style input: %s"%('|'.join(l.split('\t'))) + continue + if uun in uuns: + print "!!! Careful !!!: %s seen before today with status %s, now %s"%(uun,uuns[uun],stat) + else: + uuns[uun]=stat + surname=cc(surname) + forenames=cc(forenames) + if pgrm=='PRPHDLNGCC1F': + ptype='I' + else: + ptype='B' + req='<app year="%s" uun="%s" type="PHD %s" surname="%s" forenames="%s" cat="%s" stat="%s" decision="%s" pgm="PhD ILCC" entry="%s" email="%s" country="%s" nationality="%s"/>'%(year,uun,ptype,surname,forenames,cat,stat,dec,entry,email,country,nat) + #print req.encode('iso-8859-1') + #continue + r=Request("http://localhost:8080/exist/apps/phd/new-app-maybe.xq", + req.encode('utf-8'),headers={'Content-Type':'application/xml;charset=UTF-8'}) + try: + res=urlopen(r) + except HTTPError as err: + print "Error:",err.read() + print req + exit(1) + res=res.read() + print ptype,res + if (not oldf) and res.find("<div>We already")==0: + req='<update year="%s" uun="%s" nationality="%s"/>'%(year,uun,nat) + r=Request("http://localhost:8080/exist/apps/phd/updateApp.xq", + req.encode('utf-8'),headers={'Content-Type':'application/xml;charset=UTF-8'}) + try: + res=urlopen(r) + except HTTPError as err: + print "Error:",err.read() + print req + exit(1)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/boxi.py Mon Mar 09 14:58:04 2020 +0000 @@ -0,0 +1,57 @@ +#!/bin/python +from sys import stdin +from urllib2 import Request,urlopen, HTTPError +from base64 import b64encode +import re +type='xyzzy' +l='' +year='2014' +tsplit=re.compile('\t\t*') + +def cc(names): + return ' '.join(map(lambda n:n[0]+n[1:].lower(),names.split())) + +while l=='': + l=stdin.readline().rstrip() +if l!="\t\tPG Applications List for ILCC+Henry S. Thompson+ht+Informatics_Amended : 456344": + print "Not what I was expecting: %s"%l + exit(1) +for l in stdin: + l=l.rstrip().decode('latin-1'); + if l=='': + continue + if l.find('Count:')==0: + exit() + if l.find('Pgm Code')==0: + continue + if l.find(type)==0: + continue + + ff=l.split('\t') + if len(ff)==9: + (tf,uun,name,cat,stat,email,country,pgm,entry)=ff + elif len(ff)==8: + (tf,uun,name,cat,stat,email,pgm,entry)=ff + country="" + else: + print "Bad input: %s"%('|'.join(l.split('\t'))) + continue + if tf!='': + type=tf + #if stat not in ('SD','SP'): + # continue + (sn,fn)=name.split(", ") + surname=cc(sn) + forenames=cc(fn) + req='<app year="%s" uun="%s" type="%s" surname="%s" forenames="%s" cat="%s" stat="%s" pgm="%s" entry="%s" email="%s" country="%s"/>'%(year,uun,type,surname,forenames,cat,stat,pgm,entry,email,country) + #print req.encode('iso-8859-1') + #continue + r=Request("http://localhost:8080/exist/apps/phd/new-app-maybe.xq", + req.encode('utf-8'),headers={'Content-Type':'application/xml;charset=UTF-8'}) + try: + res=urlopen(r) + except HTTPError as err: + print "Error:",err.read() + print req + exit(1) + print res.read()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/decrypt.py Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+# decrypt tp-link config.bin file
+# coded by root@kev7n.com
+
+from Crypto.Cipher import DES
+from hashlib import md5
+import sys
+
+# backup your config.bin from 192.168.x.1
+# usage find PPPOE account
+# ./run.py wan_ppp_usr
+# keys: wan_ppp_usr,wan_ppp_pwd
+
+
+key = '\x47\x8D\xA5\x0B\xF9\xE3\xD2\xCF'
+crypto = DES.new(key, DES.MODE_ECB)
+
+if len(sys.argv)>1:
+    f=sys.argv[1]
+else:
+    f='conf.bin'
+
+data = open(f, 'rb').read()
+data_decrypted = crypto.decrypt(data).rstrip('\0')
+assert data_decrypted[:16] == md5(data_decrypted[16:]).digest()
+data_decrypted_finally = data_decrypted[16:]
+data_decrypted_dict = {}
+data_decrypted_array = data_decrypted_finally.split('\r\n')
+for item in data_decrypted_array:
+    if not item:
+        continue
+    item_array = item.split(' ', 1)
+    item_key = item_array[0]
+    item_value = item_array[1]
+    data_decrypted_dict[item_key] = item_value
+
+sys.stdout.write(data_decrypted_finally)
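Note: decrypt.py builds data_decrypted_dict but then only writes the raw plaintext to stdout; the per-key lookup hinted at in its usage comment (./run.py wan_ppp_usr) is not actually implemented. A minimal sketch of that lookup as a separate filter might look like the following; the script name and command-line convention here are assumptions, not part of the changeset.

#!/usr/bin/env python
# Hypothetical companion filter: read the decrypted plaintext (e.g. piped
# from decrypt.py) on stdin and print the values of the requested keys.
import sys

def config_to_dict(plaintext):
    # the decrypted config is CRLF-separated "<key> <value>" lines
    d = {}
    for item in plaintext.split('\r\n'):
        parts = item.split(' ', 1)
        if len(parts) == 2:
            d[parts[0]] = parts[1]
    return d

if __name__ == '__main__':
    conf = config_to_dict(sys.stdin.read())
    for wanted in sys.argv[1:]:       # e.g. wan_ppp_usr wan_ppp_pwd
        print(wanted + ' = ' + conf.get(wanted, '(not present)'))

Invocation would then be something like: python decrypt.py conf.bin | python lookup.py wan_ppp_usr wan_ppp_pwd (lookup.py being the hypothetical file above).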
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/modify.py Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,36 @@
+#!/bin/python
+# Usage: modify.py uun fields...
+from sys import stdin,argv
+from urllib2 import Request,urlopen, HTTPError
+
+l=''
+year='2014'
+uuns={}
+
+def cc(names):
+    return ' '.join(map(lambda n:n[0]+n[1:].lower(),names.split()))
+
+eargs=['uun']
+eargs.extend(argv[1:])
+
+for l in stdin:
+    l=l.rstrip().decode('latin-1');
+    if l=='':
+        continue
+    try: #uun,...
+        vals=l.split("\t")
+    except ValueError:
+        print "Bad input: %s"%l
+        continue
+    attrs=" ".join(map(lambda (n,v):'%s="%s"'%(n,v),zip(eargs,vals)))
+    req='<update year="%s" %s/>'%(year,attrs)
+    print req
+    r=Request("http://localhost:8080/exist/apps/phd/updateApp.xq",
+              req.encode('utf-8'),headers={'Content-Type':'application/xml;charset=UTF-8'})
+    try:
+        res=urlopen(r)
+    except HTTPError as err:
+        print "Error:",err.read()
+        print req
+        exit(1)
+    print res.read()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nag.py Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,24 @@
+#!/bin/python
+# Create a Clockwork object using your API key
+from clockwork import clockwork
+from sys import stdin
+from rfc822 import Message
+
+msg=Message(stdin,False)
+
+frm=msg.get('from')
+if (frm!="nagios@nagios2.skywalker.privatedns.com" and frm!='"Henry S. Thompson" <ht@inf.ed.ac.uk>'):
+    print "SMS not from nagios: %s"%frm
+    exit(1)
+
+api = clockwork.API("0a778e372c3582eeef36b5f7f580113067e82d76")
+message = clockwork.SMS( to = "447866471388",
+                         message = msg.fp.read(),
+                         from_name="Nagios")
+response = api.send(message)
+
+if response.success:
+    print "SMS sent %s"%response.id
+else:
+    print "SMS failed %s: %s"%(response.error_code,response.error_description)
+    exit(2)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngram.py Mon Mar 09 14:58:04 2020 +0000 @@ -0,0 +1,345 @@ +# Natural Language Toolkit: Language Models +# +# Copyright (C) 2001-2009 NLTK Project +# Author: Steven Bird <sb@csse.unimelb.edu.au> +# URL: <http://www.nltk.org/> +# For license information, see LICENSE.TXT + +import random, types +from itertools import chain +from math import log + +from nltk.probability import (ConditionalProbDist, ConditionalFreqDist, + MLEProbDist, FreqDist) +try: + from nltk.util import ingrams +except: + from nltkx.util import ingrams + +from api import * + +class NgramModel(ModelI): + """ + A processing interface for assigning a probability to the next word. + """ + + def __init__(self, n, train, pad_left=False, pad_right=False, + estimator=None, *estimator_args, **estimator_kwargs): + """ + Creates an ngram language model to capture patterns in n consecutive + words of training text. An estimator smooths the probabilities derived + from the text and may allow generation of ngrams not seen during + training. + + @param n: the order of the language model (ngram size) + @type n: C{int} + @param train: the training text + @type train: C{list} of C{list} of C{string} + @param estimator: a function for generating a probability distribution + @type estimator: a function that takes a C{ConditionalFreqDist} and + returns a C{ConditionalProbDist} + @param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings + @type pad_left: bool + @param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings + @type pad_right: bool + @param estimator_args: Extra arguments for estimator. + These arguments are usually used to specify extra + properties for the probability distributions of individual + conditions, such as the number of bins they contain. + Note: For backward-compatibility, if no arguments are specified, the + number of bins in the underlying ConditionalFreqDist are passed to + the estimator as an argument. + @type estimator_args: (any) + @param estimator_kwargs: Extra keyword arguments for the estimator + @type estimator_kwargs: (any) + """ + # protection from cryptic behavior for calling programs + # that use the pre-2.0.2 interface + assert(isinstance(pad_left, bool)) + assert(isinstance(pad_right, bool)) + + self._n = n + self._W = len(train) + self._lpad = ('<s>',) * (n - 1) if pad_left else () + # Need _rpad even for unigrams or padded entropy will give + # wrong answer because '' will be treated as unseen... 
+ self._rpad = ('</s>',) * (max(1,(n - 1))) if pad_right else () + self._padLen = len(self._lpad)+len(self._rpad) + + self._N=0 + delta = 1+self._padLen-n # len(sent)+delta == ngrams in sent + + if estimator is None: + assert (estimator_args is None) and (estimator_kwargs is None),\ + "estimator_args or _kwargs supplied, but no estimator" + estimator = lambda fdist, bins: MLEProbDist(fdist) + + # Given backoff, a generator isn't acceptable + if isinstance(train,types.GeneratorType): + train=list(train) + + if n == 1: + if pad_right: + sents=(chain(s,self._rpad) for s in train) + else: + sents=train + fd=FreqDist() + for s in sents: + fd.update(s) + if not estimator_args and not estimator_kwargs: + self._model = estimator(fd,fd.B()) + else: + self._model = estimator(fd,fd.B(), + *estimator_args, **estimator_kwargs) + self._N=fd.N() + else: + cfd = ConditionalFreqDist() + self._ngrams = set() + + for sent in train: + self._N+=len(sent)+delta + for ngram in ingrams(chain(self._lpad, sent, self._rpad), n): + self._ngrams.add(ngram) + context = tuple(ngram[:-1]) + token = ngram[-1] + cfd[context][token]+=1 + if not estimator_args and not estimator_kwargs: + self._model = ConditionalProbDist(cfd, estimator, len(cfd)) + else: + self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs) + + # recursively construct the lower-order models + if n > 1: + self._backoff = NgramModel(n-1, train, pad_left, pad_right, + estimator, *estimator_args, **estimator_kwargs) + + # Code below here in this method, and the _words_following and _alpha method, are from + # http://www.nltk.org/_modules/nltk/model/ngram.html "Last updated on Feb 26, 2015" + self._backoff_alphas = dict() + # For each condition (or context) + #print cfd,cfd.conditions() + for ctxt in cfd.conditions(): + backoff_ctxt = ctxt[1:] + backoff_total_pr = 0.0 + total_observed_pr = 0.0 + + # this is the subset of words that we OBSERVED following + # this context. + # i.e. Count(word | context) > 0 + wf=list(self._words_following(ctxt, cfd)) + for word in self._words_following(ctxt, cfd): + total_observed_pr += self.prob(word, ctxt) + # we also need the total (n-1)-gram probability of + # words observed in this n-gram context + backoff_total_pr += self._backoff.prob(word, backoff_ctxt) + assert (0 <= total_observed_pr <= 1),\ + "sum of probs for %s out of bounds: %s"%(ctxt,total_observed_pr) + # beta is the remaining probability weight after we factor out + # the probability of observed words. + # As a sanity check, both total_observed_pr and backoff_total_pr + # must be GE 0, since probabilities are never negative + beta = 1.0 - total_observed_pr + + # if backoff total is 1, that should mean that all samples occur in this context, + # so we will never back off. + # Greater than 1 is an error. + assert (0 <= backoff_total_pr < 1), \ + "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr) + alpha_ctxt = beta / (1.0 - backoff_total_pr) + + self._backoff_alphas[ctxt] = alpha_ctxt + + def _words_following(self, context, cond_freq_dist): + return cond_freq_dist[context].iterkeys() + # below from http://www.nltk.org/_modules/nltk/model/ngram.html, + # depends on new CFD??? + #for ctxt, word in cond_freq_dist.iterkeys(): + # if ctxt == context: + # yield word + + def prob(self, word, context, verbose=False): + """ + Evaluate the probability of this word in this context + using Katz Backoff. 
+ """ + assert(isinstance(word,types.StringTypes)) + context = tuple(context) + if self._n==1: + if not(self._model.SUM_TO_ONE): + # Smoothing models should do the right thing for unigrams + # even if they're 'absent' + return self._model.prob(word) + else: + try: + return self._model.prob(word) + except: + raise RuntimeError("No probability mass assigned" + "to unigram %s" % (word)) + if context + (word,) in self._ngrams: + return self[context].prob(word) + else: + alpha=self._alpha(context) + if alpha>0: + if verbose: + print "backing off for %s"%(context+(word,),) + return alpha * self._backoff.prob(word, context[1:],verbose) + else: + if verbose: + print "no backoff for %s as model doesn't do any smoothing"%word + return alpha + + def _alpha(self, context,verbose=False): + """Get the backoff alpha value for the given context + """ + error_message = "Alphas and backoff are not defined for unigram models" + assert (not self._n == 1), error_message + + if context in self._backoff_alphas: + res = self._backoff_alphas[context] + else: + res = 1 + if verbose: + print " alpha: %s = %s"%(context,res) + return res + + + def logprob(self, word, context,verbose=False): + """ + Evaluate the (negative) log probability of this word in this context. + """ + + return -log(self.prob(word, context,verbose), 2) + + # NB, this will always start with same word since model + # is trained on a single text + def generate(self, num_words, context=()): + '''Generate random text based on the language model.''' + text = list(context) + for i in range(num_words): + text.append(self._generate_one(text)) + return text + + def _generate_one(self, context): + context = (self._prefix + tuple(context))[-self._n+1:] + # print "Context (%d): <%s>" % (self._n, ','.join(context)) + if context in self: + return self[context].generate() + elif self._n > 1: + return self._backoff._generate_one(context[1:]) + else: + return '.' + + def entropy(self, text, pad_left=False, pad_right=False, + verbose=False, perItem=False): + """ + Evaluate the total entropy of a text with respect to the model. + This is the sum of the log probability of each word in the message. + """ + # This version takes account of padding for greater accuracy + e = 0.0 + for ngram in ngrams(chain(self._lpad, text, self._rpad), self._n): + context = tuple(ngram[:-1]) + token = ngram[-1] + cost=self.logprob(token, context, verbose) # _negative_ + # log2 prob == cost! 
+ if verbose: + print "p(%s|%s) = [%s-gram] %7f"%(token,context,self._n,2**-cost) + e += cost + if perItem: + return e/((len(text)+self._padLen)-(self._n - 1)) + else: + return e + + def dump(self, file, logBase=None, precision=7): + """Dump this model in SRILM/ARPA/Doug Paul format + + Use logBase=10 and the default precision to get something comparable + to SRILM ngram-model -lm output + @param file to dump to + @type file file + @param logBase If not None, output logBases to the specified base + @type logBase int|None""" + file.write('\n\\data\\\n') + self._writeLens(file) + self._writeModels(file,logBase,precision,None) + file.write('\\end\\\n') + + def _writeLens(self,file): + if self._n>1: + self._backoff._writeLens(file) + file.write('ngram %s=%s\n'%(self._n, + sum(len(self._model[c].samples())\ + for c in self._model.keys()))) + else: + file.write('ngram 1=%s\n'%len(self._model.samples())) + + + def _writeModels(self,file,logBase,precision,alphas): + if self._n>1: + self._backoff._writeModels(file,logBase,precision,self._backoff_alphas) + file.write('\n\\%s-grams:\n'%self._n) + if self._n==1: + self._writeProbs(self._model,file,logBase,precision,(),alphas) + else: + for c in sorted(self._model.conditions()): + self._writeProbs(self._model[c],file,logBase,precision, + c,alphas) + + def _writeProbs(self,pd,file,logBase,precision,ctxt,alphas): + if self._n==1: + for k in sorted(pd.samples()+['<unk>','<s>']): + if k=='<s>': + file.write('-99') + elif k=='<unk>': + _writeProb(file,logBase,precision,1-pd.discount()) + else: + _writeProb(file,logBase,precision,pd.prob(k)) + file.write('\t%s'%k) + if k not in ('</s>','<unk>'): + file.write('\t') + _writeProb(file,logBase,precision,alphas[ctxt+(k,)]) + file.write('\n') + else: + ctxtString=' '.join(ctxt) + for k in sorted(pd.samples()): + _writeProb(file,logBase,precision,pd.prob(k)) + file.write('\t%s %s'%(ctxtString,k)) + if alphas is not None: + file.write('\t') + _writeProb(file,logBase,precision,alphas[ctxt+(k,)]) + file.write('\n') + + def __contains__(self, item): + try: + return item in self._model + except: + try: + # hack if model is an MLEProbDist, more efficient + return item in self._model._freqdist + except: + return item in self._model.samples() + + def __getitem__(self, item): + return self._model[item] + + def __repr__(self): + return '<NgramModel with %d %d-grams>' % (self._N, self._n) + +def _writeProb(file,logBase,precision,p): + file.write('%.*g'%(precision, + p if logBase is None else log(p,logBase))) + +def demo(): + from nltk.corpus import brown + from nltk.probability import LidstoneProbDist, WittenBellProbDist + estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) +# estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2) + lm = NgramModel(3, brown.words(categories='news'), estimator) + print lm +# print lm.entropy(sent) + text = lm.generate(100) + import textwrap + print '\n'.join(textwrap.wrap(' '.join(text))) + +if __name__ == '__main__': + demo()
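The class above is exercised by demo() at the bottom of the file. For reference, a minimal sketch of querying a trained model directly; illustrative only, assuming the NLTK-2.x-era dependencies this module imports are available and that the file is importable as ngram, with corpus and query words as placeholders.

# Illustrative sketch only: assumes this file is importable as `ngram` and
# that the NLTK 2.x-era modules it imports are available (Python 2).
from nltk.corpus import brown
from nltk.probability import LidstoneProbDist
from ngram import NgramModel

est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
# train is a list of sentences (list of list of strings), per the docstring
lm = NgramModel(3, brown.sents(categories='news'), estimator=est)

# P(word | two-word context), backing off for unseen trigrams,
# and the same value expressed as a cost in bits
print(lm.prob('said', ('the', 'jury')))
print(lm.logprob('said', ('the', 'jury')))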
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ngram_3.0.py Mon Mar 09 14:58:04 2020 +0000 @@ -0,0 +1,300 @@ +# Natural Language Toolkit: Language Models +# +# Copyright (C) 2001-2014 NLTK Project +# Authors: Steven Bird <stevenbird1@gmail.com> +# Daniel Blanchard <dblanchard@ets.org> +# Ilia Kurenkov <ilia.kurenkov@gmail.com> +# URL: <http://nltk.org/> +# For license information, see LICENSE.TXT +######## Copied from http://www.nltk.org/_modules/nltk/model/ngram.html 2017-01-14 +######## Not actually part of 3.0 release +######## "© Copyright 2015, NLTK Project. Last updated on Feb 26, 2015. Created using Sphinx 1.2.3" +from __future__ import unicode_literals + +from itertools import chain +from math import log + +from nltk.probability import (FreqDist, + ConditionalProbDist, + ConditionalFreqDist, + LidstoneProbDist) +from nltk.util import ngrams +from nltk.model.api import ModelI + +from nltk import compat + + +def _estimator(fdist, *estimator_args, **estimator_kwargs): + """ + Default estimator function using a SimpleGoodTuringProbDist. + """ + # can't be an instance method of NgramModel as they + # can't be pickled either. + return LidstoneProbDist(fdist, *estimator_args, **estimator_kwargs) + + +@compat.python_2_unicode_compatible +class NgramModel(ModelI): + """ + A processing interface for assigning a probability to the next word. + """ + + def __init__(self, n, train, pad_left=True, pad_right=False, + estimator=None, *estimator_args, **estimator_kwargs): + """ + Create an ngram language model to capture patterns in n consecutive + words of training text. An estimator smooths the probabilities derived + from the text and may allow generation of ngrams not seen during + training. + + >>> from nltk.corpus import brown + >>> from nltk.probability import LidstoneProbDist + >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) + >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est) + >>> lm + <NgramModel with 91603 3-grams> + >>> lm._backoff + <NgramModel with 62888 2-grams> + >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', + ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', + ... 'primary', 'election', 'produced', '``', 'no', 'evidence', + ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.']) + ... # doctest: +ELLIPSIS + 0.5776... + + :param n: the order of the language model (ngram size) + :type n: int + :param train: the training text + :type train: list(str) or list(list(str)) + :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings + :type pad_left: bool + :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings + :type pad_right: bool + :param estimator: a function for generating a probability distribution + :type estimator: a function that takes a ConditionalFreqDist and + returns a ConditionalProbDist + :param estimator_args: Extra arguments for estimator. + These arguments are usually used to specify extra + properties for the probability distributions of individual + conditions, such as the number of bins they contain. + Note: For backward-compatibility, if no arguments are specified, the + number of bins in the underlying ConditionalFreqDist are passed to + the estimator as an argument. 
+ :type estimator_args: (any) + :param estimator_kwargs: Extra keyword arguments for the estimator + :type estimator_kwargs: (any) + """ + + # protection from cryptic behavior for calling programs + # that use the pre-2.0.2 interface + assert(isinstance(pad_left, bool)) + assert(isinstance(pad_right, bool)) + + # make sure n is greater than zero, otherwise print it + assert (n > 0), n + + # For explicitness save the check whether this is a unigram model + self.is_unigram_model = (n == 1) + # save the ngram order number + self._n = n + # save left and right padding + self._lpad = ('',) * (n - 1) if pad_left else () + self._rpad = ('',) * (n - 1) if pad_right else () + + if estimator is None: + estimator = _estimator + + cfd = ConditionalFreqDist() + + # set read-only ngrams set (see property declaration below to reconfigure) + self._ngrams = set() + + # If given a list of strings instead of a list of lists, create enclosing list + if (train is not None) and isinstance(train[0], compat.string_types): + train = [train] + + for sent in train: + raw_ngrams = ngrams(sent, n, pad_left, pad_right, pad_symbol='') + for ngram in raw_ngrams: + self._ngrams.add(ngram) + context = tuple(ngram[:-1]) + token = ngram[-1] + cfd[(context, token)] += 1 + + self._probdist = estimator(cfd, *estimator_args, **estimator_kwargs) + + # recursively construct the lower-order models + if not self.is_unigram_model: + self._backoff = NgramModel(n-1, train, + pad_left, pad_right, + estimator, + *estimator_args, + **estimator_kwargs) + + self._backoff_alphas = dict() + # For each condition (or context) + for ctxt in cfd.conditions(): + backoff_ctxt = ctxt[1:] + backoff_total_pr = 0.0 + total_observed_pr = 0.0 + + # this is the subset of words that we OBSERVED following + # this context. + # i.e. Count(word | context) > 0 + for word in self._words_following(ctxt, cfd): + total_observed_pr += self.prob(word, ctxt) + # we also need the total (n-1)-gram probability of + # words observed in this n-gram context + backoff_total_pr += self._backoff.prob(word, backoff_ctxt) + + assert (0 <= total_observed_pr <= 1), total_observed_pr + # beta is the remaining probability weight after we factor out + # the probability of observed words. + # As a sanity check, both total_observed_pr and backoff_total_pr + # must be GE 0, since probabilities are never negative + beta = 1.0 - total_observed_pr + + # backoff total has to be less than one, otherwise we get + # an error when we try subtracting it from 1 in the denominator + assert (0 <= backoff_total_pr < 1), backoff_total_pr + alpha_ctxt = beta / (1.0 - backoff_total_pr) + + self._backoff_alphas[ctxt] = alpha_ctxt + + def _words_following(self, context, cond_freq_dist): + for ctxt, word in cond_freq_dist.iterkeys(): + if ctxt == context: + yield word + + def prob(self, word, context): + """ + Evaluate the probability of this word in this context using Katz Backoff. 
+ + :param word: the word to get the probability of + :type word: str + :param context: the context the word is in + :type context: list(str) + """ + context = tuple(context) + if (context + (word,) in self._ngrams) or (self.is_unigram_model): + return self._probdist.prob((context, word)) + else: + return self._alpha(context) * self._backoff.prob(word, context[1:]) + + def _alpha(self, context): + """Get the backoff alpha value for the given context + """ + error_message = "Alphas and backoff are not defined for unigram models" + assert not self.is_unigram_model, error_message + + if context in self._backoff_alphas: + return self._backoff_alphas[context] + else: + return 1 + + def logprob(self, word, context): + """ + Evaluate the (negative) log probability of this word in this context. + + :param word: the word to get the probability of + :type word: str + :param context: the context the word is in + :type context: list(str) + """ + return -log(self.prob(word, context), 2) + + @property + def ngrams(self): + return self._ngrams + + @property + def backoff(self): + return self._backoff + + @property + def probdist(self): + return self._probdist + + def choose_random_word(self, context): + ''' + Randomly select a word that is likely to appear in this context. + + :param context: the context the word is in + :type context: list(str) + ''' + + return self.generate(1, context)[-1] + + # NB, this will always start with same word if the model + # was trained on a single text + + def generate(self, num_words, context=()): + ''' + Generate random text based on the language model. + + :param num_words: number of words to generate + :type num_words: int + :param context: initial words in generated string + :type context: list(str) + ''' + + text = list(context) + for i in range(num_words): + text.append(self._generate_one(text)) + return text + + def _generate_one(self, context): + context = (self._lpad + tuple(context))[- self._n + 1:] + if context in self: + return self[context].generate() + elif self._n > 1: + return self._backoff._generate_one(context[1:]) + else: + return '.' + + def entropy(self, text): + """ + Calculate the approximate cross-entropy of the n-gram model for a + given evaluation text. + This is the average log probability of each word in the text. + + :param text: words to use for evaluation + :type text: list(str) + """ + + e = 0.0 + text = list(self._lpad) + text + list(self._rpad) + for i in range(self._n - 1, len(text)): + context = tuple(text[i - self._n + 1:i]) + token = text[i] + e += self.logprob(token, context) + return e / float(len(text) - (self._n - 1)) + + def perplexity(self, text): + """ + Calculates the perplexity of the given text. + This is simply 2 ** cross-entropy for the text. + + :param text: words to calculate perplexity of + :type text: list(str) + """ + + return pow(2.0, self.entropy(text)) + + def __contains__(self, item): + return tuple(item) in self._probdist.freqdist + + def __getitem__(self, item): + return self._probdist[tuple(item)] + + def __repr__(self): + return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n) + + +def teardown_module(module=None): + from nltk.corpus import brown + brown._unload() + +if __name__ == "__main__": + import doctest + doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nono.py Mon Mar 09 14:58:04 2020 +0000 @@ -0,0 +1,298 @@ +#!/usr/bin/python3 +# Expects e.g. ^A copy from Nonograms dprint preview cols, then blank line, then rows +# rows are space-separated +# cols are one-digit-after-another, unless some 2-digit, in which case x is separator +# E.g. +# 13x1x2 +# 19 +# maps to +# 13 +# 1 1 +# 2 9 + +import sys + +Red='[31m' +eRed='[39m' +RedFmt=Red+'%s'+eRed + +def interleave(*args): + for vals in zip(*args): + yield from vals + +class Vector(list): + # reads top-to-bottom or left-to-right + def __init__(self,n,m,runs): + list.__init__(self,list(range(n))) + self.n=n + self.runs=runs + # compute the set of all possible layouts for runs + self.rn=len(self.runs) + rtot=sum(self.runs) + self.allRuns=list(self.seedList(0,0,0, + sum(1+self.runs[k] for k in range(self.rn))-1)) + self.nar=len(self.allRuns) + + def seedList(self,i,j,pos,runLen): + """ + :param i: starting skip before next run + :type i: 0 if pos==0 else 1 + :param j: next run number + :type j: int + :param pos: left margin + :type pos: int + """ + bound=self.n-(pos+runLen)+1 + #dprint('s',i,j,pos,runLen,bound) + if j==self.rn: + yield [] + return + r=self.runs[j] + for v in range(i,bound): + for sub in self.seedList(1,j+1,pos+v+r,runLen-(r+1)): + yield [-v,r]+sub + + def __repr__(self): + return "V@%s%s:%s"%(self.x,self.runs,list.__repr__(self)) + + def __str__(self): + return '%s|'%('|'.join(str(c) for c in self)) + + def step(self): + scratch=[0 if c.val is None else c.val for c in self] + for k,runs in enumerate(self.allRuns): + dprint('=====pass %s======'%k) + self.onepass(0,self.n,scratch,runs.copy()) + dprint(scratch) + for i in range(self.n): + if scratch[i]==self.nar: + # If blobby in _every_ pass, then must be a blob + if self[i].val is None: + self[i].setVal(True) + elif self[i].val is True: + # already there + pass + else: + print("Shouldn't happen: attempt to blob where x already present! 
%s at %s"%(self,i),file=sys.stderr) + exit(101) + + def onepass(self,i0,iBound,scratch,stack): + """note that stack is not a simple run, but one with _negative_ numbers between + and possibly before the positive ones, indicating obligatory skips + """ + i=i0 # starting index into self/scratch/maybe + j=-1 # index into run + maybe=[0]*iBound + dprint('r: %s'%stack) + req=sum((-r if r<0 else r) for r in stack) + while stack and i<iBound: + r=rr=stack.pop(0) + dprint('pop:',r) + if r<1: + # obligatory skip + # (Above init of self.allRuns is easier if we allow a 0 to be ignored + i-=r + req+=r + r=rr=stack.pop(0) + # rr is run remaining -- how many we still need + j+=1 # index of current run in self.runs, we'll need to decorate that eventually + inOne=-1 # if non-neg, records the start point of a possible run + gapsFilled=0 + # First, check if we can start here: 0 is OK, and n>0 iff n-1 is None or False + if i>0 and i<iBound: + while self[i-1].val: + i+=1 + if (iBound-i)<req: + # Can't win, give up altogether + dprint('c0',i,iBound,req) + return + while i<iBound: + c=self[i].val + dprint('top',i,c,inOne,rr) + if c is None: + # we could add a blob here + dprint('c1') + gapsFilled+=1 + rr-=1 + if inOne<0: + dprint('c1a',i) + # starts here + inOne=i + # fall through to check for completion + else: + dprint('c2') + # c is a bool + if inOne<0: + dprint('c2a') + if c: + dprint('c2a1') + # a *, we can possible start something here + inOne=i + rr-=1 + # fall through to check for completion + else: + dprint('c2a2') + # an x, can't start here, just move along + i+=1 + continue + else: + dprint('c2b') + if c: + dprint('c2b1') + # a blob, extend or complete a partial + rr-=1 + # fall through to check for completion + else: + # abandon a partial + dprint('c2b2') + inOne=-1 + rr=r + i+=1 + continue + if rr>0: + dprint('c3') + # we're not done, carry on + i+=1 + continue + # Maybe a win? + # look ahead, can we stop here? + # NB _self_.n + if i+1<self.n and self[i+1].val: + dprint('c4') + # Nope + inOne=-1 + rr=r + gapsFilled=0 + i+=1 + continue + elif gapsFilled==0: + dprint('c5') + # We must have crossed at least on gap... + print("Shouldn't happen: no gap! me:%s i:%s j:%s rr:%s inOne:%s"%(self,i, j, rr, inOne),file=sys.stderr) + exit(100) + # Victory! + dprint('c6',r,inOne,i) + for k in range(inOne,i+1): + maybe[k]+=1 + i+=1 + req-=r + break + # on to the next run + # end of inner loop, did we win? 
+ if (not stack) or i==iBound: + # yes + dprint('win:',maybe) + for k in range(iBound): + scratch[k]+=maybe[k] + +class Row(Vector): + def __init__(self,n,m,runs,pos,dprintWidth): + Vector.__init__(self,n,m,runs) + self.y=pos + self.dprintWidth=dprintWidth + self.fmt="%%%ss|"%dprintWidth + + def __str__(self): + return ((self.fmt%(' '.join(str(r) for r in self.runs)))+ + Vector.__str__(self)) + +class Column(Vector): + def __init__(self,n,m,runs,pos,dprintHeight): + Vector.__init__(self,n,m,runs) + self.x=pos + self.dprintHeight=dprintHeight + self.fmt="%%%ss"%self.dprintHeight + self.updateHeader() + + def updateHeader(self): + header=('-'.join(str(c) for c in self.runs)) + self.header=self.fmt%header # pad to same 'height' + +class Cell: + def __init__(self,row,y,column,x): + # At the intersection of row and column Vectors + self.row=row + self.column=column + self.x=x + self.y=y + self.val=None # three valued: None(unknown), True(filled), False(empty) + self.row[x]=self + self.column[y]=self + + def __repr__(self): + return "C@(%s,%s):%s"%(self.x,self.y,self.val) + + def __str__(self): + return ' ' if self.val is None else ('\u25A0' if self.val else 'x') + + def setVal(self,v): + if v is True: + if self.val is False: + dprint("Warning: x -> * at %s,%s"%(self.x,self.y)) + elif self.val is True: + # No-op + return + # @@ check row/col completed + else: + if self.val is not None: + dprint("Warning: %s -> %s at %s,%s"%(self.val,v,self.x,self.y)) + self.val=v + +class Nono(dict): + # 0,0 is upper left, so increasing y goes _downwards_, to match the standard layout + def __init__(self,rows,cols): + n=self.n=len(cols) + if n!=len(rows): + print("losing r:%s x c:%s"%(len(rows),n),sys.stderr) + exit(1) + self.rc=rows + rowDprintWidth=max(sum(len(str(r)) for r in row)+len(row)-1 for row in rows) + self.rowfmt="%s|%%s"%(' '*rowDprintWidth) + self.cc=cols + # dprint col nums>9 vertically :-( + self.colDprintHeight=max(sum(len(str(c)) for c in col)+len(col)-1 for col in cols) + self.columns=cc=[Column(n,self,cols[i],i,self.colDprintHeight) for i in range(20)] + self.rows=rr=[Row(n,self,rows[i],i,rowDprintWidth) for i in range(20)] + for x in range(20): + for y in range(20): + self[(x,y)]=Cell(rr[y],y,cc[x],x) + + def __str__(self): + lines=[self.rowfmt%('|'.join([(self.columns[i]).header[j] for i in range(self.n)])) # 'rotate' + for j in range(self.colDprintHeight)] + lines+=[str(r) for r in self.rows] + return "\n".join(lines) + +def dprint(*args): + pass + +if __name__ == '__main__': + if len(sys.argv)>1: + f=open(sys.argv[1]) + else: + f=sys.stdin + + cols=[] + + for l in f: + l=l.rstrip() + if l=='': + break + if 'x' in l: + vv=[int(s) for s in l.split('x')] + else: + vv=[int(c) for c in l] + cols.append(vv) + + rows=[[int(s) for s in l.split()] for l in f] + + solver=Nono(rows,cols) + print(solver) + for c in solver.columns: + c.step() + print() + print(solver) + for r in solver.rows: + r.step() + print() + print(solver)
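The column-spec convention described in the header comment above corresponds to the two parsing branches in the __main__ block; a worked example, using the values from the comment itself:

# Worked example of the column-spec parsing described in the header comment:
# 'x'-separated specs may contain multi-digit runs, otherwise one digit per run.
for spec in ("13x1x2", "19"):
    if 'x' in spec:
        runs = [int(s) for s in spec.split('x')]
    else:
        runs = [int(c) for c in spec]
    print(spec, '->', runs)
# prints:
#   13x1x2 -> [13, 1, 2]
#   19 -> [1, 9]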
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pdfCrawl.py Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,24 @@
+import PyPDF2 as pyPdf, sys
+
+f = open(sys.argv[1],'rb')
+
+pdf = pyPdf.PdfFileReader(f)
+pgs = pdf.getNumPages()
+key = '/Annots'
+uri = '/URI'
+ank = '/A'
+
+#print pdf.getNamedDestinations()
+
+for pg in range(pgs):
+    print '#',pg
+    p = pdf.getPage(pg)
+    o = p.getObject()
+    #print >>sys.stderr,o
+    if o.has_key(key):
+        ann = o[key]
+        #print >>sys.stderr,key,ann
+        for a in ann:
+            u = a.getObject()
+            if u[ank].has_key(uri):
+                print "U",u[ank][uri]
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ptrace.py Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,23 @@
+#!/usr/bin/python
+# usage: ptrace.py TRACE [result of nm xemacs | egrep '[$_]']
+import sys
+symfile=open(sys.argv[2])
+syms={}
+for l in symfile:
+    (addr,rest)=l.rstrip().split(' ',1)
+    syms[addr]=rest
+symfile.close()
+trfile=open(sys.argv[1])
+
+for l in trfile:
+    (what,rest)=l.rstrip().split(' ',1)
+    if what in ('incipit','exit','p'):
+        print l.rstrip()
+        continue
+    (where,when)=rest.split()
+    try:
+        (z,b)=where.split('x')
+        print "%8s %s %s"%(what,syms["00"+b],when)
+    except:
+        print "not hex: ",l.rstrip()
+trfile.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/signif.py Mon Mar 09 14:58:04 2020 +0000 @@ -0,0 +1,162 @@ +from nltk import FreqDist +from random import randint +import pylab +from math import sqrt + +def mean(self): + # Assumes the keys of this distribution are numbers! + return float(sum(v*self[v] for v in self.keys()))/self.N() + +FreqDist.mean=mean + +def bell(self,maxVal=None,bars=False,**kwargs): + # Assumes the keys of this distribution are numbers! + if maxVal is not None: + sk = sorted([k for k in self.keys() if k<=maxVal]) # range(max(self.keys())+1) + else: + sk=sorted(self.keys()) + print len(sk) + #sk.append(sk[-1]+1) + #sk[0:0]=[(sk[0]-1)] + mm=0 # sk[0] + mean = self.mean() + tot = 0 + ssd = 0 + for v in self.keys(): + d = v-mean + ssd+=d*d*self[v] + sd=sqrt(ssd/float(self.N())) + #print (mean,sd) + kv=[self[k] for k in sk] + pylab.figure().subplots_adjust(bottom=0.15) + pylab.plot(sk,kv,color='blue') + if kwargs['xtra']: + xtra=kwargs['xtra'] + pylab.plot(sk,[xtra[k] for k in sk],color='red') + if bars: + pylab.bar([s-mm for s in sk],kv, + align='center',color='white',edgecolor='pink') + pylab.xticks(sk,rotation=90) + mv=self[self.max()] + bb=(-mv/10,mv+(mv/10)) + pylab.plot((mean-mm,mean-mm),bb, + (mean-mm-sd,mean-mm-sd),bb, + (mean-mm-(2*sd),mean-mm-(2*sd)),bb, + (mean-mm+sd,mean-mm+sd),bb, + (mean-mm+(2*sd),mean-mm+(2*sd)),bb, + color='green') + pylab.xlabel("N %s, max %s\nmean %5.2f, s.d. %5.2f"%(self.N(),mv,mean, sd)) + pylab.show() + +FreqDist.bell=bell + +def ranks(l,**kvargs): + # compute the rank of every element in a list + # uses sort, passing on all kv args + # uses key kv arg itself + # _Very_ inefficient, in several ways! + # Result is a pair: + # list of ranks + # list of tie information, each elt the magnitude of a tie group + s=sorted(l,**kvargs) + i=0 + res=[] + td=[] + if kvargs.has_key('key'): + kf=kvargs['key'] + else: + kf=lambda x:x + while i<len(l): + ties=[x for x in s if kf(s[i])==kf(x)] + if len(ties)>1: + td.append(len(ties)) + r=float(i+1+(i+len(ties)))/2.0 + for e in ties: + res.append((r,e)) + i+=1 + return (res,td) + +def mannWhitneyU(fd1,fd2,forceZ=False): + # Compute Mann Whitney U test for two frequency distributions + # For n1 and n2 <= 20, see http://www.soc.univ.keiv.ua/LIB/PUB/T/textual.pdf + # to look up significance levels on the result: see Part 3 section 10, + # actual page 150 (printed page 144) + # Or use http://faculty.vassar.edu/lowry/utest.html to do it for you + # For n1 and n2 > 20, U itself is normally distributed, we + # return a tuple with a z-test value + # HST DOES NOT BELIEVE THIS IS CORRECT -- DOES NOT APPEAR TO GIVE CORRECT ANSWERS!! + r1=[(lambda x:x.append(1) or x)(list(x)) for x in fd1.items()] + r2=[(lambda x:x.append(2) or x)(list(x)) for x in fd2.items()] + n1=len(r1) + n2=len(r2) + (ar,ties)=ranks(r1+r2,key=lambda e:e[1]) + s1=sum(r[0] for r in ar if r[1][2] is 1) + s2=sum(r[0] for r in ar if r[1][2] is 2) + u1=float(n1*n2)+(float(n1*(n1+1))/2.0)-float(s1) + u2=float(n1*n2)+(float(n2*(n2+1))/2.0)-float(s2) + u=min(u1,u2) + if forceZ or n1>20 or n2>20: + # we can treat U as sample from a normal distribution, and compute + # a z-score + # See e.g. 
http://mlsc.lboro.ac.uk/resources/statistics/Mannwhitney.pdf + mu=float(n1*n2)/2.0 + if len(ties)>0: + n=float(n1+n2) + ts=sum((float((t*t*t)-t)/12.0) for t in ties) + su=sqrt((float(n1*n2)/(n*n-1))*((float((n*n*n)-n)/12.0)-ts)) + else: + su=sqrt(float(n1*n2*(n1+n2+1))/12.0) + z=(u-mu)/su + return (n1,n2,u,z) + else: + return (n1,n2,u) + +# This started from http://dr-adorio-adventures.blogspot.com/2010/05/draft-untested.html +# but has a number of bug fixes +def Rank(l,**kvargs): + # compute the rank of every element in a list + # uses sort, passing on all kv args + # uses key kv arg itself + # _Very_ inefficient, in several ways! + # Result is a list of pairs ( r, v) where r is a rank and v is an input value + s=sorted(l,**kvargs) + i=0 + res=[] + if kvargs.has_key('key'): + kf=kvargs['key'] + else: + kf=lambda x:x + while i<len(l): + ties=[x for x in s if kf(s[i])==kf(x)] + r=float(i+1+(i+len(ties)))/2.0 + #print (i,r,ties) + for e in ties: + res.append((r,e)) + i+=1 + return (res) + +def mannWhitney(S1, S2): + """ + Returns the Mann-Whitney U statistic of two samples S1 and S2. + """ + # Form a single array with a categorical variable indicate the sample + X = [(s, 0) for s in S1] + X.extend([(s,1) for s in S2]) + R = Rank(X,key=lambda x:x[0]) + + # Compute needed parameters. + n1 = float(len(S1)) + n2 = float(len(S2)) + + # Compute total ranks for sample 1. + R1 = sum([i for i, (x,j) in R if j == 0]) + R2 = sum([i for i, (x,j) in R if j == 1]) + u1 = (n1*n2)+((n1*(n1+1))/2.0)-R1 + u2 = n1 * n2 - u1 + U = min(u1, u2) + #print u1,R1/n1,R2/n2 + + mU = n1 * n2 / 2.0 + sigmaU = sqrt((n1 * n2 * (n1 + n2 + 1))/12.0) + return u1, R1/n1,R2/n2, (U-mU)/sigmaU +
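For reference, a minimal call to mannWhitney above; the sample values are invented, and the example assumes this file is importable as signif in the Python 2 environment it targets.

# Illustrative only: tiny invented samples, assuming this file is importable
# as `signif` (Python 2, as written above).
from signif import mannWhitney

S1 = [1.2, 3.4, 2.2, 5.0]
S2 = [0.8, 1.1, 2.0, 1.5]
u1, meanRank1, meanRank2, z = mannWhitney(S1, S2)
print("U1=%s  mean rank 1=%s  mean rank 2=%s  z=%s" % (u1, meanRank1, meanRank2, z))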
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/strace_summarise.py Mon Mar 09 14:58:04 2020 +0000 @@ -0,0 +1,83 @@ +#!/usr/bin/python +#---------------------------------------------------------------------- +# Description : Simplify strace output to allow for easier diffing. +# Author : James Hunt <james.hunt@ubuntu.com> +# Date : 24 July 2012 +#---------------------------------------------------------------------- +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2, as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +#---------------------------------------------------------------------- + +import os +import re +import sys +import string + +pids = {} + +def process_data(fh): + lines = fh.readlines() + possible_pid = 0 + using_pids = 0 + pid = 0 + pid_count = 1 + + line_num = 0 + for line in lines: + + line = line.strip() + line_num += 1 + fields = line.split() + if line_num % 10000 == 0: + print >> sys.stderr,line_num,len(fields),fields[1] + if len(fields) > 0: + result = re.match("\(?(\d{4,8})\)?", fields[1]) + #print >> sys.stderr,result.group(),result.group(1) + if result and result.group(1): + pid = result.group(1) + if pid in pids: + line = re.sub("(\\b"+pid+"\\b)", pids[pid], line) + else: + pid_name = "PID%d" % pid_count + line = re.sub("(\\b"+pid+"\\b)", pid_name, line) + pids[pid] = pid_name + pid_count += 1 + + # handle addresses (up to 64-bit) + line = re.sub("0x0{1,16}", "0xNULL", line) + line = re.sub("0x[0-9A-Fa-f]{1,16}", "0xADDR", line) + + # handle timestamps + line = re.sub("\d{2}:\d{2}:\d{2}", "HH:MM:SS", line) + line = re.sub("\d{4}/\d{2}/\d{2}", "YYYY/MM/DD", line) + + print line + + +def main(): + try: + script = sys.argv[0] + file1 = sys.argv[1] + except: + sys.exit("ERROR: usage: %s <file1> " % script) + + try: + fh1 = open(file1) + except: + sys.exit("ERROR: unable to open file '%s'" % file1) + + process_data(fh1) + print >>sys.stderr,pids + +if __name__ == "__main__": + main()
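As a quick illustration of the address and timestamp normalisation that process_data applies, using the same substitution patterns; the strace line below is a made-up sample.

# Quick illustration of the address/timestamp normalisation above;
# the strace line used here is an invented sample.
import re

line = "12345 14:58:04 mmap(0x7f3a12345000, 4096, ...) = 0x7f3a12345000"
line = re.sub(r"0x0{1,16}", "0xNULL", line)
line = re.sub(r"0x[0-9A-Fa-f]{1,16}", "0xADDR", line)
line = re.sub(r"\d{2}:\d{2}:\d{2}", "HH:MM:SS", line)
print(line)   # 12345 HH:MM:SS mmap(0xADDR, 4096, ...) = 0xADDR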
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/twitter.py Mon Mar 09 14:58:04 2020 +0000 @@ -0,0 +1,113 @@ +from nltk.corpus.reader.plaintext import PlaintextCorpusReader +from nltk.corpus.reader import RegexpTokenizer +from nltk.tokenize import LineTokenizer +from nltk.corpus.reader.util import read_line_block +from nltkx.model import NgramModel +from nltk import ConditionalFreqDist, ngrams,\ + chain, ConditionalProbDist, WittenBellProbDist, FreqDist +import types + +xtwc=PlaintextCorpusReader("/group/ltg/projects/fnlp/", + r'2.*\.txt', + word_tokenizer=RegexpTokenizer(r'(http|ftp|mailto)://[^\s]+|[\w#@]+|[^\w\s]+'), + sent_tokenizer=LineTokenizer(), + para_block_reader=read_line_block) + +def discount(self): + return float(self._N)/float(self._N + self._T) + +def isclose(a, b, rel_tol=1e-09, abs_tol=0.0): + # http://stackoverflow.com/a/33024979 + return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) + +def check(self): + totProb=sum(self.prob(sample) for sample in self.samples()) + assert isclose(self.discount(),totProb),\ + "discount %s != totProb %s"%(self.discount(),totProb) + + +WittenBellProbDist.discount = discount +WittenBellProbDist.check = check + +def _estimator(fdist, bins): + """ + Default estimator function using WB. + """ + # can't be an instance method of NgramModel as they + # can't be pickled either. + res=WittenBellProbDist(fdist,fdist.B()+1) + res.check() + return res + +class LgramModel(NgramModel): + def __init__(self, n, train, pad_left=False, pad_right=False, + estimator=None, *estimator_args, **estimator_kwargs): + """ + Same as NgramModel (q.v.), but with a WittenBell default estimator + """ + if estimator is None: + assert (not(estimator_args)) and (not(estimator_kwargs)),\ + "estimator_args (%s) or _kwargs (%s) supplied, but no estimator"%(estimator_args,estimator_kwargs) + estimator=_estimator + super(LgramModel,self).__init__(n, train, pad_left, pad_right, + estimator, + *estimator_args, **estimator_kwargs) + +from nltk.probability import _get_kwarg +try: + from nltk.probability import islice +except: + from nltk.util import islice + +def plotSorted(self, *args, **kwargs): + """ + Plot samples from the frequency distribution, + sorted using a supplied key function. If an integer + parameter is supplied, stop after this many samples have been + plotted. If two integer parameters m, n are supplied, plot a + subset of the samples, beginning with m and stopping at n-1. + For a cumulative plot, specify cumulative=True. + (Requires Matplotlib to be installed.) + + :param title: The title for the graph + :type title: str + :param key: a function to pass to sort to extract the sort key + given an FD and a sample id. + Defaults to the value of that sample's entry, + lambda fd,s:fd[s] + :type key: function + :param reverse: True to sort high to low + :type reverse: bool + """ + try: + import pylab + except ImportError: + raise ValueError('The plot function requires the matplotlib package (aka pylab). ' + 'See http://matplotlib.sourceforge.net/') + + if len(args) == 0: + args = [len(self)] + + keyFn = _get_kwarg(kwargs, 'key', lambda fd,s:fd[s]) + reverse = _get_kwarg(kwargs, 'reverse', False) + + samples = list(islice(self, *args)) + samples.sort(key=lambda x:keyFn(self,x),reverse=reverse) + + freqs = [self[sample] for sample in samples] + ylabel = "Counts" + # percents = [f * 100 for f in freqs] only in ProbDist? 
+ + pylab.grid(True, color="silver") + if not "linewidth" in kwargs: + kwargs["linewidth"] = 2 + if "title" in kwargs: + pylab.title(kwargs["title"]) + del kwargs["title"] + pylab.plot(freqs, **kwargs) + pylab.xticks(range(len(samples)), [unicode(s) for s in samples], rotation=90) + pylab.xlabel("Samples") + pylab.ylabel(ylabel) + pylab.show() + +FreqDist.plotSorted=plotSorted
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/update.py Mon Mar 09 14:58:04 2020 +0000 @@ -0,0 +1,70 @@ +#!/bin/python +from sys import stdin,argv +from urllib2 import Request,urlopen, HTTPError +from base64 import b64encode +type='xyzzy' +l='' +year='2014' + +def cc(names): + return ' '.join(map(lambda n:n[0]+n[1:].lower(),names.split())) + +cm={'tf':0, + 'uun':1, + 'surname':2, + 'cat':3, + 'stat':4, + 'email':5, + 'country':6, + 'pgm':7, + 'entry':8, + 'forename':9} + +eargs=['uun'] +eargs.extend(argv[1:]) + +while l=='': + l=stdin.readline().rstrip() +if l!="\tPG Applications List for ILCC": + print "Not what I was expecting: %s"%l + exit(1) +for l in stdin: + l=l.rstrip().decode('latin-1'); + if l=='': + continue + if l.find('Count:')==0: + exit() + if l.find('Pgm Code')==0: + continue + if l.find('Entry')==len(l)-5: + continue + if l.find(type)==0: + continue + if l=='': + continue + try: #tf,uun,name,cat,stat,email,country,pgm,entry + vals=l.split("\t") + except ValueError: + print "Bad input: %s"%l + continue + if vals[0]!='': + type=vals[0] + try: + (sn,fn)=vals[2].split(", ") + except ValueError: + print "Bad input: %s"%l + exit + vals[2]=cc(sn) + vals.append(cc(fn)) + attrs=" ".join(map(lambda n:'%s="%s"'%(n,vals[cm[n]]),eargs)) + req='<update year="%s" %s/>'%(year,attrs) + print req + r=Request("http://localhost:8080/exist/apps/phd/updateApp.xq", + req.encode('utf-8'),headers={'Content-Type':'application/xml;charset=UTF-8'}) + try: + res=urlopen(r) + except HTTPError as err: + print "Error:",err.read() + print req + exit(1) + print res.read()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/withdraw.py Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,38 @@
+#!/bin/python
+from sys import stdin,argv
+from urllib2 import Request,urlopen, HTTPError
+from base64 import b64encode
+type='xyzzy'
+l=''
+year='2014'
+
+def cc(names):
+    return ' '.join(map(lambda n:n[0]+n[1:].lower(),names.split()))
+
+eargs=['uun']
+eargs.extend(argv[1:])
+
+while l=='':
+    l=stdin.readline().rstrip()
+for l in stdin:
+    l=l.rstrip().decode('latin-1');
+    if l=='':
+        continue
+    try: #uun,...
+        vals=l.split("\t")
+    except ValueError:
+        print "Bad input: %s"%l
+        continue
+    attrs=" ".join(map(lambda (n,v):'%s="%s"'%(n,v),zip(eargs,vals)))
+    req='<update year="%s" %s/>'%(year,attrs)
+    print req
+    continue
+    r=Request("http://localhost:8080/exist/apps/phd/updateApp.xq",
+              req.encode('utf-8'),headers={'Content-Type':'application/xml;charset=UTF-8'})
+    try:
+        res=urlopen(r)
+    except HTTPError as err:
+        print "Error:",err.read()
+        print req
+        exit(1)
+    print res.read()