changeset 0:fee51ab07d09

blanket publication of all existing python files in lib/python on maritain
author Henry S. Thompson <ht@inf.ed.ac.uk>
date Mon, 09 Mar 2020 14:58:04 +0000
parents
children 0a3abe59e364
files bobi.py boxi.py decrypt.py modify.py nag.py ngram.py ngram_3.0.py nono.py pdfCrawl.py ptrace.py signif.py strace_summarise.py twitter.py update.py withdraw.py
diffstat 15 files changed, 1684 insertions(+), 0 deletions(-)
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bobi.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,74 @@
+#!/bin/python
+from sys import stdin
+from urllib2 import Request,urlopen, HTTPError
+
+l=''
+year='2015'
+uuns={}
+
+def cc(names):
+  return ' '.join(map(lambda n:n[0]+n[1:].lower(),names.split()))
+
+while l=='':
+  l=stdin.readline().rstrip()
+oldf=l.find("ILCC PGR live applications: ")==0
+if ((not oldf) and
+    l.find("EUCLID Admissions core applicant data - applications for next session")!=0 and
+    l.find("HST PGR applicants incl Beihang")!=0):
+  print "Not what I was expecting: %s"%l
+  exit(1)
+while l.find('UUN\t')<0:
+  l=stdin.readline().rstrip()
+for l in stdin:
+  l=l.rstrip().decode('latin-1');
+  if l=='':
+    continue
+  
+  ff=l.split('\t')
+  if oldf:
+    if len(ff)==9:
+      (uun,surname,forenames,cat,stat,dec,entry,email,country)=ff
+      pgrm='PRPHDLNGCC1F'
+      nat='Unknown'
+    else:
+      print "Bad old-style input: %s"%('|'.join(l.split('\t')))
+      continue
+  else:
+    if len(ff)==11:
+      (nat,stat,pgrm,dec,surname,forenames,entry,cat,uun,email,country)=ff
+    else:
+      print "Bad new-style input: %s"%('|'.join(l.split('\t')))
+      continue
+  if uun in uuns:
+    print "!!! Careful !!!: %s seen before today with status %s, now %s"%(uun,uuns[uun],stat)
+  else:
+    uuns[uun]=stat
+  surname=cc(surname)
+  forenames=cc(forenames)
+  if pgrm=='PRPHDLNGCC1F':
+    ptype='I'
+  else:
+    ptype='B'
+  req='<app year="%s" uun="%s" type="PHD %s" surname="%s" forenames="%s" cat="%s" stat="%s" decision="%s" pgm="PhD ILCC" entry="%s" email="%s" country="%s" nationality="%s"/>'%(year,uun,ptype,surname,forenames,cat,stat,dec,entry,email,country,nat)
+  #print req.encode('iso-8859-1')
+  #continue
+  r=Request("http://localhost:8080/exist/apps/phd/new-app-maybe.xq",
+            req.encode('utf-8'),headers={'Content-Type':'application/xml;charset=UTF-8'})
+  try:
+    res=urlopen(r)
+  except HTTPError as err:
+    print "Error:",err.read()
+    print req
+    exit(1)
+  res=res.read()
+  print ptype,res
+  if (not oldf) and res.find("<div>We already")==0:
+    req='<update year="%s" uun="%s" nationality="%s"/>'%(year,uun,nat)
+    r=Request("http://localhost:8080/exist/apps/phd/updateApp.xq",
+              req.encode('utf-8'),headers={'Content-Type':'application/xml;charset=UTF-8'})
+    try:
+      res=urlopen(r)
+    except HTTPError as err:
+      print "Error:",err.read()
+      print req
+      exit(1)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/boxi.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,57 @@
+#!/bin/python
+from sys import stdin
+from urllib2 import Request,urlopen, HTTPError
+from base64 import b64encode
+import re
+type='xyzzy'
+l=''
+year='2014'
+tsplit=re.compile('\t\t*')
+
+def cc(names):
+  return ' '.join(map(lambda n:n[0]+n[1:].lower(),names.split()))
+
+while l=='':
+  l=stdin.readline().rstrip()
+if l!="\t\tPG Applications List for ILCC+Henry S. Thompson+ht+Informatics_Amended : 456344":
+  print "Not what I was expecting: %s"%l
+  exit(1)
+for l in stdin:
+  l=l.rstrip().decode('latin-1');
+  if l=='':
+    continue
+  if l.find('Count:')==0:
+    exit()
+  if l.find('Pgm Code')==0:
+    continue
+  if l.find(type)==0:
+    continue
+  
+  ff=l.split('\t')
+  if len(ff)==9:
+    (tf,uun,name,cat,stat,email,country,pgm,entry)=ff
+  elif len(ff)==8:
+    (tf,uun,name,cat,stat,email,pgm,entry)=ff
+    country=""
+  else:
+    print "Bad input: %s"%('|'.join(l.split('\t')))
+    continue
+  if tf!='':
+    type=tf
+  #if stat not in ('SD','SP'):
+  #  continue
+  (sn,fn)=name.split(", ")
+  surname=cc(sn)
+  forenames=cc(fn)
+  req='<app year="%s" uun="%s" type="%s" surname="%s" forenames="%s" cat="%s" stat="%s" pgm="%s" entry="%s" email="%s" country="%s"/>'%(year,uun,type,surname,forenames,cat,stat,pgm,entry,email,country)
+  #print req.encode('iso-8859-1')
+  #continue
+  r=Request("http://localhost:8080/exist/apps/phd/new-app-maybe.xq",
+            req.encode('utf-8'),headers={'Content-Type':'application/xml;charset=UTF-8'})
+  try:
+    res=urlopen(r)
+  except HTTPError as err:
+    print "Error:",err.read()
+    print req
+    exit(1)
+  print res.read()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/decrypt.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+# decrypt tp-link config.bin file
+# coded by root@kev7n.com
+
+from Crypto.Cipher import DES
+from hashlib import md5
+import sys
+
+# backup your config.bin from 192.168.x.1
+# usage find PPPOE account
+# ./run.py wan_ppp_usr
+# keys: wan_ppp_usr,wan_ppp_pwd
+
+
+key = '\x47\x8D\xA5\x0B\xF9\xE3\xD2\xCF'
+crypto = DES.new(key, DES.MODE_ECB)
+
+if len(sys.argv)>1:
+  f=sys.argv[1]
+else:
+  f='conf.bin'
+
+data = open(f, 'rb').read()
+data_decrypted = crypto.decrypt(data).rstrip('\0')
+assert data_decrypted[:16] == md5(data_decrypted[16:]).digest()
+data_decrypted_finally = data_decrypted[16:]
+data_decrypted_dict = {}
+data_decrypted_array = data_decrypted_finally.split('\r\n')
+for item in data_decrypted_array:
+    if not item:
+        continue
+    item_array = item.split(' ', 1)
+    item_key = item_array[0]
+    item_value = item_array[1]
+    data_decrypted_dict[item_key] = item_value
+
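+# data_decrypted_dict maps config keys (e.g. wan_ppp_usr) to their values; a
+# hypothetical lookup matching the usage note above (the script as written just
+# dumps the whole decrypted config):
+#   if len(sys.argv) > 2:
+#       print data_decrypted_dict.get(sys.argv[2], '')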
+sys.stdout.write(data_decrypted_finally)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/modify.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,36 @@
+#!/bin/python
+# Usage: modify.py uun fields...
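+# e.g. (sketch): modify.py stat decision < changes.tsv, where each stdin line is
+#   UUN<TAB>stat-value<TAB>decision-value  (field names are taken from argv)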
+from sys import stdin,argv
+from urllib2 import Request,urlopen, HTTPError
+
+l=''
+year='2014'
+uuns={}
+
+def cc(names):
+  return ' '.join(map(lambda n:n[0]+n[1:].lower(),names.split()))
+
+eargs=['uun']
+eargs.extend(argv[1:])
+
+for l in stdin:
+  l=l.rstrip().decode('latin-1');
+  if l=='':
+    continue
+  try: #uun,...
+    vals=l.split("\t")
+  except ValueError:
+    print "Bad input: %s"%l
+    continue
+  attrs=" ".join(map(lambda (n,v):'%s="%s"'%(n,v),zip(eargs,vals)))
+  req='<update year="%s" %s/>'%(year,attrs)
+  print req
+  r=Request("http://localhost:8080/exist/apps/phd/updateApp.xq",
+            req.encode('utf-8'),headers={'Content-Type':'application/xml;charset=UTF-8'})
+  try:
+    res=urlopen(r)
+  except HTTPError as err:
+    print "Error:",err.read()
+    print req
+    exit(1)
+  print res.read()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nag.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,24 @@
+#!/bin/python
+# Create a Clockwork object using your API key
+from clockwork import clockwork
+from sys import stdin
+from rfc822 import Message
+
+msg=Message(stdin,False)
+
+frm=msg.get('from')
+if (frm!="nagios@nagios2.skywalker.privatedns.com" and frm!='"Henry S. Thompson" <ht@inf.ed.ac.uk>'):
+  print "SMS not from nagios: %s"%frm
+  exit(1)
+
+api = clockwork.API("0a778e372c3582eeef36b5f7f580113067e82d76")
+message = clockwork.SMS( to = "447866471388",
+                         message = msg.fp.read(),
+                         from_name="Nagios")
+response = api.send(message)
+
+if response.success:
+  print "SMS sent %s"%response.id
+else:
+  print "SMS failed %s: %s"%(response.error_code,response.error_description)
+  exit(2)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngram.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,345 @@
+# Natural Language Toolkit: Language Models
+#
+# Copyright (C) 2001-2009 NLTK Project
+# Author: Steven Bird <sb@csse.unimelb.edu.au>
+# URL: <http://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+import random, types
+from itertools import chain
+from math import log
+
+from nltk.probability import (ConditionalProbDist, ConditionalFreqDist,
+                              MLEProbDist, FreqDist)
+try:
+    from nltk.util import ingrams
+except:
+    from nltkx.util import ingrams
+
+from api import *
+
+class NgramModel(ModelI):
+    """
+    A processing interface for assigning a probability to the next word.
+    """
+
+    def __init__(self, n, train, pad_left=False, pad_right=False,
+                 estimator=None, *estimator_args, **estimator_kwargs):
+        """
+        Creates an ngram language model to capture patterns in n consecutive
+        words of training text.  An estimator smooths the probabilities derived
+        from the text and may allow generation of ngrams not seen during
+        training.
+
+        @param n: the order of the language model (ngram size)
+        @type n: C{int}
+        @param train: the training text
+        @type train: C{list} of C{list} of C{string} 
+        @param estimator: a function for generating a probability distribution
+        @type estimator: a function that takes a C{ConditionalFreqDist} and
+              returns a C{ConditionalProbDist}
+        @param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
+        @type pad_left: bool
+        @param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
+        @type pad_right: bool
+        @param estimator_args: Extra arguments for estimator.
+            These arguments are usually used to specify extra
+            properties for the probability distributions of individual
+            conditions, such as the number of bins they contain.
+            Note: For backward-compatibility, if no arguments are specified, the
+            number of bins in the underlying ConditionalFreqDist are passed to
+            the estimator as an argument.
+        @type estimator_args: (any)
+        @param estimator_kwargs: Extra keyword arguments for the estimator
+        @type estimator_kwargs: (any)
+        """
+        # protection from cryptic behavior for calling programs
+        # that use the pre-2.0.2 interface
+        assert(isinstance(pad_left, bool))
+        assert(isinstance(pad_right, bool))
+
+        self._n = n
+        self._W = len(train)
+        self._lpad = ('<s>',) * (n - 1) if pad_left else ()
+        # Need _rpad even for unigrams or padded entropy will give
+        #  wrong answer because '' will be treated as unseen...
+        self._rpad = ('</s>',) * (max(1,(n - 1))) if pad_right else ()
+        self._padLen = len(self._lpad)+len(self._rpad)
+
+        self._N=0
+        delta = 1+self._padLen-n        # len(sent)+delta == ngrams in sent
+
+        if estimator is None:
+            assert (not estimator_args) and (not estimator_kwargs),\
+                   "estimator_args or _kwargs supplied, but no estimator"
+            estimator = lambda fdist, bins: MLEProbDist(fdist)
+
+        # Given backoff, a generator isn't acceptable
+        if isinstance(train,types.GeneratorType):
+          train=list(train)
+
+        if n == 1:
+            if pad_right:
+                sents=(chain(s,self._rpad) for s in train)
+            else:
+                sents=train
+            fd=FreqDist()
+            for s in sents:
+                fd.update(s)
+            if not estimator_args and not estimator_kwargs:
+                self._model = estimator(fd,fd.B())
+            else:
+                self._model = estimator(fd,fd.B(),
+                                        *estimator_args, **estimator_kwargs)
+            self._N=fd.N()
+        else:
+            cfd = ConditionalFreqDist()
+            self._ngrams = set()
+
+            for sent in train:
+                self._N+=len(sent)+delta
+                for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
+                    self._ngrams.add(ngram)
+                    context = tuple(ngram[:-1])
+                    token = ngram[-1]
+                    cfd[context][token]+=1
+            if not estimator_args and not estimator_kwargs:
+                self._model = ConditionalProbDist(cfd, estimator, len(cfd))
+            else:
+                self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)
+
+        # recursively construct the lower-order models
+        if n > 1:
+            self._backoff = NgramModel(n-1, train, pad_left, pad_right,
+                                       estimator, *estimator_args, **estimator_kwargs)
+
+            # Code below here in this method, and the _words_following and _alpha method, are from
+            # http://www.nltk.org/_modules/nltk/model/ngram.html "Last updated on Feb 26, 2015"
+            self._backoff_alphas = dict()
+            # For each condition (or context)
+            #print cfd,cfd.conditions()
+            for ctxt in cfd.conditions():
+                backoff_ctxt = ctxt[1:]
+                backoff_total_pr = 0.0
+                total_observed_pr = 0.0
+
+                # this is the subset of words that we OBSERVED following
+                # this context.
+                # i.e. Count(word | context) > 0
+                wf=list(self._words_following(ctxt, cfd))
+                for word in self._words_following(ctxt, cfd):
+                    total_observed_pr += self.prob(word, ctxt)
+                    # we also need the total (n-1)-gram probability of
+                    # words observed in this n-gram context
+                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
+                assert (0 <= total_observed_pr <= 1),\
+                       "sum of probs for %s out of bounds: %s"%(ctxt,total_observed_pr)
+                # beta is the remaining probability weight after we factor out
+                # the probability of observed words.
+                # As a sanity check, both total_observed_pr and backoff_total_pr
+                # must be GE 0, since probabilities are never negative
+                beta = 1.0 - total_observed_pr
+
+                # if backoff total is 1, that should mean that all samples occur in this context,
+                #  so we will never back off.
+                # Greater than 1 is an error.
+                assert (0 <= backoff_total_pr < 1), \
+                       "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
+                alpha_ctxt = beta / (1.0 - backoff_total_pr)
+
+                self._backoff_alphas[ctxt] = alpha_ctxt
+
+    def _words_following(self, context, cond_freq_dist):
+        return cond_freq_dist[context].iterkeys()
+        # below from http://www.nltk.org/_modules/nltk/model/ngram.html,
+        # depends on new CFD???
+        #for ctxt, word in cond_freq_dist.iterkeys():
+        #    if ctxt == context:
+        #        yield word
+
+    def prob(self, word, context, verbose=False):
+        """
+        Evaluate the probability of this word in this context
+        using Katz Backoff.
+        """
+        assert(isinstance(word,types.StringTypes))
+        context = tuple(context)
+        if self._n==1:
+            if not(self._model.SUM_TO_ONE):
+                # Smoothing models should do the right thing for unigrams
+                #  even if they're 'absent'
+                return self._model.prob(word)
+            else:
+                try:
+                    return self._model.prob(word)
+                except:
+                    raise RuntimeError("No probability mass assigned "
+                                       "to unigram %s" % (word))
+        if context + (word,) in self._ngrams:
+            return self[context].prob(word)
+        else:
+            alpha=self._alpha(context)
+            if alpha>0:
+                if verbose:
+                    print "backing off for %s"%(context+(word,),)
+                return alpha * self._backoff.prob(word, context[1:],verbose)
+            else:
+                if verbose:
+                    print "no backoff for %s as model doesn't do any smoothing"%word
+                return alpha
+
+    def _alpha(self, context,verbose=False):
+        """Get the backoff alpha value for the given context
+        """
+        error_message = "Alphas and backoff are not defined for unigram models"
+        assert (not self._n == 1), error_message
+
+        if context in self._backoff_alphas:
+            res = self._backoff_alphas[context]
+        else:
+            res = 1
+        if verbose:
+            print " alpha: %s = %s"%(context,res)
+        return res
+
+
+    def logprob(self, word, context,verbose=False):
+        """
+        Evaluate the (negative) log probability of this word in this context.
+        """
+
+        return -log(self.prob(word, context,verbose), 2)
+
+    # NB, this will always start with same word since model
+    # is trained on a single text
+    def generate(self, num_words, context=()):
+        '''Generate random text based on the language model.'''
+        text = list(context)
+        for i in range(num_words):
+            text.append(self._generate_one(text))
+        return text
+
+    def _generate_one(self, context):
+        context = (self._lpad + tuple(context))[-self._n+1:]
+        # print "Context (%d): <%s>" % (self._n, ','.join(context))
+        if context in self:
+            return self[context].generate()
+        elif self._n > 1:
+            return self._backoff._generate_one(context[1:])
+        else:
+            return '.'
+
+    def entropy(self, text, pad_left=False, pad_right=False,
+                verbose=False, perItem=False):
+        """
+        Evaluate the total entropy of a text with respect to the model.
+        This is the sum of the log probability of each word in the message.
+        """
+        # This version takes account of padding for greater accuracy
+        e = 0.0
+        for ngram in ingrams(chain(self._lpad, text, self._rpad), self._n):
+            context = tuple(ngram[:-1])
+            token = ngram[-1]
+            cost=self.logprob(token, context, verbose)  # _negative_
+                                                        # log2 prob == cost!
+            if verbose:
+                print "p(%s|%s) = [%s-gram] %7f"%(token,context,self._n,2**-cost)
+            e += cost
+        if perItem:
+            return e/((len(text)+self._padLen)-(self._n - 1))
+        else:
+            return e
+
+    def dump(self, file, logBase=None, precision=7):
+        """Dump this model in SRILM/ARPA/Doug Paul format
+
+        Use logBase=10 and the default precision to get something comparable
+        to SRILM ngram-model -lm output
+        @param file to dump to
+        @type file file
+        @param logBase If not None, output logBases to the specified base
+        @type logBase int|None"""
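+        # e.g. (sketch): lm.dump(open('model.arpa', 'w'), logBase=10) writes an
+        # ARPA-format table comparable to SRILM output, as noted above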
+        file.write('\n\\data\\\n')
+        self._writeLens(file)
+        self._writeModels(file,logBase,precision,None)
+        file.write('\\end\\\n')
+
+    def _writeLens(self,file):
+        if self._n>1:
+            self._backoff._writeLens(file)
+            file.write('ngram %s=%s\n'%(self._n,
+                                        sum(len(self._model[c].samples())\
+                                            for c in self._model.keys())))
+        else:
+            file.write('ngram 1=%s\n'%len(self._model.samples()))
+            
+
+    def _writeModels(self,file,logBase,precision,alphas):
+        if self._n>1:
+            self._backoff._writeModels(file,logBase,precision,self._backoff_alphas)
+        file.write('\n\\%s-grams:\n'%self._n)
+        if self._n==1:
+            self._writeProbs(self._model,file,logBase,precision,(),alphas)
+        else:
+            for c in sorted(self._model.conditions()):
+                self._writeProbs(self._model[c],file,logBase,precision,
+                                  c,alphas)
+
+    def _writeProbs(self,pd,file,logBase,precision,ctxt,alphas):
+        if self._n==1:
+            for k in sorted(pd.samples()+['<unk>','<s>']):
+                if k=='<s>':
+                    file.write('-99')
+                elif k=='<unk>':
+                    _writeProb(file,logBase,precision,1-pd.discount()) 
+                else:
+                    _writeProb(file,logBase,precision,pd.prob(k))
+                file.write('\t%s'%k)
+                if k not in ('</s>','<unk>'):
+                    file.write('\t')
+                    _writeProb(file,logBase,precision,alphas[ctxt+(k,)])
+                file.write('\n')
+        else:
+            ctxtString=' '.join(ctxt)
+            for k in sorted(pd.samples()):
+                _writeProb(file,logBase,precision,pd.prob(k))
+                file.write('\t%s %s'%(ctxtString,k))
+                if alphas is not None:
+                    file.write('\t')
+                    _writeProb(file,logBase,precision,alphas[ctxt+(k,)])
+                file.write('\n')
+
+    def __contains__(self, item):
+        try:
+            return item in self._model
+        except:
+            try:
+                # hack if model is an MLEProbDist, more efficient
+                return item in self._model._freqdist
+            except:
+                return item in self._model.samples()
+
+    def __getitem__(self, item):
+        return self._model[item]
+
+    def __repr__(self):
+        return '<NgramModel with %d %d-grams>' % (self._N, self._n)
+
+def _writeProb(file,logBase,precision,p):
+    file.write('%.*g'%(precision,
+                       p if logBase is None else log(p,logBase)))
+
+def demo():
+    from nltk.corpus import brown
+    from nltk.probability import LidstoneProbDist, WittenBellProbDist
+    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
+#    estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
+    lm = NgramModel(3, brown.words(categories='news'), estimator)
+    print lm
+#    print lm.entropy(sent)
+    text = lm.generate(100)
+    import textwrap
+    print '\n'.join(textwrap.wrap(' '.join(text)))
+
+if __name__ == '__main__':
+    demo()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ngram_3.0.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,300 @@
+# Natural Language Toolkit: Language Models
+#
+# Copyright (C) 2001-2014 NLTK Project
+# Authors: Steven Bird <stevenbird1@gmail.com>
+#          Daniel Blanchard <dblanchard@ets.org>
+#          Ilia Kurenkov <ilia.kurenkov@gmail.com>
+# URL: <http://nltk.org/>
+# For license information, see LICENSE.TXT
+######## Copied from http://www.nltk.org/_modules/nltk/model/ngram.html 2017-01-14
+######## Not actually part of 3.0 release
+######## "© Copyright 2015, NLTK Project. Last updated on Feb 26, 2015. Created using Sphinx 1.2.3"
+from __future__ import unicode_literals
+
+from itertools import chain
+from math import log
+
+from nltk.probability import (FreqDist,
+    ConditionalProbDist,
+    ConditionalFreqDist,
+    LidstoneProbDist)
+from nltk.util import ngrams
+from nltk.model.api import ModelI
+
+from nltk import compat
+
+
+def _estimator(fdist, *estimator_args, **estimator_kwargs):
+    """
+    Default estimator function using a SimpleGoodTuringProbDist.
+    """
+    # can't be an instance method of NgramModel as they
+    # can't be pickled either.
+    return LidstoneProbDist(fdist, *estimator_args, **estimator_kwargs)
+
+
+@compat.python_2_unicode_compatible
+class NgramModel(ModelI):
+    """
+    A processing interface for assigning a probability to the next word.
+    """
+
+    def __init__(self, n, train, pad_left=True, pad_right=False,
+                 estimator=None, *estimator_args, **estimator_kwargs):
+        """
+        Create an ngram language model to capture patterns in n consecutive
+        words of training text.  An estimator smooths the probabilities derived
+        from the text and may allow generation of ngrams not seen during
+        training.
+
+            >>> from nltk.corpus import brown
+            >>> from nltk.probability import LidstoneProbDist
+            >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
+            >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
+            >>> lm
+            <NgramModel with 91603 3-grams>
+            >>> lm._backoff
+            <NgramModel with 62888 2-grams>
+            >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
+            ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
+            ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
+            ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
+            ... # doctest: +ELLIPSIS
+            0.5776...
+
+        :param n: the order of the language model (ngram size)
+        :type n: int
+        :param train: the training text
+        :type train: list(str) or list(list(str))
+        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
+        :type pad_left: bool
+        :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
+        :type pad_right: bool
+        :param estimator: a function for generating a probability distribution
+        :type estimator: a function that takes a ConditionalFreqDist and
+            returns a ConditionalProbDist
+        :param estimator_args: Extra arguments for estimator.
+            These arguments are usually used to specify extra
+            properties for the probability distributions of individual
+            conditions, such as the number of bins they contain.
+            Note: For backward-compatibility, if no arguments are specified, the
+            number of bins in the underlying ConditionalFreqDist are passed to
+            the estimator as an argument.
+        :type estimator_args: (any)
+        :param estimator_kwargs: Extra keyword arguments for the estimator
+        :type estimator_kwargs: (any)
+        """
+
+        # protection from cryptic behavior for calling programs
+        # that use the pre-2.0.2 interface
+        assert(isinstance(pad_left, bool))
+        assert(isinstance(pad_right, bool))
+
+        # make sure n is greater than zero, otherwise print it
+        assert (n > 0), n
+
+        # For explicitness save the check whether this is a unigram model
+        self.is_unigram_model = (n == 1)
+        # save the ngram order number
+        self._n = n
+        # save left and right padding
+        self._lpad = ('',) * (n - 1) if pad_left else ()
+        self._rpad = ('',) * (n - 1) if pad_right else ()
+
+        if estimator is None:
+            estimator = _estimator
+
+        cfd = ConditionalFreqDist()
+
+        # set read-only ngrams set (see property declaration below to reconfigure)
+        self._ngrams = set()
+
+        # If given a list of strings instead of a list of lists, create enclosing list
+        if (train is not None) and isinstance(train[0], compat.string_types):
+            train = [train]
+
+        for sent in train:
+            raw_ngrams = ngrams(sent, n, pad_left, pad_right, pad_symbol='')
+            for ngram in raw_ngrams:
+                self._ngrams.add(ngram)
+                context = tuple(ngram[:-1])
+                token = ngram[-1]
+                cfd[(context, token)] += 1
+
+        self._probdist = estimator(cfd, *estimator_args, **estimator_kwargs)
+
+        # recursively construct the lower-order models
+        if not self.is_unigram_model:
+            self._backoff = NgramModel(n-1, train,
+                                        pad_left, pad_right,
+                                        estimator,
+                                        *estimator_args,
+                                        **estimator_kwargs)
+
+            self._backoff_alphas = dict()
+            # For each condition (or context)
+            for ctxt in cfd.conditions():
+                backoff_ctxt = ctxt[1:]
+                backoff_total_pr = 0.0
+                total_observed_pr = 0.0
+
+                # this is the subset of words that we OBSERVED following
+                # this context.
+                # i.e. Count(word | context) > 0
+                for word in self._words_following(ctxt, cfd):
+                    total_observed_pr += self.prob(word, ctxt)
+                    # we also need the total (n-1)-gram probability of
+                    # words observed in this n-gram context
+                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
+
+                assert (0 <= total_observed_pr <= 1), total_observed_pr
+                # beta is the remaining probability weight after we factor out
+                # the probability of observed words.
+                # As a sanity check, both total_observed_pr and backoff_total_pr
+                # must be GE 0, since probabilities are never negative
+                beta = 1.0 - total_observed_pr
+
+                # backoff total has to be less than one, otherwise we get
+                # an error when we try subtracting it from 1 in the denominator
+                assert (0 <= backoff_total_pr < 1), backoff_total_pr
+                alpha_ctxt = beta / (1.0 - backoff_total_pr)
+
+                self._backoff_alphas[ctxt] = alpha_ctxt
+
+    def _words_following(self, context, cond_freq_dist):
+        for ctxt, word in cond_freq_dist.iterkeys():
+            if ctxt == context:
+                yield word
+
+    def prob(self, word, context):
+        """
+        Evaluate the probability of this word in this context using Katz Backoff.
+
+        :param word: the word to get the probability of
+        :type word: str
+        :param context: the context the word is in
+        :type context: list(str)
+        """
+        context = tuple(context)
+        if (context + (word,) in self._ngrams) or (self.is_unigram_model):
+            return self._probdist.prob((context, word))
+        else:
+            return self._alpha(context) * self._backoff.prob(word, context[1:])
+
+    def _alpha(self, context):
+        """Get the backoff alpha value for the given context
+        """
+        error_message = "Alphas and backoff are not defined for unigram models"
+        assert not self.is_unigram_model, error_message
+
+        if context in self._backoff_alphas:
+            return self._backoff_alphas[context]
+        else:
+            return 1
+
+    def logprob(self, word, context):
+        """
+        Evaluate the (negative) log probability of this word in this context.
+
+        :param word: the word to get the probability of
+        :type word: str
+        :param context: the context the word is in
+        :type context: list(str)
+        """
+        return -log(self.prob(word, context), 2)
+
+    @property
+    def ngrams(self):
+        return self._ngrams
+
+    @property
+    def backoff(self):
+        return self._backoff
+
+    @property
+    def probdist(self):
+        return self._probdist
+
+    def choose_random_word(self, context):
+        '''
+        Randomly select a word that is likely to appear in this context.
+
+        :param context: the context the word is in
+        :type context: list(str)
+        '''
+
+        return self.generate(1, context)[-1]
+
+    # NB, this will always start with same word if the model
+    # was trained on a single text
+
+    def generate(self, num_words, context=()):
+        '''
+        Generate random text based on the language model.
+
+        :param num_words: number of words to generate
+        :type num_words: int
+        :param context: initial words in generated string
+        :type context: list(str)
+        '''
+
+        text = list(context)
+        for i in range(num_words):
+            text.append(self._generate_one(text))
+        return text
+
+    def _generate_one(self, context):
+        context = (self._lpad + tuple(context))[- self._n + 1:]
+        if context in self:
+            return self[context].generate()
+        elif self._n > 1:
+            return self._backoff._generate_one(context[1:])
+        else:
+            return '.'
+
+    def entropy(self, text):
+        """
+        Calculate the approximate cross-entropy of the n-gram model for a
+        given evaluation text.
+        This is the average log probability of each word in the text.
+
+        :param text: words to use for evaluation
+        :type text: list(str)
+        """
+
+        e = 0.0
+        text = list(self._lpad) + text + list(self._rpad)
+        for i in range(self._n - 1, len(text)):
+            context = tuple(text[i - self._n + 1:i])
+            token = text[i]
+            e += self.logprob(token, context)
+        return e / float(len(text) - (self._n - 1))
+
+    def perplexity(self, text):
+        """
+        Calculates the perplexity of the given text.
+        This is simply 2 ** cross-entropy for the text.
+
+        :param text: words to calculate perplexity of
+        :type text: list(str)
+        """
+
+        return pow(2.0, self.entropy(text))
+
+    def __contains__(self, item):
+        return tuple(item) in self._probdist.freqdist
+
+    def __getitem__(self, item):
+        return self._probdist[tuple(item)]
+
+    def __repr__(self):
+        return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)
+
+
+def teardown_module(module=None):
+    from nltk.corpus import brown
+    brown._unload()
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nono.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,298 @@
+#!/usr/bin/python3
+# Expects e.g. ^A copy from Nonograms print preview: cols, then blank line, then rows
+#  rows are space-separated
+#  cols are one-digit-after-another, unless some 2-digit, in which case x is separator
+# E.g.
+# 13x1x2
+# 19
+# maps to
+# 13
+# 1  1
+# 2  9
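+# i.e. the parsed run lists for those two column lines are [13, 1, 2] and [1, 9]
+# (see the parsing loop at the bottom of this file)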
+
+import sys
+
+Red=''
+eRed=''
+RedFmt=Red+'%s'+eRed
+
+def interleave(*args):
+  for vals in zip(*args):
+    yield from vals
+
+class Vector(list):
+  # reads top-to-bottom or left-to-right
+  def __init__(self,n,m,runs):
+    list.__init__(self,list(range(n)))
+    self.n=n
+    self.runs=runs
+    # compute the set of all possible layouts for runs
+    self.rn=len(self.runs)
+    rtot=sum(self.runs)
+    self.allRuns=list(self.seedList(0,0,0,
+                                    sum(1+self.runs[k] for k in range(self.rn))-1))
+    self.nar=len(self.allRuns)
+
+  def seedList(self,i,j,pos,runLen):
+    """
+    :param i: starting skip before next run
+    :type i: 0 if pos==0 else 1
+    :param j: next run number
+    :type j: int
+    :param pos: left margin
+    :type pos: int
+    """
+    bound=self.n-(pos+runLen)+1
+    #dprint('s',i,j,pos,runLen,bound)
+    if j==self.rn:
+        yield []
+        return
+    r=self.runs[j]
+    for v in range(i,bound):
+      for sub in self.seedList(1,j+1,pos+v+r,runLen-(r+1)):
+        yield [-v,r]+sub
+
+  def __repr__(self):
+    return "V@%s%s:%s"%(self.x,self.runs,list.__repr__(self))
+
+  def __str__(self):
+    return '%s|'%('|'.join(str(c) for c in self))
+
+  def step(self):
+    scratch=[0 if c.val is None else c.val for c in self]
+    for k,runs in enumerate(self.allRuns):
+      dprint('=====pass %s======'%k)
+      self.onepass(0,self.n,scratch,runs.copy())
+    dprint(scratch)
+    for i in range(self.n):
+      if scratch[i]==self.nar:
+        # If blobby in _every_ pass, then must be a blob
+        if self[i].val is None:
+          self[i].setVal(True)
+        elif self[i].val is True:
+          # already there
+          pass
+        else:
+          print("Shouldn't happen: attempt to blob where x already present! %s at %s"%(self,i),file=sys.stderr)
+          exit(101)
+
+  def onepass(self,i0,iBound,scratch,stack):
+    """note that stack is not a simple run, but one with _negative_ numbers between
+     and possibly before the positive ones, indicating obligatory skips
+     """
+    i=i0 # starting index into self/scratch/maybe
+    j=-1 # index into run
+    maybe=[0]*iBound
+    dprint('r: %s'%stack)
+    req=sum((-r if r<0 else r) for r in stack)
+    while stack and i<iBound:
+      r=rr=stack.pop(0)
+      dprint('pop:',r)
+      if r<1:
+        # obligatory skip
+        # (Above init of self.allRuns is easier if we allow a 0 to be ignored
+        i-=r
+        req+=r
+        r=rr=stack.pop(0)
+      # rr is run remaining -- how many we still need
+      j+=1 # index of current run in self.runs, we'll need to decorate that eventually
+      inOne=-1 # if non-neg, records the start point of a possible run
+      gapsFilled=0
+      # First, check if we can start here:  0 is OK, and n>0 iff n-1 is None or False
+      if i>0 and i<iBound:
+        while self[i-1].val:
+          i+=1
+      if (iBound-i)<req:
+        # Can't win, give up altogether
+        dprint('c0',i,iBound,req)
+        return
+      while i<iBound:
+        c=self[i].val
+        dprint('top',i,c,inOne,rr)
+        if c is None:
+          # we could add a blob here
+          dprint('c1')
+          gapsFilled+=1
+          rr-=1
+          if inOne<0:
+            dprint('c1a',i)
+            # starts here
+            inOne=i
+          # fall through to check for completion
+        else:
+          dprint('c2')
+          # c is a bool
+          if inOne<0:
+            dprint('c2a')
+            if c:
+              dprint('c2a1')
+              # a *, we can possible start something here
+              inOne=i
+              rr-=1
+              # fall through to check for completion
+            else:
+              dprint('c2a2')
+              # an x, can't start here, just move along
+              i+=1
+              continue
+          else:
+            dprint('c2b')
+            if c:
+              dprint('c2b1')
+              # a blob, extend or complete a partial
+              rr-=1
+              # fall through to check for completion
+            else:
+              # abandon a partial
+              dprint('c2b2')
+              inOne=-1
+              rr=r
+              i+=1
+              continue
+        if rr>0:
+          dprint('c3')
+          # we're not done, carry on
+          i+=1
+          continue
+        # Maybe a win?
+        # look ahead, can we stop here?
+        # NB _self_.n
+        if i+1<self.n and self[i+1].val:
+          dprint('c4')
+          # Nope
+          inOne=-1
+          rr=r
+          gapsFilled=0
+          i+=1
+          continue
+        elif gapsFilled==0:
+          dprint('c5')
+          # We must have crossed at least one gap...
+          print("Shouldn't happen: no gap! me:%s i:%s j:%s rr:%s inOne:%s"%(self,i, j, rr, inOne),file=sys.stderr)
+          exit(100)
+        # Victory!
+        dprint('c6',r,inOne,i)
+        for k in range(inOne,i+1):
+          maybe[k]+=1
+        i+=1
+        req-=r
+        break
+      # on to the next run
+    # end of inner loop, did we win?
+    if (not stack) or i==iBound:
+      # yes
+      dprint('win:',maybe)
+      for k in range(iBound):
+        scratch[k]+=maybe[k]
+
+class Row(Vector):
+  def __init__(self,n,m,runs,pos,dprintWidth):
+    Vector.__init__(self,n,m,runs)
+    self.y=pos
+    self.dprintWidth=dprintWidth
+    self.fmt="%%%ss|"%dprintWidth
+
+  def __str__(self):
+    return ((self.fmt%(' '.join(str(r) for r in self.runs)))+
+            Vector.__str__(self))
+
+class Column(Vector):
+  def __init__(self,n,m,runs,pos,dprintHeight):
+    Vector.__init__(self,n,m,runs)
+    self.x=pos
+    self.dprintHeight=dprintHeight
+    self.fmt="%%%ss"%self.dprintHeight
+    self.updateHeader()
+
+  def updateHeader(self):
+    header=('-'.join(str(c) for c in self.runs))
+    self.header=self.fmt%header # pad to same 'height'
+
+class Cell:
+  def __init__(self,row,y,column,x):
+    # At the intersection of row and column Vectors
+    self.row=row
+    self.column=column
+    self.x=x
+    self.y=y
+    self.val=None # three valued: None(unknown), True(filled), False(empty)
+    self.row[x]=self
+    self.column[y]=self
+
+  def __repr__(self):
+    return "C@(%s,%s):%s"%(self.x,self.y,self.val)
+
+  def __str__(self):
+    return ' ' if self.val is None else ('\u25A0' if self.val else 'x')
+
+  def setVal(self,v):
+    if v is True:
+      if self.val is False:
+        dprint("Warning: x -> * at %s,%s"%(self.x,self.y))
+      elif self.val is True:
+        # No-op
+        return
+      # @@ check row/col completed
+    else:
+      if self.val is not None:
+        dprint("Warning: %s -> %s at %s,%s"%(self.val,v,self.x,self.y))
+    self.val=v
+
+class Nono(dict):
+  # 0,0 is upper left, so increasing y goes _downwards_, to match the standard layout
+  def __init__(self,rows,cols):
+    n=self.n=len(cols)
+    if n!=len(rows):
+      print("losing r:%s x c:%s"%(len(rows),n),sys.stderr)
+      exit(1)
+    self.rc=rows
+    rowDprintWidth=max(sum(len(str(r)) for r in row)+len(row)-1 for row in rows)
+    self.rowfmt="%s|%%s"%(' '*rowDprintWidth)
+    self.cc=cols
+    # print col nums>9 vertically :-(
+    self.colDprintHeight=max(sum(len(str(c)) for c in col)+len(col)-1 for col in cols)
+    self.columns=cc=[Column(n,self,cols[i],i,self.colDprintHeight) for i in range(n)]
+    self.rows=rr=[Row(n,self,rows[i],i,rowDprintWidth) for i in range(n)]
+    for x in range(n):
+      for y in range(n):
+        self[(x,y)]=Cell(rr[y],y,cc[x],x)
+
+  def __str__(self):
+    lines=[self.rowfmt%('|'.join([(self.columns[i]).header[j] for i in range(self.n)])) # 'rotate'
+           for j in range(self.colDprintHeight)]
+    lines+=[str(r) for r in self.rows]
+    return "\n".join(lines)
+
+def dprint(*args):
+  pass
+
+if __name__ == '__main__':
+  if len(sys.argv)>1:
+    f=open(sys.argv[1])
+  else:
+    f=sys.stdin
+
+  cols=[]
+
+  for l in f:
+    l=l.rstrip()
+    if l=='':
+      break
+    if 'x' in l:
+      vv=[int(s) for s in l.split('x')]
+    else:
+      vv=[int(c) for c in l]
+    cols.append(vv)
+
+  rows=[[int(s) for s in l.split()] for l in f]
+
+  solver=Nono(rows,cols)
+  print(solver)
+  for c in solver.columns:
+    c.step()
+  print()
+  print(solver)
+  for r in solver.rows:
+    r.step()
+  print()
+  print(solver)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pdfCrawl.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,24 @@
+import PyPDF2 as pyPdf, sys
+
+f = open(sys.argv[1],'rb')
+
+pdf = pyPdf.PdfFileReader(f)
+pgs = pdf.getNumPages()
+key = '/Annots'
+uri = '/URI'
+ank = '/A'
+
+#print pdf.getNamedDestinations()
+
+for pg in range(pgs):
+    print '#',pg
+    p = pdf.getPage(pg)
+    o = p.getObject()
+    #print >>sys.stderr,o
+    if o.has_key(key):
+        ann = o[key]
+        #print >>sys.stderr,key,ann
+        for a in ann:
+            u = a.getObject()
+            if u[ank].has_key(uri):
+                print "U",u[ank][uri]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ptrace.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,23 @@
+#!/usr/bin/python
+# usage: ptrace.py TRACE [result of nm xemacs | egrep '[$_]']
+import sys
+symfile=open(sys.argv[2])
+syms={}
+for l in symfile:
+  (addr,rest)=l.rstrip().split(' ',1)
+  syms[addr]=rest
+symfile.close()
+trfile=open(sys.argv[1])
+
+for l in trfile:
+  (what,rest)=l.rstrip().split(' ',1)
+  if what in ('incipit','exit','p'):
+    print l.rstrip()
+    continue
+  (where,when)=rest.split()
+  try:
+    (z,b)=where.split('x')
+    print "%8s %s %s"%(what,syms["00"+b],when)
+  except:
+    print "not hex: ",l.rstrip()
+trfile.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/signif.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,162 @@
+from nltk import FreqDist
+from random import randint
+import pylab
+from math import sqrt
+
+def mean(self):
+  # Assumes the keys of this distribution are numbers!
+  return float(sum(v*self[v] for v in self.keys()))/self.N()
+
+FreqDist.mean=mean
+
+def bell(self,maxVal=None,bars=False,**kwargs):
+  # Assumes the keys of this distribution are numbers!
+  if maxVal is not None:
+    sk = sorted([k for k in self.keys() if k<=maxVal]) # range(max(self.keys())+1)
+  else:
+    sk=sorted(self.keys())
+  print len(sk)
+  #sk.append(sk[-1]+1)
+  #sk[0:0]=[(sk[0]-1)]
+  mm=0 # sk[0]
+  mean = self.mean()
+  tot = 0
+  ssd = 0
+  for v in self.keys():
+    d = v-mean
+    ssd+=d*d*self[v]
+  sd=sqrt(ssd/float(self.N()))
+  #print (mean,sd)
+  kv=[self[k] for k in sk]
+  pylab.figure().subplots_adjust(bottom=0.15)
+  pylab.plot(sk,kv,color='blue')
+  if kwargs.get('xtra'):
+    xtra=kwargs['xtra']
+    pylab.plot(sk,[xtra[k] for k in sk],color='red')
+  if bars:
+    pylab.bar([s-mm for s in sk],kv,
+              align='center',color='white',edgecolor='pink')
+  pylab.xticks(sk,rotation=90)
+  mv=self[self.max()]
+  bb=(-mv/10,mv+(mv/10))
+  pylab.plot((mean-mm,mean-mm),bb,
+             (mean-mm-sd,mean-mm-sd),bb,
+             (mean-mm-(2*sd),mean-mm-(2*sd)),bb,
+             (mean-mm+sd,mean-mm+sd),bb,
+             (mean-mm+(2*sd),mean-mm+(2*sd)),bb,
+             color='green')
+  pylab.xlabel("N %s, max %s\nmean %5.2f, s.d. %5.2f"%(self.N(),mv,mean, sd))
+  pylab.show()
+
+FreqDist.bell=bell
+
+def ranks(l,**kvargs):
+  # compute the rank of every element in a list
+  # uses sort, passing on all kv args
+  # uses key kv arg itself
+  # _Very_ inefficient, in several ways!
+  # Result is a pair:
+  #  list of ranks
+  #  list of tie information, each elt the magnitude of a tie group
+  s=sorted(l,**kvargs)
+  i=0
+  res=[]
+  td=[]
+  if kvargs.has_key('key'):
+    kf=kvargs['key']
+  else:
+    kf=lambda x:x
+  while i<len(l):
+    ties=[x for x in s if kf(s[i])==kf(x)]
+    if len(ties)>1:
+      td.append(len(ties))
+    r=float(i+1+(i+len(ties)))/2.0
+    for e in ties:
+      res.append((r,e))
+      i+=1
+  return (res,td)
+
+def mannWhitneyU(fd1,fd2,forceZ=False):
+  # Compute Mann Whitney U test for two frequency distributions
+  # For n1 and n2 <= 20, see http://www.soc.univ.keiv.ua/LIB/PUB/T/textual.pdf
+  #  to look up significance levels on the result: see Part 3 section 10,
+  #  actual page 150 (printed page 144)
+  # Or use http://faculty.vassar.edu/lowry/utest.html to do it for you
+  # For n1 and n2 > 20, U itself is normally distributed, we
+  #  return a tuple with a z-test value
+  # HST DOES NOT BELIEVE THIS IS CORRECT -- DOES NOT APPEAR TO GIVE CORRECT ANSWERS!!
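+  # Usage sketch (illustrative only, given the caveat above):
+  #   (n1, n2, u) = mannWhitneyU(fd1, fd2)          # small samples
+  #   (n1, n2, u, z) = mannWhitneyU(fd1, fd2, True) # force the normal approximation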
+  r1=[(lambda x:x.append(1) or x)(list(x)) for x in fd1.items()]
+  r2=[(lambda x:x.append(2) or x)(list(x)) for x in fd2.items()]
+  n1=len(r1)
+  n2=len(r2)
+  (ar,ties)=ranks(r1+r2,key=lambda e:e[1])
+  s1=sum(r[0] for r in ar if r[1][2]==1)
+  s2=sum(r[0] for r in ar if r[1][2]==2)
+  u1=float(n1*n2)+(float(n1*(n1+1))/2.0)-float(s1)
+  u2=float(n1*n2)+(float(n2*(n2+1))/2.0)-float(s2)
+  u=min(u1,u2)
+  if forceZ or n1>20 or n2>20:
+    # we can treat U as sample from a normal distribution, and compute
+    # a z-score
+    # See e.g. http://mlsc.lboro.ac.uk/resources/statistics/Mannwhitney.pdf
+    mu=float(n1*n2)/2.0
+    if len(ties)>0:
+      n=float(n1+n2)
+      ts=sum((float((t*t*t)-t)/12.0) for t in ties)
+      su=sqrt((float(n1*n2)/(n*n-1))*((float((n*n*n)-n)/12.0)-ts))
+    else:
+      su=sqrt(float(n1*n2*(n1+n2+1))/12.0)
+    z=(u-mu)/su
+    return (n1,n2,u,z)
+  else:
+    return (n1,n2,u)
+
+# This started from http://dr-adorio-adventures.blogspot.com/2010/05/draft-untested.html
+#  but has a number of bug fixes
+def Rank(l,**kvargs):
+  # compute the rank of every element in a list
+  # uses sort, passing on all kv args
+  # uses key kv arg itself
+  # _Very_ inefficient, in several ways!
+  # Result is a list of pairs ( r, v) where r is a rank and v is an input value
+  s=sorted(l,**kvargs)
+  i=0
+  res=[]
+  if kvargs.has_key('key'):
+    kf=kvargs['key']
+  else:
+    kf=lambda x:x
+  while i<len(l):
+    ties=[x for x in s if kf(s[i])==kf(x)]
+    r=float(i+1+(i+len(ties)))/2.0
+    #print (i,r,ties)
+    for e in ties:
+      res.append((r,e))
+      i+=1
+  return (res)
+
+def mannWhitney(S1, S2):
+    """
+    Returns the Mann-Whitney U statistic of two samples S1 and S2.
+    """
+    # Form a single array with a categorical variable indicate the sample
+    X = [(s, 0) for s in S1]
+    X.extend([(s,1) for s in S2])
+    R = Rank(X,key=lambda x:x[0])
+ 
+    # Compute needed parameters.
+    n1 = float(len(S1))
+    n2 = float(len(S2))
+ 
+    # Compute total ranks for sample 1.          
+    R1 = sum([i for i, (x,j) in R if j == 0])
+    R2 = sum([i for i, (x,j) in R if j == 1])
+    u1 = (n1*n2)+((n1*(n1+1))/2.0)-R1
+    u2 = n1 * n2 - u1
+    U = min(u1, u2)
+    #print u1,R1/n1,R2/n2
+ 
+    mU     = n1 * n2 / 2.0
+    sigmaU = sqrt((n1 * n2 * (n1 + n2 + 1))/12.0)
+    return u1, R1/n1,R2/n2, (U-mU)/sigmaU
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/strace_summarise.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,83 @@
+#!/usr/bin/python
+#----------------------------------------------------------------------
+# Description : Simplify strace output to allow for easier diffing.
+# Author      : James Hunt  <james.hunt@ubuntu.com>
+# Date        : 24 July 2012
+#----------------------------------------------------------------------
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2, as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#----------------------------------------------------------------------
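+# Typical use (a sketch, not from the original header): normalise two traces,
+# then diff them, e.g.
+#   strace -f -o a.log prog1 ; strace -f -o b.log prog2
+#   ./strace_summarise.py a.log > a.norm ; ./strace_summarise.py b.log > b.norm
+#   diff a.norm b.norm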
+
+import os
+import re
+import sys
+import string
+
+pids = {}
+
+def process_data(fh):
+  lines = fh.readlines()
+  possible_pid = 0
+  using_pids = 0
+  pid = 0
+  pid_count = 1
+
+  line_num = 0
+  for line in lines:
+
+    line = line.strip()
+    line_num += 1
+    fields = line.split()
+    if line_num % 10000 == 0:
+      print >> sys.stderr,line_num,len(fields),fields[1]
+    if len(fields) > 0:
+        result = re.match("\(?(\d{4,8})\)?", fields[1])
+        #print >> sys.stderr,result.group(),result.group(1)
+        if result and result.group(1):
+            pid = result.group(1)
+            if pid in pids:
+                line = re.sub("(\\b"+pid+"\\b)", pids[pid], line)
+            else:
+                pid_name = "PID%d" % pid_count
+                line = re.sub("(\\b"+pid+"\\b)", pid_name, line)
+                pids[pid] = pid_name
+                pid_count += 1
+
+    # handle addresses (up to 64-bit)
+    line = re.sub("0x0{1,16}", "0xNULL", line)
+    line = re.sub("0x[0-9A-Fa-f]{1,16}", "0xADDR", line)
+
+    # handle timestamps
+    line = re.sub("\d{2}:\d{2}:\d{2}", "HH:MM:SS", line)
+    line = re.sub("\d{4}/\d{2}/\d{2}", "YYYY/MM/DD", line)
+
+    print line
+
+
+def main():
+  try:
+    script = sys.argv[0]
+    file1  = sys.argv[1]
+  except:
+    sys.exit("ERROR: usage: %s <file1> " % script)
+
+  try:
+    fh1 = open(file1)
+  except:
+    sys.exit("ERROR: unable to open file '%s'" % file1)
+
+  process_data(fh1)
+  print >>sys.stderr,pids
+
+if __name__ == "__main__":
+  main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/twitter.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,113 @@
+from nltk.corpus.reader.plaintext import PlaintextCorpusReader
+from nltk.corpus.reader import RegexpTokenizer
+from nltk.tokenize import LineTokenizer
+from nltk.corpus.reader.util import read_line_block
+from nltkx.model import NgramModel
+from nltk import ConditionalFreqDist, ngrams,\
+     chain, ConditionalProbDist, WittenBellProbDist, FreqDist
+import types
+
+xtwc=PlaintextCorpusReader("/group/ltg/projects/fnlp/",
+                          r'2.*\.txt',
+                          word_tokenizer=RegexpTokenizer(r'(http|ftp|mailto)://[^\s]+|[\w#@]+|[^\w\s]+'),
+                          sent_tokenizer=LineTokenizer(),
+                          para_block_reader=read_line_block)
+
+def discount(self):
+    return float(self._N)/float(self._N + self._T)
+
+def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
+    # http://stackoverflow.com/a/33024979
+    return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
+
+def check(self):
+    totProb=sum(self.prob(sample) for sample in self.samples())
+    assert isclose(self.discount(),totProb),\
+           "discount %s != totProb %s"%(self.discount(),totProb)
+           
+
+WittenBellProbDist.discount = discount
+WittenBellProbDist.check = check
+
+def _estimator(fdist, bins):
+    """
+    Default estimator function using WB.
+    """
+    # can't be an instance method of NgramModel as they
+    # can't be pickled either.
+    res=WittenBellProbDist(fdist,fdist.B()+1)
+    res.check()
+    return res
+
+class LgramModel(NgramModel):
+    def __init__(self, n, train, pad_left=False, pad_right=False,
+                 estimator=None, *estimator_args, **estimator_kwargs):
+        """
+        Same as NgramModel (q.v.), but with a WittenBell default estimator
+        """
+        if estimator is None:
+            assert (not(estimator_args)) and (not(estimator_kwargs)),\
+                   "estimator_args (%s) or _kwargs (%s) supplied, but no estimator"%(estimator_args,estimator_kwargs)
+            estimator=_estimator
+        super(LgramModel,self).__init__(n, train, pad_left, pad_right,
+                                        estimator,
+                                        *estimator_args, **estimator_kwargs)
+
+from nltk.probability import _get_kwarg
+try:
+    from nltk.probability import islice
+except:
+    from nltk.util import islice
+
+def plotSorted(self, *args, **kwargs):
+        """
+        Plot samples from the frequency distribution,
+        sorted using a supplied key function.  If an integer
+        parameter is supplied, stop after this many samples have been
+        plotted.  If two integer parameters m, n are supplied, plot a
+        subset of the samples, beginning with m and stopping at n-1.
+        For a cumulative plot, specify cumulative=True.
+        (Requires Matplotlib to be installed.)
+
+        :param title: The title for the graph
+        :type title: str
+        :param key: a function to pass to sort to extract the sort key
+          given an FD and a sample id.
+          Defaults to the value of that sample's entry,
+          lambda fd,s:fd[s]
+        :type key: function
+        :param reverse: True to sort high to low
+        :type reverse: bool
+        """
+        try:
+            import pylab
+        except ImportError:
+            raise ValueError('The plot function requires the matplotlib package (aka pylab). '
+                         'See http://matplotlib.sourceforge.net/')
+
+        if len(args) == 0:
+            args = [len(self)]
+
+        keyFn = _get_kwarg(kwargs, 'key', lambda fd,s:fd[s])
+        reverse = _get_kwarg(kwargs, 'reverse', False)
+
+        samples = list(islice(self, *args))
+        samples.sort(key=lambda x:keyFn(self,x),reverse=reverse)
+
+        freqs = [self[sample] for sample in samples]
+        ylabel = "Counts"
+        # percents = [f * 100 for f in freqs]  only in ProbDist?
+
+        pylab.grid(True, color="silver")
+        if not "linewidth" in kwargs:
+            kwargs["linewidth"] = 2
+        if "title" in kwargs:
+            pylab.title(kwargs["title"])
+            del kwargs["title"]
+        pylab.plot(freqs, **kwargs)
+        pylab.xticks(range(len(samples)), [unicode(s) for s in samples], rotation=90)
+        pylab.xlabel("Samples")
+        pylab.ylabel(ylabel)
+        pylab.show()
+
+FreqDist.plotSorted=plotSorted
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/update.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,70 @@
+#!/bin/python
+from sys import stdin,argv
+from urllib2 import Request,urlopen, HTTPError
+from base64 import b64encode
+type='xyzzy'
+l=''
+year='2014'
+
+def cc(names):
+  return ' '.join(map(lambda n:n[0]+n[1:].lower(),names.split()))
+
+cm={'tf':0,
+      'uun':1,
+      'surname':2,
+      'cat':3,
+      'stat':4,
+      'email':5,
+      'country':6,
+      'pgm':7,
+      'entry':8,
+    'forename':9}
+
+eargs=['uun']
+eargs.extend(argv[1:])
+
+while l=='':
+  l=stdin.readline().rstrip()
+if l!="\tPG Applications List for ILCC":
+  print "Not what I was expecting: %s"%l
+  exit(1)
+for l in stdin:
+  l=l.rstrip().decode('latin-1');
+  if l=='':
+    continue
+  if l.find('Count:')==0:
+    exit()
+  if l.find('Pgm Code')==0:
+    continue
+  if l.find('Entry')==len(l)-5:
+    continue
+  if l.find(type)==0:
+    continue
+  if l=='':
+    continue
+  try: #tf,uun,name,cat,stat,email,country,pgm,entry
+    vals=l.split("\t")
+  except ValueError:
+    print "Bad input: %s"%l
+    continue
+  if vals[0]!='':
+    type=vals[0]
+  try:
+    (sn,fn)=vals[2].split(", ")
+  except ValueError:
+    print "Bad input: %s"%l
+    exit(1)
+  vals[2]=cc(sn)
+  vals.append(cc(fn))
+  attrs=" ".join(map(lambda n:'%s="%s"'%(n,vals[cm[n]]),eargs))
+  req='<update year="%s" %s/>'%(year,attrs)
+  print req
+  r=Request("http://localhost:8080/exist/apps/phd/updateApp.xq",
+            req.encode('utf-8'),headers={'Content-Type':'application/xml;charset=UTF-8'})
+  try:
+    res=urlopen(r)
+  except HTTPError as err:
+    print "Error:",err.read()
+    print req
+    exit(1)
+  print res.read()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/withdraw.py	Mon Mar 09 14:58:04 2020 +0000
@@ -0,0 +1,38 @@
+#!/bin/python
+from sys import stdin,argv
+from urllib2 import Request,urlopen, HTTPError
+from base64 import b64encode
+type='xyzzy'
+l=''
+year='2014'
+
+def cc(names):
+  return ' '.join(map(lambda n:n[0]+n[1:].lower(),names.split()))
+
+eargs=['uun']
+eargs.extend(argv[1:])
+
+while l=='':
+  l=stdin.readline().rstrip()
+for l in stdin:
+  l=l.rstrip().decode('latin-1');
+  if l=='':
+    continue
+  try: #uun,...
+    vals=l.split("\t")
+  except ValueError:
+    print "Bad input: %s"%l
+    continue
+  attrs=" ".join(map(lambda n,v:'%s="%s"'%(n,v),zip(eargs,vals)))
+  req='<update year="%s" %s/>'%(year,attrs)
+  print req
+  continue
+  r=Request("http://localhost:8080/exist/apps/phd/updateApp.xq",
+            req.encode('utf-8'),headers={'Content-Type':'application/xml;charset=UTF-8'})
+  try:
+    res=urlopen(r)
+  except HTTPError as err:
+    print "Error:",err.read()
+    print req
+    exit(1)
+  print res.read()