changeset 2:e07789816ca5

adding more python files from lib/python on origen
author Henry Thompson <ht@markup.co.uk>
date Mon, 09 Mar 2020 16:48:09 +0000
parents 0a3abe59e364
children 26d9c0308fcf
files csvtotsv.py hmm/semiSup.py hmm/tinySup.py mailer.py pdfComments.py req_dep.py simpleCloud.py threaDemo.py trip2xml.py trivSelenium.py wsgi_test.py
diffstat 11 files changed, 1017 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/csvtotsv.py	Mon Mar 09 16:48:09 2020 +0000
@@ -0,0 +1,6 @@
+#!/usr/bin/python3
+import csv,sys
+with open(sys.argv[1],newline='') as f:
+  r=csv.reader(f)
+  for l in r:
+    print('\t'.join(s.replace("\n","") for s in l))
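A minimal sketch of what the loop above does to one record, on made-up data (the script itself reads a file named on the command line): a quoted field with an embedded newline comes out as part of a single tab-separated line.

import csv, io
# hypothetical one-record CSV input, not from the changeset
rows = csv.reader(io.StringIO('"Smith, Jane","first line\nsecond line"\n'))
for row in rows:
    print('\t'.join(s.replace("\n", "") for s in row))
# -> Smith, Jane<TAB>first linesecond line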
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hmm/semiSup.py	Mon Mar 09 16:48:09 2020 +0000
@@ -0,0 +1,223 @@
+'''Exploring the claim that a small dictionary can seed
+an otherwise unsupervised HMM to learn a decent POS-tagger'''
+import nltk, random, itertools
+from nltk.corpus import brown
+from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer, logsumexp2
+from nltk.probability import FreqDist,ConditionalFreqDist
+from nltk.probability import MLEProbDist, RandomProbDist, DictionaryConditionalProbDist
+
+def totLogProb(self,sequences):
+  N = len(self._states)
+  M = len(self._symbols)
+  logProb = 0
+  for sequence in sequences:
+    T = len(sequence)
+    # compute forward and backward probabilities
+    alpha = self._forward_probability(sequence)
+    beta = self._backward_probability(sequence)
+    # find the log probability of the sequence
+    logProb += logsumexp2(alpha[T-1])
+  return logProb
+
+HiddenMarkovModelTagger.totLogProb=totLogProb
+
+trainTagsPercent=1.0
+trainHMMPercent=0.9
+knownWordsPercent=1.0
+
+SST=SSW='<s>'
+EST=ESW='</s>'
+SS=[(SSW,SST)]
+ES=[(ESW,EST)]
+TAGSETS={
+  'univ':
+  [u'ADJ', u'ADP', u'ADV', u'CONJ', u'DET', u'NOUN', u'NUM',
+   u'PRON', u'PRT', u'VERB', u'X', u'.',SST,EST],
+  'brown':
+  [u"ABL", u"ABN", u"ABX", u"AP", u"AP$", u"AP+AP", u"AT", u"BE",
+   u"BED", u"BED*", u"BEDZ", u"BEDZ*", u"BEG", u"BEM", u"BEM*",
+   u"BEN", u"BER", u"BER*", u"BEZ", u"BEZ*", u"CC", u"CD",
+   u"CD$", u"CS", u"DO", u"DO*", u"DO+PPSS", u"DOD", u"DOD*",
+   u"DOZ", u"DOZ*", u"DT", u"DT$", u"DT+BEZ", u"DT+MD", u"DTI",
+   u"DTS", u"DTS+BEZ", u"DTX", u"EX", u"EX+BEZ", u"EX+HVD", u"EX+HVZ",
+   u"EX+MD", u"FW-*", u"FW-AT", u"FW-AT+NN", u"FW-AT+NP", u"FW-BE", u"FW-BER",
+   u"FW-BEZ", u"FW-CC", u"FW-CD", u"FW-CS", u"FW-DT", u"FW-DT+BEZ", u"FW-DTS",
+   u"FW-HV", u"FW-IN", u"FW-IN+AT", u"FW-IN+NN", u"FW-IN+NP", u"FW-JJ",
+   u"FW-JJR", u"FW-JJT", u"FW-NN", u"FW-NN$", u"FW-NNS", u"FW-NP", u"FW-NPS",
+   u"FW-NR", u"FW-OD", u"FW-PN", u"FW-PP$", u"FW-PPL", u"FW-PPL+VBZ",
+   u"FW-PPO",  u"FW-PPO+IN", u"FW-PPS", u"FW-PPSS", u"FW-PPSS+HV", u"FW-QL",
+   u"FW-RB",  u"FW-RB+CC", u"FW-TO+VB", u"FW-UH", u"FW-VB", u"FW-VBD",
+   u"FW-VBG",  u"FW-VBN", u"FW-VBZ", u"FW-WDT", u"FW-WPO", u"FW-WPS", u"HV",
+   u"HV*",  u"HV+TO", u"HVD", u"HVD*", u"HVG", u"HVN", u"HVZ", u"HVZ*", u"IN",
+   u"IN+IN",  u"IN+PPO", u"JJ", u"JJ$", u"JJ+JJ", u"JJR", u"JJR+CS", u"JJS",
+   u"JJT",  u"MD", u"MD*", u"MD+HV", u"MD+PPSS", u"MD+TO", u"NN", u"NN$",
+   u"NN+BEZ", u"NN+HVD", u"NN+HVZ", u"NN+IN", u"NN+MD", u"NN+NN", u"NNS",
+   u"NNS$", u"NNS+MD", u"NP", u"NP$", u"NP+BEZ", u"NP+HVZ", u"NP+MD",
+   u"NPS", u"NPS$", u"NR", u"NR$", u"NR+MD", u"NRS", u"OD",
+   u"PN", u"PN$", u"PN+BEZ", u"PN+HVD", u"PN+HVZ", u"PN+MD", u"PP$",
+   u"PP$$", u"PPL", u"PPLS", u"PPO", u"PPS", u"PPS+BEZ", u"PPS+HVD",
+   u"PPS+HVZ", u"PPS+MD", u"PPSS", u"PPSS+BEM", u"PPSS+BER", u"PPSS+BEZ",
+   u"PPSS+BEZ*", u"PPSS+HV", u"PPSS+HVD", u"PPSS+MD", u"PPSS+VB", u"QL",
+   u"QLP",  u"RB", u"RB$", u"RB+BEZ", u"RB+CS", u"RBR", u"RBR+CS", u"RBT",
+   u"RN",  u"RP", u"RP+IN", u"TO", u"TO+VB", u"UH", u"VB", u"VB+AT",
+   u"VB+IN", u"VB+JJ", u"VB+PPO", u"VB+RP", u"VB+TO", u"VB+VB", u"VBD",
+   u"VBG", u"VBG+TO", u"VBN", u"VBN+TO", u"VBZ", u"WDT", u"WDT+BER",
+   u"WDT+BER+PP", u"WDT+BEZ", u"WDT+DO+PPS", u"WDT+DOD", u"WDT+HVZ", u"WP$",
+   u"WPO", u"WPS", u"WPS+BEZ", u"WPS+HVD", u"WPS+HVZ", u"WPS+MD", u"WQL",
+   u"WRB", u"WRB+BER", u"WRB+BEZ", u"WRB+DO", u"WRB+DOD", u"WRB+DOD*",
+   u"WRB+DOZ", u"WRB+IN", u"WRB+MD",
+   u"(", u")", u"*", u",", u"--", u".", u":"],
+  'upenn':
+  [u"CC", u"CD", u"DT", u"EX", u"FW", u"IN", u"JJ", u"JJR", u"JJS", u"LS",
+   u"MD", u"NN", u"NNP", u"NNPS", u"NNS", u"PDT", u"POS", u"PRP", u"PRP$",
+   u"RB", u"RBR", u"RBS", u"RP", u"SYM", u"TO", u"UH", u"VB", u"VBD", u"VBG",
+   u"VBN", u"VBP", u"VBZ", u"WDT", u"WP", u"WP$", u"WRB",
+   u"``", u"$", u"''", u"(", u")", u",", u"--", u".", u":"]}
+
+TAGSETS['universal']=TAGSETS['univ']
+TAGSETS['penn']=TAGSETS['upenn']
+
+def setup(cat='news',tagset='brown',corpus=brown):
+  return ([list(itertools.chain(iter(SS),
+                                ((word.lower(),tag) for (word,tag) in s)
+                                ,iter(ES)))
+           for s in corpus.tagged_sents(categories=cat,tagset=tagset)],
+          list(itertools.chain(iter(SS), iter(ES),
+                               ((word.lower(),tag) for (word,tag) in
+                                corpus.tagged_words(categories=cat,tagset=tagset)))),
+          TAGSETS[tagset])
+
+def notCurrent(s,missList):
+  global i,n,done
+  if done or (missList[i] is not s):
+    return True
+  else:
+    i+=1
+    if i==n:
+      done=True
+    return False
+
+def splitData(words,wordPercent,sentences,sentPercent):
+  global i,n, done
+  trainWords=random.sample(words,int(wordPercent*len(words)))
+  # random.sample(sentences,int(sentPercent*len(sentences)))
+  trainSents=[s for s in sentences if random.random()<sentPercent]
+  # hack!
+  i=0
+  n=len(trainSents)
+  done=False
+  testSents=[s for s in sentences if notCurrent(s,trainSents)]
+  return trainWords, trainSents, testSents
+
+def pickWords(tagged,percent):
+  #wToT=ConditionalFreqDist(tagged)
+  tToW=ConditionalFreqDist((t,w) for (w,t) in tagged)
+  #print len(tToW[u'ADV'])
+  dd=dict((tag,(lambda wl,p=percent:\
+                wl[:int(p*len(wl))])(
+             sorted(tToW[tag].items(),key=lambda (k,v):v,reverse=True)))
+          for tag in tToW.keys())
+  return dd
+
+(tagged_s,tagged_w,tagset)=setup(tagset='universal')
+
+true_tagged_w=tagged_w[2:] # not SS, SE
+
+wordTokens=FreqDist(word for word,tag in true_tagged_w)
+wordsAsSuch=list(wordTokens.keys())
+print len(wordTokens), wordTokens.N()
+
+(trainTags,trainHMM,testHMM)=splitData(true_tagged_w,trainTagsPercent,
+                                       tagged_s,trainHMMPercent)
+
+knownWords=pickWords(trainTags,knownWordsPercent)
+
+class SubsetFreqDist(FreqDist):
+  def __init__(self,pairs,baseset,basecount=.05):
+    dict.update(self,pairs)
+    self._baseset=baseset
+    self._basecount=basecount
+    pn=sum(n for w,n in pairs)
+    self._N=pn+((len(baseset)-len(pairs))*basecount)
+
+  def __getitem__(self,key):
+    return dict.__getitem__(self,key)
+
+  def __missing__(self,key):
+    if key in self._baseset:
+      return self._basecount
+    else:
+      return 0
+
+  def N(self):
+    return self._N
+
+class Tag:
+  def __init__(self,tag,wordsAndCounts):
+    self._tag=tag
+    self._wordsAndCounts=wordsAndCounts
+    self._words=set(w for w,n in wordsAndCounts)
+    self._nTokens=sum(n for w,n in wordsAndCounts)
+    self._nTypes=len(self._words)
+
+  def words(self):
+    return self._words
+
+  def buildPD(self,allTokens):
+    self._sfd=SubsetFreqDist(self._wordsAndCounts,allTokens)
+    self._pd=MLEProbDist(self._sfd)
+
+  def getSFD(self):
+    return self._sfd
+
+  def getPD(self):
+    return self._pd
+
+class FixedTag(Tag):
+  def buildPD(self):
+    self._pd=MLEProbDist(FreqDist(dict(self._wordsAndCounts)))
+
+  def getSFD(self):
+    raise NotImplementedError("not implemented for this subclass")
+
+tags=dict((tagName,Tag(tagName,wl)) for tagName,wl in knownWords.items())
+kws=dict((tagName,tag.words()) for tagName,tag in tags.items())
+
+t2=list(filter(None,
+               ((lambda i:False if not i[1] else i)
+                (((tagset[i],tagset[j]),
+                  kws[tagset[i]].intersection(kws[tagset[j]])),)
+                for i in xrange(0,len(tagset)-2)
+                for j in xrange(i+1,len(tagset)-2))))
+
+for tag in tags.values():
+  tag.buildPD(wordTokens)
+
+tags[SST]=FixedTag(SST,[(SSW,1)])
+tags[SST].buildPD()
+tags[EST]=FixedTag(EST,[(ESW,1)])
+tags[EST].buildPD()
+
+priors = MLEProbDist(FreqDist(dict((tag,1 if tag==SST else 0) for tag in tagset)))
+
+transitions = DictionaryConditionalProbDist(
+                dict((state, RandomProbDist(tagset))
+                      for state in tagset))
+
+outputs = DictionaryConditionalProbDist(
+                dict((state, tags[state].getPD())
+                      for state in tagset))
+
+model = HiddenMarkovModelTagger(wordsAsSuch, tagset,
+                transitions, outputs, priors)
+
+print "model", model.evaluate(testHMM), model.totLogProb(testHMM)
+
+nm=HiddenMarkovModelTrainer(states=tagset,symbols=wordsAsSuch)
+
+# Note that, contrary to a naive reading of the documentation,
+#  train_unsupervised expects a sequence of sequences of word/tag pairs;
+#  it just ignores the tags.
+nnm=nm.train_unsupervised(trainHMM,True,model=model,max_iterations=10,testMe=testHMM)
+
+print nnm.totLogProb(testHMM)
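The seeding trick above hinges on SubsetFreqDist: words the dictionary lists for a tag keep their counts, every other word in the base set gets a small pseudo-count, and anything outside the base set gets zero. A self-contained sketch of that arithmetic with made-up counts (not the Brown data used above, and no NLTK needed):

pairs = [('the', 90), ('a', 10)]                 # dictionary entries for one tag
baseset = {'the', 'a', 'an', 'this', 'that'}     # every word type the model may emit
basecount = 0.05                                 # pseudo-count for unseen base-set words

counts = dict(pairs)
# N = 100 observed + (5 - 2) unseen base-set types * 0.05 = 100.15
N = sum(counts.values()) + (len(baseset) - len(counts)) * basecount

def prob(word):
    if word in counts:
        return counts[word] / N
    return basecount / N if word in baseset else 0.0

print(prob('the'), prob('an'), prob('dog'))      # ~0.8987, ~0.0005, 0.0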
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hmm/tinySup.py	Mon Mar 09 16:48:09 2020 +0000
@@ -0,0 +1,66 @@
+'''Trivial test of unsupervised learning with full dictionary supplied
+See fnlp/lectures/12/hmmDNV.xlsx'''
+import nltk, random
+from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer
+from nltk.probability import FreqDist,ConditionalFreqDist
+from nltk.probability import MLEProbDist, RandomProbDist, DictionaryConditionalProbDist
+
+tagset=['<s>','D','N','V','</s>']
+symbols=['<s>','the','sheep','run','</s>']
+sents=[[('<s>','<s>'),('the','D'),('sheep','N'),('run','V'),('</s>','</s>')],
+       [('<s>','<s>'),('sheep','N'),('run','V'),('the','D'),('sheep','N'),('</s>','</s>')],
+       [('<s>','<s>'),('run','V'),('the','D'),('sheep','N'),('</s>','</s>')]]
+
+taglists=[('<s>',[('<s>',1),('the',0),('sheep',0),('run',0),('</s>',0)]),
+         ('D',[('the',1),('sheep',0),('run',0),('<s>',0),('</s>',0)]),
+         ('N',[('the',0),('sheep',.5),('run',.5),('<s>',0),('</s>',0)]),
+         ('V',[('the',0),('sheep',.5),('run',.5),('<s>',0),('</s>',0)]),
+         ('</s>',[('<s>',0),('the',0),('sheep',0),('run',0),('</s>',1)])]
+
+tagdict=dict((k,MLEProbDist(FreqDist(dict(v)))) for k,v in taglists)
+  
+priors = MLEProbDist(FreqDist({'<s>':1,
+         'D':0,
+         'N':0,
+         'V':0,
+         '</s>':0}))
+
+transitions = DictionaryConditionalProbDist(
+                dict((state, RandomProbDist(tagset))
+                      for state in tagset))
+
+outputs = DictionaryConditionalProbDist(tagdict)
+
+
+for tag in tagset:
+  cp=outputs[tag]
+  print tag,sum(cp.prob(s) for s in symbols)
+
+model = HiddenMarkovModelTagger(symbols, tagset,
+                transitions, outputs, priors)
+
+for tag in tagset:
+  cp=model._outputs[tag]
+  print tag,sum(cp.prob(s) for s in symbols)
+
+nm=HiddenMarkovModelTrainer(states=tagset,symbols=symbols)
+
+# Note that, contrary to a naive reading of the documentation,
+#  train_unsupervised expects a sequence of sequences of word/tag pairs;
+#  it just ignores the tags.
+nnm=nm.train_unsupervised(sents,model=model,max_iterations=10,updateOutputs=False)
+
+for tag in tagset:
+  if tag=='</s>':
+    break
+  cp=nnm._transitions[tag]
+  print(("    "+4*"%6s")%tuple(tagset[1:]))
+  print(("%3s: "+4*"%6.3f")%tuple([tag]+[cp.prob(s) for s in tagset[1:]]))
+
+for tag in tagset:
+  cp=nnm._outputs[tag]
+  print(("    "+5*"%6s")%tuple(symbols))
+  x=[cp.prob(s) for s in symbols]
+  print(("%3s: "+5*"%6.3f"+"%11.4e")%tuple([tag]+x+[sum(x)]))
+
+print nnm.evaluate(sents)
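A quick sanity check one might run after the script above (assuming nnm from the same session): NLTK's HiddenMarkovModelTagger.tag takes a bare token list and returns its best tag sequence.

print(nnm.tag(['<s>', 'the', 'sheep', 'run', '</s>']))
# with the dictionary-constrained outputs this should come back close to
# [('<s>', '<s>'), ('the', 'D'), ('sheep', 'N'), ('run', 'V'), ('</s>', '</s>')]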
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mailer.py	Mon Mar 09 16:48:09 2020 +0000
@@ -0,0 +1,415 @@
+#!/usr/bin/python
+'''Attempt at flexible mailout functionality
+Usage: mailer.py [-n] [-s] [-u] [-C cc string] [-c COLSPEC[,COLSPEC]*] [-B bcc string] [-b COLSPEC[,COLSPEC]*] [-S col[,col]*] [-a COLSPEC[,COLSPEC]*] [-A COLSPEC[,COLSPEC]*] [-SA file[,file]*] [-p COLPAT]* COLSPEC[,COLSPEC]* subject {addr-file|-} body-file
+
+Sends the body as a message from me with subject to destinations per
+lines in the addr-file selected by COLSPECs (to:) or -c/-b COLSPECs (Cc:/Bcc:)
+
+-n for dry run, prints to stdout
+-c for Cc column(s)
+-C for static Cc
+-b for Bcc columns(s)
+-B for static Bcc
+-a for attachment file column(s)
+-A for attachment file pattern column(s)
+-SA for static attachment files
+-u Use unicode for attachments
+-s for substitute into body
+-S for columns to substitute as such
+-p for augmentation pattern for a column
+
+COLSPEC is of the form a[:n[:f[:g]]] and selects from addr-file, which must be tsv
+  a gives the column for an email address
+  n (optional) gives column for a name
+  f gives format for the name: FS, SF or S.F for
+      forenames surname (fornames space separated)
+      surname forenames (space separated)
+      surname, forenames (space separated)
+    default is FS
+    _ will be replaced by space in surnames
+  g gives column for gender (for pronouns), m or f
+COLPAT takes the form i:template, where i selects an address column
+and template is a string containing exactly 1 "%s", which is replaced with
+the column value to give the string which will be used for COLSPEC
+references to that column, e.g. 1:S%s@sms.ed.ac.uk
+if column 1 contains bare student numbers
+
+-s enables body substitution.  body may contain
+  %(fi)s  first forename of column i
+  %(si)s  surname
+  %(fsi)s all forenames
+  %(i)s   the undivided original and/or -S col value
+  if there is a supplied gender
+    %(pni)s 'he'/'she'
+    %(pai)s 'him'/'her'
+    %(pgi)s 'his'/'her'
+
+All column indices are 1-origin, as for cut'''
+
+import smtplib, sys, re, os.path, codecs
+from email.mime.text import MIMEText
+
+addrPat=re.compile("<([^>]*)>")
+
+def usage(hint=None):
+  if hint is None:
+    print __doc__
+    exit()
+  else:
+    print >>sys.stderr,"Trouble with your commandline at %s\n  %s"%(hint,
+                                                                    __doc__)
+    exit(1)
+
+def parseCols(specs,where):
+  return [Column(s,where) for s in specs.split(',')]
+
+def parsePat(spec):
+  (c,t)=spec.split(':')
+  c=int(c)
+  found=False
+  for colTab in (ccCols,bccCols,toCols,attCols):
+    if c in colTab:
+      colTab[c].addTemplate(t)
+      found=True
+  if not found:
+    print >>sys.stderr, "Warning, template supplied for column %s, but no use of the column found!"%c
+
+def addrList(addrFields,cols,att=False):
+  global someExpand
+  if att and someExpand:
+    # There were some file patterns
+    return itertools.chain(*(c.fullAddr(addrFields,True) for c in cols.values()))
+  else:
+    return [c.fullAddr(addrFields) for c in cols.values()]
+
+def addrLine(hdr,addrFields,cols):
+  return "%s: %s"%(hdr,", ".join(addrList(addrFields,cols)))
+
+def subDict(addrFields):
+  res={}
+  for c in names.values():
+    c.subDo(addrFields,res)
+  for c in subs.values():
+    if c not in names:
+      c.subDo(addrFields,res)
+  return res
+
+bccCols={}
+ccCols={}
+attCols={}
+toCols={}
+names={}
+subs={}
+CC=[]
+BCC=[]
+rawCols={}
+
+class Column:
+  _expand=False
+  def __init__(self,spec,where):
+    global names, subs
+    parts=spec.split(':')
+    if (len(parts)<1 or len(parts)>4):
+      print >>sys.stderr, "col spec. must have 1--4 :-separated parts: %s"%parts
+      usage('colspec')
+    self.a=int(parts[0])
+    if len(parts)>1:
+      self.n=int(parts[1])
+      if len(parts)>2:
+        self.f=parts[2]
+      else:
+        self.f='FS'
+      if len(parts)>3:
+        self.g=int(parts[3])
+      else:
+        self.g=None
+    else:
+      self.n=None
+    if self.a<=0:
+      print >>sys.stderr, "addr column index %s not allowed -- 1-origin indexing"%self.a
+      exit(2)
+    if self.a in where:
+      print >>sys.stderr, "duplicate column %s"%self.a
+      exit(2)
+    if self.n is not None:
+      if self.n<=0:
+        print >>sys.stderr, "name column index %s not allowed -- 1-origin indexing"%self.n
+        exit(3)
+      if self.n in where:
+        print >>sys.stderr, "can't use column %s as both name and address"%self.n
+        exit(3)
+      if self.n in names:
+        print >>sys.stderr, "attempt to redefine %s from \"%s\" to \"%s\""%(self.n,names[self.n],self)
+        exit(3)
+      if self.f not in ('FS','SF','S.F'):
+        print >>sys.stderr, "name format %s not recognised"%self.f
+        exit(4)
+    where[self.a]=self
+    if self.n is not None:
+      if isinstance(self,RawColumn):
+        subs[self.n]=self
+      else:
+        names[self.n]=self
+
+  def __str__(self):
+    if self.n is None:
+      return str(self.a)
+    else:
+      return "%s:%s"%(self.a,self.n)
+
+  def __repr__(self):
+    return str(self)
+
+  def addTemplate(self,template):
+    try:
+      print >>sys.stderr,"Attempt to overwrite existing template \"%s\" for %s with \"%s\""%(self.template,
+                                                                                             self.a,
+                                                                                             template)
+    except AttributeError:
+      self.template=template
+    
+  def name(self):
+    return self.n
+
+  def expAddr(self,fields):
+    addr=fields[self.a-1]
+    try:
+      return self.template%addr
+    except AttributeError:
+      return addr
+
+  def fullAddr(self,fields,att=False):
+    global someExpand
+    if self.n is None:
+      res=self.expAddr(fields)
+      if att and someExpand:
+        if self._expand:
+          return glob.iglob(res)
+        else:
+          return [res]
+      else:
+        return res
+    else:
+      return '"%s" <%s>'%(fields[self.n-1].replace('_',' '),self.expAddr(fields))    
+
+  def subDo(self,addrFields,dict):
+    f=addrFields[self.n-1]
+    dict[str(self.n)]=f
+    nparts=f.split(' ')
+    if self.f=='FS':
+      sur=nparts.pop()
+    elif self.f=='SF':
+      sur=nparts.pop(0)
+    elif self.f=='S.F':
+      sur=nparts.pop(0)[:-1]
+    fores=nparts
+    dict['fs%s'%self.n]=' '.join(fores)
+    dict['f%s'%self.n]=fores[0]
+    dict['s%s'%self.n]=sur.replace('_',' ')
+    if self.g is not None:
+      gg=addrFields[self.g-1]
+      if gg=='m':
+        dict['pn%s'%self.n]='he'
+        dict['pa%s'%self.n]='him'
+        dict['pg%s'%self.n]='his'
+      elif gg=='f':
+        dict['pn%s'%self.n]='she'
+        dict['pa%s'%self.n]='her'
+        dict['pg%s'%self.n]='her'
+      else:
+        print >>sys.stderr,"Warning, unrecognised gender in column %s: %s"%(self.n,gg)
+
+  def setExpand(self):
+    self._expand=True
+
+class RawColumn(Column):
+  '''Not for person names, just raw text'''
+  
+  def subDo(self,addrFields,dict):
+    f=addrFields[self.n-1]
+    dict[str(self.n)]=f
+
+def doAtt(msg,att,codec):
+  (mt,enc)=mimetypes.guess_type(att)
+  (tp,subtp)=mt.split('/',2)
+  if tp=='text':
+    attf=codecs.open(att,'r',codec)
+    atm=MIMEText(attf.read(),subtp,codec)
+  elif tp=='application':
+    from email.mime.application import MIMEApplication
+    attf=open(att,'r')
+    atm=MIMEApplication(attf.read(),subtp)
+  else:
+    print >>sys.stderr, "Help: Media type %s (for attachment %s) not supported"%(mt,att)
+    exit(5)
+  atm.add_header('Content-Disposition','attachment',
+                 filename=os.path.basename(att))
+  msg.attach(atm)
+
+dryrun=False
+sys.argv.pop(0)
+doSub=False
+pats=[]
+someExpand=False
+codec='iso-8859-1'
+staticAtts=[]
+while sys.argv:
+  if sys.argv[0]=='-n':
+    dryrun=True
+    sys.argv.pop(0)
+  elif sys.argv[0]=='-c' and ccCols=={}:
+    sys.argv.pop(0)
+    if sys.argv:
+      parseCols(sys.argv.pop(0),ccCols)
+    else:
+      usage('cc')
+  elif sys.argv[0]=='-C' and CC==[]:
+    sys.argv.pop(0)
+    if sys.argv:
+      CC=sys.argv.pop(0).split(',')
+    else:
+      usage('CC')
+  elif sys.argv[0]=='-b' and bccCols=={}:
+    sys.argv.pop(0)
+    if sys.argv:
+      parseCols(sys.argv.pop(0),bccCols)
+    else:
+      usage('bcc')
+  elif sys.argv[0]=='-B' and BCC==[]:
+    sys.argv.pop(0)
+    if sys.argv:
+      BCC=sys.argv.pop(0).split(',')
+    else:
+      usage('BCC')
+  elif sys.argv[0] in ('-a','-A','-SA'): # and attCols=={}
+    expand=sys.argv[0]=='-A'
+    static=sys.argv[0]=='-SA'
+    sys.argv.pop(0)
+    if sys.argv:
+      if static:
+        staticAtts=sys.argv.pop(0).split(',')
+      else:
+        pc=parseCols(sys.argv.pop(0),attCols)
+        if expand:
+          import itertools, glob
+          someExpand=True
+          for c in pc:
+            c.setExpand()
+      from email.mime.multipart import MIMEMultipart
+      import mimetypes
+    else:
+      usage('attachment')
+  elif sys.argv[0]=='-u':
+    sys.argv.pop(0)
+    codec='utf-8'
+  elif sys.argv[0]=='-s':
+    sys.argv.pop(0)
+    doSub=True
+  elif sys.argv[0]=='-S' and rawCols=={}:
+    sys.argv.pop(0)
+    if sys.argv:
+      for c in sys.argv.pop(0).split(','):
+        RawColumn("%s:%s"%(c,c),rawCols)
+    else:
+      usage('raw subs')
+  elif sys.argv[0]=='-p':
+    sys.argv.pop(0)
+    if sys.argv:
+      pats.append(sys.argv.pop(0))
+    else:
+      usage('pat')
+  elif sys.argv[0][0]=='-':
+    print sys.argv
+    usage()
+  else:
+    break
+
+if sys.argv:
+  parseCols(sys.argv.pop(0),toCols)
+else:
+  usage('to')
+
+pats=[parsePat(p) for p in pats]
+
+if sys.argv:
+  subj=sys.argv.pop(0)
+else:
+  usage('subj')
+
+if sys.argv:
+  af=sys.argv.pop(0)
+  if af=='-':
+    addrFile=sys.stdin
+  else:
+    try:
+      addrFile=open(af,'r')
+    except:
+      usage('addr: %s'%sys.exc_value)
+else:
+  usage('addr')
+
+if sys.argv:
+  bf=sys.argv.pop(0)
+  try:
+    bodyFile=open(bf,'r')
+  except:
+    usage('body: %s'%sys.exc_value)
+else:
+  usage('body')
+
+try:
+  sig=open("/home/ht/.signature","r")
+  signature=sig.read().rstrip()
+except:
+  signature=None
+
+CS=', '
+body=bodyFile.read().rstrip()
+if not dryrun:
+  mailer=smtplib.SMTP()
+  mailer.connect()
+for l in addrFile:
+  addrFields=l.rstrip().split('\t')
+  if doSub:
+    bodyPlus=body%subDict(addrFields)
+  else:
+    bodyPlus=body
+  if signature is not None:
+    bodyPlus+="\n--\n"
+    bodyPlus+=signature
+  if attCols or staticAtts:
+    msg=MIMEMultipart()
+    msg.attach(MIMEText(bodyPlus))
+  else:
+    msg=MIMEText(bodyPlus)
+  #to=addrLine("To",addrFields,toCols)
+  to=addrList(addrFields,toCols)
+  #msg=to
+  #recips=addrPat.findall(to)
+  msg['To']=CS.join(to)
+  recips=[]+list(to)
+  cc=CC
+  if ccCols:
+    cc+=addrList(addrFields,ccCols)
+  if cc!=[]:
+    msg["Cc"]=CS.join(cc)
+    recips+=list(cc)
+  bcc=BCC
+  if bccCols:
+    bcc+=addrList(addrFields,bccCols)
+  if bcc!=[]:
+    msg["Bcc"]=CS.join(bcc)
+    recips+=list(bcc)
+  msg["Subject"]=subj
+  for att in staticAtts:
+    doAtt(msg,att,codec)
+  if attCols:
+    for att in addrList(addrFields,attCols,True):
+      doAtt(msg,att,codec)
+  if dryrun:
+    print recips
+    print msg.as_string()
+    exit()
+  print "mailing to %s"%recips
+  mailer.sendmail("ht@inf.ed.ac.uk",recips,msg.as_string())
+mailer.quit()
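To make the COLSPEC/COLPAT notation concrete, here is a hypothetical dry run (all names, addresses and file names made up, not part of the changeset). Suppose addrs.tsv holds tab-separated rows of student number, name and gender:

  s1234567	Jane Smith_Doe	f

  mailer.py -n -s -p '1:%s@example.org' 1:2:FS:3 "Marked coursework" addrs.tsv body.txt

Column 1 is the address, expanded by the -p template to s1234567@example.org; column 2 is the name in FS format, so the To: line reads "Jane Smith Doe" <s1234567@example.org>; column 3 supplies the gender. With -s, body.txt may then use %(f2)s for "Jane", %(s2)s for "Smith Doe" and %(pn2)s for "she"; -n prints the assembled message instead of sending it.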
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pdfComments.py	Mon Mar 09 16:48:09 2020 +0000
@@ -0,0 +1,30 @@
+import PyPDF2 as pyPdf, sys
+
+if sys.argv[1]=='-v':
+    verbose=True
+    sys.argv.pop(1)
+else:
+    verbose=False
+
+f = open(sys.argv[1],'rb')
+
+pdf = pyPdf.PdfFileReader(f)
+pgs = pdf.getNumPages()
+key = '/Annots'
+uri = '/URI'
+ank = '/A'
+
+#print pdf.getNamedDestinations()
+
+for pg in range(pgs):
+    print '#',pg
+    p = pdf.getPage(pg)
+    o = p.getObject()
+    #print o.keys()
+    if o.has_key(key):
+        ann = o[key]
+        #print key,ann
+        for a in ann:
+            u = a.getObject()
+            if '/Contents' in u:
+                print "%s: %s"%(u['/Subtype'],u['/Contents'])
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/req_dep.py	Mon Mar 09 16:48:09 2020 +0000
@@ -0,0 +1,48 @@
+#!/usr/bin/python2.7
+import sys
+kids={}
+parents={}
+known=set(())
+l=sys.stdin.readline()
+while l:
+  ff=l.strip().split()
+  if len(ff)!=0:
+    d=ff[0]
+    sys.stdin.readline()
+    pp=sys.stdin.readline().strip().replace(',','').split()[2:]
+    parents[d]=pp
+    known.add(d)
+    for p in pp:
+      known.add(p)
+      try:
+        c=kids[p]
+        c.add(d)
+      except KeyError:
+        c=set([d])
+        kids[p]=c
+  l=sys.stdin.readline()
+roots={}
+still=set(())
+for k in known:
+  if k in parents:
+    still.add(k)
+  else:
+    roots[k]=set([k])
+#print len(parents),len(kids),len(roots),len(still),roots
+print roots.keys()
+while len(still)>0:
+  #print len(still),len(roots)
+  pend=still
+  still=set([])
+  while len(pend)>0:
+    x=pend.pop()
+    #print x,len(pend)
+    for p in parents[x]:
+      try:
+        r=roots[p]
+        roots[x]=roots.get(x,set(())).union(r)
+      except KeyError:
+        if p in parents:
+          still.add(x)
+for x,r in sorted(roots.items()):
+  print x,r
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/simpleCloud.py	Mon Mar 09 16:48:09 2020 +0000
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+"""
+Minimal Example
+===============
+Generating a square wordcloud from a text file using default arguments.
+"""
+
+import sys
+from os import path
+from wordcloud import WordCloud
+from matplotlib import use
+use('pdf')
+
+# Read the whole text.
+text = open(sys.argv[1]).read()
+
+# Generate a word cloud image
+wordcloud = WordCloud().generate(text)
+
+# Display the generated image:
+# the matplotlib way:
+import matplotlib.pyplot as plt
+plt.imshow(wordcloud, interpolation='bilinear')
+plt.axis("off")
+
+# lower max_font_size
+wordcloud = WordCloud(max_font_size=40).generate(text)
+plt.figure()
+plt.imshow(wordcloud, interpolation="bilinear")
+plt.axis("off")
+plt.savefig('/tmp/cloud.pdf')
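A minimal usage sketch (input file name made up):

  python simpleCloud.py speech.txt    # renders nothing on screen; writes /tmp/cloud.pdf

Selecting use('pdf') before pyplot is imported is what lets the script run without a display.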
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/threaDemo.py	Mon Mar 09 16:48:09 2020 +0000
@@ -0,0 +1,39 @@
+#!/usr/bin/python3
+import threading
+from queue import Queue
+import time
+
+# lock to serialize console output
+lock = threading.Lock()
+
+def do_work(item):
+    time.sleep(.1) # pretend to do some lengthy work.
+    # Make sure the whole print completes or threads can mix up output in one line.
+    with lock:
+        print(threading.current_thread().name,item)
+
+# The worker thread pulls an item from the queue and processes it
+def worker():
+    while True:
+        item = q.get()
+        do_work(item)
+        q.task_done()
+
+# Create the queue and thread pool.
+q = Queue()
+for i in range(4):
+     t = threading.Thread(target=worker)
+     t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
+     t.start()
+
+# stuff work items on the queue (in this case, just a number).
+start = time.perf_counter()
+for item in range(20):
+    q.put(item)
+
+q.join()       # block until all tasks are done
+
+# "Work" took .1 seconds per task.
+# 20 tasks serially would be 2 seconds.
+# With 4 threads should be about .5 seconds (contrived because non-CPU intensive "work")
+print('time:',time.perf_counter() - start)
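The queue-plus-daemon-workers pattern above is what concurrent.futures packages up; a minimal equivalent sketch for comparison (same pretend 0.1 s task, not part of the script):

from concurrent.futures import ThreadPoolExecutor
import time

def do_work(item):
    time.sleep(.1)            # same pretend work as above
    return item

start = time.perf_counter()
with ThreadPoolExecutor(max_workers=4) as ex:
    list(ex.map(do_work, range(20)))          # 20 tasks, 4 workers: roughly 0.5 s
print('time:', time.perf_counter() - start)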
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trip2xml.py	Mon Mar 09 16:48:09 2020 +0000
@@ -0,0 +1,93 @@
+#!/usr/bin/python
+'''Convert a screen-scrape from Check My Trip to diary-style XML'''
+import re, sys, datetime
+year="2016"
+leg=re.compile(year+"TO")
+duration=re.compile("([0-9][0-9]) ([A-Za-z]*) %s .*duration"%year)
+plusOne=re.compile(" [+]1 day$")
+flight=re.compile("\\\\| *([^ ]*) *confirmed")
+dep=re.compile("^Dep: (.*)")
+arr=re.compile("^Arr: (.*)")
+CS=", "
+
+cleg=None
+class Leg:
+  def __init__(self):
+    self.flights=[]
+    self.p1=False
+    self.dd=None
+
+  def addFlight(self,flight):
+    self.flights.append(flight)
+
+  def setDD(self,m):
+    print 'sdd'
+    td="%s-%s-%s"%(year,m.group(2),m.group(1))
+    self.ddate=datetime.datetime.strptime(td,"%Y-%B-%d").date()
+    self.dd=self.ddate.isoformat()
+    if self.p1:
+      self.ads=" -- %s"%(self.ddate+datetime.timedelta(1)).isoformat()
+    else:
+      self.ads=""
+
+  def setPlusOne(self):
+    print 'p1'
+    self.p1=True
+
+  def __str__(self):
+    fa=self.flights[0].fa
+    ta=self.flights[-1].ta
+    if len(self.flights)>1:
+      va=" via "+CS.join([f.fa for f in self.flights[1:]])
+    else:
+      va=""
+    ff=CS.join([f.fn for f in self.flights])
+    tt=CS.join(["%s--%s"%(f.dt,f.at) for f in self.flights])
+    return "<item term='%s%s'>%s->%s%s; %s; %s</item>"%(self.dd,self.ads,fa,ta,va,ff,tt)
+
+class Flight():
+  def __init__(self,fn):
+    self.fn=fn # flight number
+
+  def setDep(self,ds):
+    dss=ds.split('|')
+    self.dt=dss[0][:2]+dss[0][3:5]
+    self.fa=dss[1].split()[-1]
+
+  def setArr(self,ax):
+    ass=ax.split('|')
+    self.at=ass[0][:2]+ass[0][3:5]
+    if (ass[0].find('(+1 day)')==6):
+      self.at+="+1"
+    self.ta=ass[1].split()[-1]
+    
+print "<list type='defn'>"
+for l in sys.stdin:
+  if leg.search(l):
+    if cleg is not None:
+      print cleg
+    cleg=Leg()
+  m = flight.search(l)
+  if m:
+    fl=Flight(m.group(1))
+    cleg.addFlight(fl)
+    continue
+  if ((cleg is not None) and
+      (cleg.dd is None)):
+    m=duration.search(l)
+    if m:
+      cleg.setDD(m)
+      continue
+  m=plusOne.search(l)
+  if m:
+    cleg.setPlusOne()
+    continue
+  m=dep.search(l)
+  if m:
+    fl.setDep(m.group(1))
+    continue
+  m=arr.search(l)
+  if m:
+    fl.setArr(m.group(1))
+print cleg
+print "</list>"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trivSelenium.py	Mon Mar 09 16:48:09 2020 +0000
@@ -0,0 +1,41 @@
+# To install the Python client library:
+# pip install -U selenium
+
+# Import the Selenium 2 namespace (aka "webdriver")
+from selenium import webdriver
+
+# iPhone
+#driver = webdriver.Remote(browser_name="iphone", command_executor='http://172.24.101.36:3001/hub')
+
+# Android
+#driver = webdriver.Remote(browser_name="android", command_executor='http://127.0.0.1:8080/hub')
+
+# Google Chrome 
+#driver = webdriver.Chrome()
+
+# Firefox 
+driver = webdriver.Firefox()
+
+# ------------------------------
+# The actual test scenario: Test the codepad.org code execution service.
+
+# Go to codepad.org
+driver.get('http://codepad.org')
+
+# Select the Python language option
+python_link = driver.find_elements_by_xpath("//input[@name='lang' and @value='Python']")[0]
+python_link.click()
+
+# Enter some text!
+text_area = driver.find_element_by_id('textarea')
+text_area.send_keys("print 'Hello,' + ' World!'")
+
+# Submit the form!
+submit_button = driver.find_element_by_name('submit')
+submit_button.click()
+
+# Make this an actual test. Isn't Python beautiful?
+assert "Hello, World!" in driver.get_page_source()
+
+# Close the browser!
+driver.quit()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/wsgi_test.py	Mon Mar 09 16:48:09 2020 +0000
@@ -0,0 +1,25 @@
+from wsgiref.simple_server import make_server
+
+# Every WSGI application must have an application object - a callable
+# object that accepts two arguments. For that purpose, we're going to
+# use a function (note that you're not limited to a function, you can
+# use a class for example). The first argument passed to the function
+# is a dictionary containing CGI-style environment variables and the
+# second variable is the callable object (see PEP 333).
+n = 0
+def hello_world_app(environ, start_response):
+    global n
+    status = '200 OK' # HTTP Status
+    headers = [('Content-type', 'text/plain')] # HTTP Headers
+    start_response(status, headers)
+
+    # The returned object is going to be printed
+    n=n+1
+    return ["Hello World %s"%n]
+
+httpd = make_server('', 8000, hello_world_app)
+print "Serving on port 8000..."
+
+# Serve until process is killed
+httpd.serve_forever()
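A quick way to exercise the counter once the server above is running (separate session, hypothetical; Python 2 to match the script):

import urllib2
for _ in range(3):
    print urllib2.urlopen('http://localhost:8000/').read()
# Hello World 1
# Hello World 2
# Hello World 3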