Mercurial > hg > ooxml

#!/usr/bin/python2.7
''' Tokenise Excel formulae
    Starting from regexps and concat rules in
    https://sites.google.com/site/e90e50/random-topics/tool-for-parsing-formulas-in-excel
   and
    parser_formule_with_textbox_v01_2003.xla
   linked to therein'''

import sys,re

cw = "=\-+*/();:,.$<>^!"
#cWW = "[=\-+*/();:,.$<>^]"

sListSeparator = ","
sRowSeparator = ";"

# If comma is decimal symbol, then \ is used in place of comma in Array.
# If semi-colon is decimal symbol, then \ is used in place of semi-colon
#   in Array. (Semi-colon is row separator)
# See https://www.ablebits.com/office-addins-blog/2015/02/25/array-formulas-functions-excel/

pats=["\"[^\"]*\"",
      "\{[^}]+}",
      sListSeparator,
      "[^" + cw + "]+(?:\.[^" + cw + "]+)*\\(",
      "\\)",
      "^=|\\(",
      "'[^']+'",   #((?:'[^']+')|(?:\[[0-9]+\][^!]*))
      "\$?[A-Z]+\$?[0-9]+",
      "[a-zA-Z_\\\\][a-zA-Z0-9._]*",
      "."]

## They perform the following tasks, in order:
## 1.    Represents a text (delimited by double quotes)
## 2.    Represents a constant matrix
## 3.    Represents a list (function parameter) separator
## 4.    Represents a function name followed by an opening parenthesis
## 5.    Represents a closing parenthesis
## 6.    Represents the beginning of the formula or an opening
##        parenthesis (not part of a function)
## 7.    A sheet name (delimited by single quotes) and a !
## 8.    A cell reference
## 9.    A name
## 10.   Each characters not matched by the previous patterns

# As it stands this combines operators with following refs or vars
# Should we try ref="\$?[A-Z]+\$?[0-9]+" and var=[a-zA-Z_\\][a-zA-Z0-9._]* ?
#  (suitably int'lised)

tokPat=re.compile("("+(")|(".join(pats))+")",re.IGNORECASE)

f="""=IF(I$1>$C$2,VLOOKUP($C52,GP_input!$C$187:$CM$196,VLOOKUP($C$9,lists!$A$34:$B$39,2,FALSE)+BC$4,FALSE),SUMIF('BEX2011'!$C$32:$C$5000,$B$9&"sl"&$B52&$C52&$B53,'BEX2001'!Q$32))"""

if len(sys.argv)>1:
  f=sys.argv[1]

l=tokPat.findall(f)

##  Visual basic code:
##     set M = RE.Execute(s)
##     s = ""
##     For Each SM In M
##         Set SB = SM.SubMatches
##         If Len(SB(0) & SB(6)) Then
##             t = SB(0) & SB(6)       [HST doesn't understand why atoms
##                                      and strings are concatenated]
##         ElseIf Len(SB(1)) Then
##             t = Array_Const_Wrap(SB(1), sRowSeparator) & vbCr
##         ElseIf Len(SB(2) & SB(5)) Then
##             t = SB(2) & SB(5) & vbCr
##         ElseIf Len(SB(3)) Then
##             t = vbCr & SB(3) & vbCr
##         ElseIf Len(SB(4)) Then
##             t = vbCr & SB(4)
##         End If
##         s = s & t
##     Next

def mergeMatches(l):
  res=""
  rtype=None
  cur=""
  for txt,cm,sep,ofun,close,opn,sheet,ref,var,misc in l:
    if txt is not '':
      if res is not '' and rtype is not 1:
        yield res
        rtype=1
        res=''
      res+=txt
      rtype=1
      continue
    if misc is not '':
      if res is not '' and rtype is not 2:
        yield res
        rtype=2
        res=''
      res+=misc
      rtype=2
      continue
    if cm is not '':
      cur=cm
    elif sep is not '':
      cur=sep
    elif ofun is not '':
      cur=ofun
    elif close is not '':
      cur=close
    elif opn is not '':
      cur=opn
    elif sheet is not '':
      cur=sheet
    elif ref is not '':
      cur=ref
    elif var is not '':
      cur=var
    if res is not '':
      yield res
      rtype=None
      res=''
    yield cur

toks=list(mergeMatches(l))
print (''.join(toks))==f
author	Henry S. Thompson <ht@markup.co.uk>
date	Tue, 16 May 2017 17:27:19 +0100
parents	e500d7c18aad
children