view parse.py @ 1:20424d7e99e4

handle sheet names more carefully
author Henry S. Thompson <ht@markup.co.uk>
date Fri, 24 Mar 2017 22:41:28 +0000
parents ddd5f7539abc
children 263a1935d37d
line wrap: on
line source

''' Tokenise Excel formulae
    Starting from regexps and concat rules in
    https://sites.google.com/site/e90e50/random-topics/tool-for-parsing-formulas-in-excel
   and
    parser_formule_with_textbox_v01_2003.xla
   linked to therein'''

import sys,re

# Characters excluded from function names by the function-name pattern
# in pats below.  The value is spliced into a negated character class,
# which is why the '-' carries a backslash.
cw = r"=\-+*/();:,.$<>^!"
#cWW = "[=\-+*/();:,.$<>^]"

sListSeparator = ","
sRowSeparator = ";"

# If comma is decimal symbol, then \ is used in place of comma in Array.
# If semi-colon is decimal symbol, then \ is used in place of semi-colon
#   in Array. (Semi-colon is row separator)
# See https://www.ablebits.com/office-addins-blog/2015/02/25/array-formulas-functions-excel/

# Regexp fragments, one per kind of token, in priority order.  Raw
# strings keep the backslashes meant for the regexp engine out of the
# way of Python's own escape processing.  The list separator is escaped
# so that a separator which is a regexp metacharacter (e.g. the \ case
# described above) is matched literally instead of corrupting the
# pattern.
pats=[r'"[^"]*"',
      r"\{[^}]+}",
      re.escape(sListSeparator),
      "[^" + cw + r"]+(?:\.[^" + cw + r"]+)*\(",
      r"\)",
      r"^=|\(",
      "'[^']+'!",
      "."]

## The patterns in pats perform the following tasks, in order:
## 1.    Matches a text string (delimited by double quotes)
## 2.    Matches a constant matrix (delimited by braces)
## 3.    Matches a list (function parameter) separator
## 4.    Matches a function name followed by an opening parenthesis
## 5.    Matches a closing parenthesis
## 6.    Matches the beginning of the formula or an opening
##        parenthesis (not part of a function)
## 7.    Matches a sheet name (delimited by single quotes) and a !
## 8.    Matches each character not matched by the previous patterns


# Wrap every entry of pats in its own capturing group and join the
# groups as alternatives, so each match reports which pattern fired
tokPat=re.compile("|".join("(%s)"%p for p in pats),re.IGNORECASE)

# Sample formula used to exercise the tokeniser
f="""=IF(I$1>$C$2,VLOOKUP($C52,GP_input!$C$187:$CM$196,VLOOKUP($C$9,lists!$A$34:$B$39,2,FALSE)+BC$4,FALSE),SUMIF('BEX2011'!$C$32:$C$5000,$B$9&"sl"&$B52&$C52&$B53,'BEX2001'!Q$32))"""

# One tuple per match, with one member per capturing group of tokPat
l=tokPat.findall(f)

##  Visual basic code:
##     set M = RE.Execute(s)
##     s = ""
##     For Each SM In M
##         Set SB = SM.SubMatches
##         If Len(SB(0) & SB(6)) Then
##             t = SB(0) & SB(6)       [HST doesn't understand why atoms
##                                      and strings are concatenated]
##         ElseIf Len(SB(1)) Then
##             t = Array_Const_Wrap(SB(1), sRowSeparator) & vbCr
##         ElseIf Len(SB(2) & SB(5)) Then
##             t = SB(2) & SB(5) & vbCr
##         ElseIf Len(SB(3)) Then
##             t = vbCr & SB(3) & vbCr
##         ElseIf Len(SB(4)) Then
##             t = vbCr & SB(4)
##         End If
##         s = s & t
##     Next

def mergeMatches(l):
  '''Merge per-group regexp matches into a stream of formula tokens.

  l is an iterable of 8-tuples of strings, one tuple per match, laid
  out as (txt, cm, sep, ofun, close, opn, sheet, misc); exactly one
  member of each tuple is expected to be non-empty.

  Consecutive txt matches are concatenated into a single token, and so
  are consecutive sheet/misc matches; a txt run and a sheet/misc run
  are never merged with each other.  Every cm, sep, ofun, close or opn
  match is yielded as a token of its own, after any pending run has
  been yielded.  A run still pending when l is exhausted is yielded as
  the final token.  A tuple whose members are all empty ends the
  pending run but contributes no token.

  Yields the tokens (strings) in input order.
  '''
  TEXT,ATOM=1,2  # the two kinds of mergeable run
  run=""         # text accumulated for the run in progress
  runKind=None   # TEXT or ATOM while a run is in progress
  for txt,cm,sep,ofun,close,opn,sheet,misc in l:
    # Emptiness is tested by truth value: identity tests against ''
    # only work when the interpreter happens to share the empty string
    if txt:
      piece,kind=txt,TEXT
    elif misc or sheet:
      piece,kind=(misc or sheet),ATOM
    else:
      piece,kind=(cm or sep or ofun or close or opn),None
    if kind is None:
      # Stand-alone token: it terminates whatever run was building up
      if run:
        yield run
        run=""
      runKind=None
      if piece:
        yield piece
      continue
    if run and runKind!=kind:
      # Switching between a text run and an atom run
      yield run
      run=""
    run+=piece
    runKind=kind
  if run:
    # Don't lose a run that reaches the end of the formula
    yield run

# Collect the merged tokens for the sample formula
toks=[]
toks.extend(mergeMatches(l))