Mercurial > hg > ooxml
view parse.py @ 1:20424d7e99e4
handle sheet names more carefully
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Fri, 24 Mar 2017 22:41:28 +0000 |
parents | ddd5f7539abc |
children | 263a1935d37d |
line wrap: on
line source
"""Tokenise Excel formulae.

Starting from regexps and concat rules in
https://sites.google.com/site/e90e50/random-topics/tool-for-parsing-formulas-in-excel
and parser_formule_with_textbox_v01_2003.xla linked to therein.
"""
import re
import sys

# Characters that terminate a name (operators, punctuation, sheet "!").
# Raw strings are used for every regex fragment so that the backslashes
# reach the regex engine unchanged instead of relying on Python passing
# unknown string escapes through.
cw = r"=\-+*/();:,.$<>^!"
#cWW = "[=\-+*/();:,.$<>^]"
sListSeparator = ","
sRowSeparator = ";"
# If comma is decimal symbol, then \ is used in place of comma in Array.
# If semi-colon is decimal symbol, then \ is used in place of semi-colon
# in Array. (Semi-colon is row separator)
# See https://www.ablebits.com/office-addins-blog/2015/02/25/array-formulas-functions-excel/

pats = [
    r'"[^"]*"',
    r"\{[^}]+}",
    # Escaped so that a separator such as "\" or "|" cannot corrupt the regex.
    re.escape(sListSeparator),
    r"[^" + cw + r"]+(?:\.[^" + cw + r"]+)*\(",
    r"\)",
    r"^=|\(",
    r"'[^']+'!",
    r".",
]
## They perform the following tasks, in order:
##  1. Represents a text (delimited by double quotes)
##  2. Represents a constant matrix
##  3. Represents a list (function parameter) separator
##  4. Represents a function name followed by an opening parenthesis
##  5. Represents a closing parenthesis
##  6. Represents the beginning of the formula or an opening
##     parenthesis (not part of a function)
##  7. A sheet name (delimited by single quotes) and a !
##  8. Each character not matched by the previous patterns

# One capturing group per pattern, so findall() returns 8-tuples in which
# exactly one element is non-empty for every match.
tokPat = re.compile("(" + (")|(".join(pats)) + ")", re.IGNORECASE)

f = """=IF(I$1>$C$2,VLOOKUP($C52,GP_input!$C$187:$CM$196,VLOOKUP($C$9,lists!$A$34:$B$39,2,FALSE)+BC$4,FALSE),SUMIF('BEX2011'!$C$32:$C$5000,$B$9&"sl"&$B52&$C52&$B53,'BEX2001'!Q$32))"""

l = tokPat.findall(f)

## Visual basic code:
##   set M = RE.Execute(s)
##   s = ""
##   For Each SM In M
##     Set SB = SM.SubMatches
##     If Len(SB(0) & SB(6)) Then
##       t = SB(0) & SB(6)  [HST doesn't understand why atoms
##                           and strings are concatenated]
##     ElseIf Len(SB(1)) Then
##       t = Array_Const_Wrap(SB(1), sRowSeparator) & vbCr
##     ElseIf Len(SB(2) & SB(5)) Then
##       t = SB(2) & SB(5) & vbCr
##     ElseIf Len(SB(3)) Then
##       t = vbCr & SB(3) & vbCr
##     ElseIf Len(SB(4)) Then
##       t = vbCr & SB(4)
##     End If
##     s = s & t
##   Next


def mergeMatches(l):
    """Merge raw ``tokPat.findall`` matches into formula tokens.

    Consecutive string literals are concatenated into one token, and
    consecutive single characters / quoted sheet prefixes are concatenated
    into one token.  Array constants, list separators, function openers and
    opening/closing parentheses are each yielded as a token of their own and
    end whatever run precedes them.

    Args:
        l: iterable of 8-tuples ``(txt, cm, sep, ofun, close, opn, sheet,
            misc)``, one per regex match, following the group order of
            ``pats``.

    Yields:
        str: the merged tokens, in input order.  A run still pending when
        the input is exhausted is yielded as the final token.
    """
    TEXT, ATOM = 1, 2  # kinds of run that may be under construction
    run = ""
    run_type = None
    for txt, cm, sep, ofun, close, opn, sheet, misc in l:
        if txt:
            piece, piece_type = txt, TEXT
        elif misc or sheet:
            piece, piece_type = misc or sheet, ATOM
        else:
            # Structural token: never merged with its neighbours.
            piece, piece_type = cm or sep or ofun or close or opn, None
            if not piece:
                # No group captured anything; nothing to emit.
                continue
        # A change of kind (or any structural token) closes the open run.
        if run and run_type != piece_type:
            yield run
            run = ""
        if piece_type is None:
            yield piece
            run_type = None
        else:
            run += piece
            run_type = piece_type
    # Flush the trailing run, e.g. the final reference in "=A1+B2".
    if run:
        yield run


toks = list(mergeMatches(l))