Mercurial > hg > ooxml
view parse.py @ 0:ddd5f7539abc
reconstruct the original more or less
author | Henry Thompson <ht@markup.co.uk> |
---|---|
date | Fri, 24 Mar 2017 20:27:15 +0000 |
parents | |
children | 20424d7e99e4 |
line wrap: on
line source
''' Tokenise Excel formulae Starting from regexps and concat rules in https://sites.google.com/site/e90e50/random-topics/tool-for-parsing-formulas-in-excel and parser_formule_with_textbox_v01_2003.xla linked to therein''' import sys,re cw = "[^=\-+*/();:,.$<>^]" #cWW = "[=\-+*/();:,.$<>^]" sListSeparator = "," sRowSeparator = ";" # If comma is decimal symbol, then \ is used in place of comma in Array. # If semi-colon is decimal symbol, then \ is used in place of semi-colon # in Array. (Semi-colon is row separator) # See https://www.ablebits.com/office-addins-blog/2015/02/25/array-formulas-functions-excel/ pats=["\"[^\"]*\"|'[^']*'", ## "'[^']*'!", "\{[^}]+}", sListSeparator, cw + "+(?:\." + cw + "+)*\\(", "\\)", "^=|\\(", "."] ## They perform the following tasks, in order: ## 1. Represents a text (delimited by double quotes) or a sheet ## name (delimited by single quotes) ## 2. Represents a constant matrix ## 3. Represents a list (function parameter) separator ## 4. Represents a function name followed by an opening parenthesis ## 5. Represents a closing parenthesis ## 6. Represents the beginning of the formula or an opening ## parenthesis (not part of a function) ## 7. Each characters not matched by the previous patterns tokPat=re.compile("("+(")|(".join(pats))+")",re.IGNORECASE) f="""=IF(I$1>$C$2,VLOOKUP($C52,GP_input!$C$187:$CM$196,VLOOKUP($C$9,lists!$A$34:$B$39,2,FALSE)+BC$4,FALSE),SUMIF('BEX2011'!$C$32:$C$5000,$B$9&"sl"&$B52&$C52&$B53,'BEX2001'!Q$32))""" l=tokPat.findall(f) ## Visual basic code: ## set M = RE.Execute(s) ## s = "" ## For Each SM In M ## Set SB = SM.SubMatches ## If Len(SB(0) & SB(6)) Then ## t = SB(0) & SB(6) [HST doesn't understand why atoms ## and strings are concatenated] ## ElseIf Len(SB(1)) Then ## t = Array_Const_Wrap(SB(1), sRowSeparator) & vbCr ## ElseIf Len(SB(2) & SB(5)) Then ## t = SB(2) & SB(5) & vbCr ## ElseIf Len(SB(3)) Then ## t = vbCr & SB(3) & vbCr ## ElseIf Len(SB(4)) Then ## t = vbCr & SB(4) ## End If ## s = s & t ## Next def mergeMatches(l): res="" cur="" for txt,cm,sep,ofun,close,opn,misc in l: if txt is not '': res+=txt continue if misc is not '': res+=misc continue if cm is not '': cur=cm elif sep is not '': cur=sep elif ofun is not '': cur=ofun elif close is not '': cur=close elif opn is not '': cur=opn if res is not '': yield res res='' yield cur toks=list(mergeMatches(l))