Mercurial > hg > ooxml
view parse.py @ 51:793496d2d165
allow command line input
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 16 May 2017 17:27:19 +0100 |
parents | e500d7c18aad |
children |
line wrap: on
line source
#!/usr/bin/python2.7 ''' Tokenise Excel formulae Starting from regexps and concat rules in https://sites.google.com/site/e90e50/random-topics/tool-for-parsing-formulas-in-excel and parser_formule_with_textbox_v01_2003.xla linked to therein''' import sys,re cw = "=\-+*/();:,.$<>^!" #cWW = "[=\-+*/();:,.$<>^]" sListSeparator = "," sRowSeparator = ";" # If comma is decimal symbol, then \ is used in place of comma in Array. # If semi-colon is decimal symbol, then \ is used in place of semi-colon # in Array. (Semi-colon is row separator) # See https://www.ablebits.com/office-addins-blog/2015/02/25/array-formulas-functions-excel/ pats=["\"[^\"]*\"", "\{[^}]+}", sListSeparator, "[^" + cw + "]+(?:\.[^" + cw + "]+)*\\(", "\\)", "^=|\\(", "'[^']+'", #((?:'[^']+')|(?:\[[0-9]+\][^!]*)) "\$?[A-Z]+\$?[0-9]+", "[a-zA-Z_\\\\][a-zA-Z0-9._]*", "."] ## They perform the following tasks, in order: ## 1. Represents a text (delimited by double quotes) ## 2. Represents a constant matrix ## 3. Represents a list (function parameter) separator ## 4. Represents a function name followed by an opening parenthesis ## 5. Represents a closing parenthesis ## 6. Represents the beginning of the formula or an opening ## parenthesis (not part of a function) ## 7. A sheet name (delimited by single quotes) and a ! ## 8. A cell reference ## 9. A name ## 10. Each characters not matched by the previous patterns # As it stands this combines operators with following refs or vars # Should we try ref="\$?[A-Z]+\$?[0-9]+" and var=[a-zA-Z_\\][a-zA-Z0-9._]* ? # (suitably int'lised) tokPat=re.compile("("+(")|(".join(pats))+")",re.IGNORECASE) f="""=IF(I$1>$C$2,VLOOKUP($C52,GP_input!$C$187:$CM$196,VLOOKUP($C$9,lists!$A$34:$B$39,2,FALSE)+BC$4,FALSE),SUMIF('BEX2011'!$C$32:$C$5000,$B$9&"sl"&$B52&$C52&$B53,'BEX2001'!Q$32))""" if len(sys.argv)>1: f=sys.argv[1] l=tokPat.findall(f) ## Visual basic code: ## set M = RE.Execute(s) ## s = "" ## For Each SM In M ## Set SB = SM.SubMatches ## If Len(SB(0) & SB(6)) Then ## t = SB(0) & SB(6) [HST doesn't understand why atoms ## and strings are concatenated] ## ElseIf Len(SB(1)) Then ## t = Array_Const_Wrap(SB(1), sRowSeparator) & vbCr ## ElseIf Len(SB(2) & SB(5)) Then ## t = SB(2) & SB(5) & vbCr ## ElseIf Len(SB(3)) Then ## t = vbCr & SB(3) & vbCr ## ElseIf Len(SB(4)) Then ## t = vbCr & SB(4) ## End If ## s = s & t ## Next def mergeMatches(l): res="" rtype=None cur="" for txt,cm,sep,ofun,close,opn,sheet,ref,var,misc in l: if txt is not '': if res is not '' and rtype is not 1: yield res rtype=1 res='' res+=txt rtype=1 continue if misc is not '': if res is not '' and rtype is not 2: yield res rtype=2 res='' res+=misc rtype=2 continue if cm is not '': cur=cm elif sep is not '': cur=sep elif ofun is not '': cur=ofun elif close is not '': cur=close elif opn is not '': cur=opn elif sheet is not '': cur=sheet elif ref is not '': cur=ref elif var is not '': cur=var if res is not '': yield res rtype=None res='' yield cur toks=list(mergeMatches(l)) print (''.join(toks))==f