# HG changeset patch # User Henry Thompson # Date 1490387235 0 # Node ID ddd5f7539abc984624e3a97733437534d8ab553e reconstruct the original more or less diff -r 000000000000 -r ddd5f7539abc parse.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/parse.py Fri Mar 24 20:27:15 2017 +0000 @@ -0,0 +1,91 @@ +''' Tokenise Excel formulae + Starting from regexps and concat rules in + https://sites.google.com/site/e90e50/random-topics/tool-for-parsing-formulas-in-excel + and + parser_formule_with_textbox_v01_2003.xla + linked to therein''' + +import sys,re + +cw = "[^=\-+*/();:,.$<>^]" +#cWW = "[=\-+*/();:,.$<>^]" + +sListSeparator = "," +sRowSeparator = ";" + +# If comma is decimal symbol, then \ is used in place of comma in Array. +# If semi-colon is decimal symbol, then \ is used in place of semi-colon +# in Array. (Semi-colon is row separator) +# See https://www.ablebits.com/office-addins-blog/2015/02/25/array-formulas-functions-excel/ + +pats=["\"[^\"]*\"|'[^']*'", ## "'[^']*'!", + "\{[^}]+}", + sListSeparator, + cw + "+(?:\." + cw + "+)*\\(", + "\\)", + "^=|\\(", + "."] + +## They perform the following tasks, in order: +## 1. Represents a text (delimited by double quotes) or a sheet +## name (delimited by single quotes) +## 2. Represents a constant matrix +## 3. Represents a list (function parameter) separator +## 4. Represents a function name followed by an opening parenthesis +## 5. Represents a closing parenthesis +## 6. Represents the beginning of the formula or an opening +## parenthesis (not part of a function) +## 7. Each characters not matched by the previous patterns + +tokPat=re.compile("("+(")|(".join(pats))+")",re.IGNORECASE) + +f="""=IF(I$1>$C$2,VLOOKUP($C52,GP_input!$C$187:$CM$196,VLOOKUP($C$9,lists!$A$34:$B$39,2,FALSE)+BC$4,FALSE),SUMIF('BEX2011'!$C$32:$C$5000,$B$9&"sl"&$B52&$C52&$B53,'BEX2001'!Q$32))""" + +l=tokPat.findall(f) + +## Visual basic code: +## set M = RE.Execute(s) +## s = "" +## For Each SM In M +## Set SB = SM.SubMatches +## If Len(SB(0) & SB(6)) Then +## t = SB(0) & SB(6) [HST doesn't understand why atoms +## and strings are concatenated] +## ElseIf Len(SB(1)) Then +## t = Array_Const_Wrap(SB(1), sRowSeparator) & vbCr +## ElseIf Len(SB(2) & SB(5)) Then +## t = SB(2) & SB(5) & vbCr +## ElseIf Len(SB(3)) Then +## t = vbCr & SB(3) & vbCr +## ElseIf Len(SB(4)) Then +## t = vbCr & SB(4) +## End If +## s = s & t +## Next + +def mergeMatches(l): + res="" + cur="" + for txt,cm,sep,ofun,close,opn,misc in l: + if txt is not '': + res+=txt + continue + if misc is not '': + res+=misc + continue + if cm is not '': + cur=cm + elif sep is not '': + cur=sep + elif ofun is not '': + cur=ofun + elif close is not '': + cur=close + elif opn is not '': + cur=opn + if res is not '': + yield res + res='' + yield cur + +toks=list(mergeMatches(l))