changeset 0:ddd5f7539abc

reconstruct the original more or less
author Henry Thompson <ht@markup.co.uk>
date Fri, 24 Mar 2017 20:27:15 +0000
parents
children 20424d7e99e4
files parse.py
diffstat 1 files changed, 91 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/parse.py	Fri Mar 24 20:27:15 2017 +0000
@@ -0,0 +1,91 @@
+''' Tokenise Excel formulae
+    Starting from regexps and concat rules in
+    https://sites.google.com/site/e90e50/random-topics/tool-for-parsing-formulas-in-excel
+   and
+    parser_formule_with_textbox_v01_2003.xla
+   linked to therein'''
+
+import sys,re
+
+cw = "[^=\-+*/();:,.$<>^]"
+#cWW = "[=\-+*/();:,.$<>^]"
+
+sListSeparator = ","
+sRowSeparator = ";"
+
+# If comma is decimal symbol, then \ is used in place of comma in Array.
+# If semi-colon is decimal symbol, then \ is used in place of semi-colon
+#   in Array. (Semi-colon is row separator)
+# See https://www.ablebits.com/office-addins-blog/2015/02/25/array-formulas-functions-excel/
+
+pats=["\"[^\"]*\"|'[^']*'", ##      "'[^']*'!",
+      "\{[^}]+}",
+      sListSeparator,
+      cw + "+(?:\." + cw + "+)*\\(",
+      "\\)",
+      "^=|\\(",
+      "."]
+
+## They perform the following tasks, in order:
+## 1.    Represents a text (delimited by double quotes) or a sheet
+##        name (delimited by single quotes)
+## 2.    Represents a constant matrix
+## 3.    Represents a list (function parameter) separator
+## 4.    Represents a function name followed by an opening parenthesis
+## 5.    Represents a closing parenthesis
+## 6.    Represents the beginning of the formula or an opening
+##        parenthesis (not part of a function)
+## 7.    Each characters not matched by the previous patterns
+
+tokPat=re.compile("("+(")|(".join(pats))+")",re.IGNORECASE)
+
+f="""=IF(I$1>$C$2,VLOOKUP($C52,GP_input!$C$187:$CM$196,VLOOKUP($C$9,lists!$A$34:$B$39,2,FALSE)+BC$4,FALSE),SUMIF('BEX2011'!$C$32:$C$5000,$B$9&"sl"&$B52&$C52&$B53,'BEX2001'!Q$32))"""
+
+l=tokPat.findall(f)
+
+##  Visual basic code:
+##     set M = RE.Execute(s)
+##     s = ""
+##     For Each SM In M
+##         Set SB = SM.SubMatches
+##         If Len(SB(0) & SB(6)) Then
+##             t = SB(0) & SB(6)       [HST doesn't understand why atoms
+##                                      and strings are concatenated]
+##         ElseIf Len(SB(1)) Then
+##             t = Array_Const_Wrap(SB(1), sRowSeparator) & vbCr
+##         ElseIf Len(SB(2) & SB(5)) Then
+##             t = SB(2) & SB(5) & vbCr
+##         ElseIf Len(SB(3)) Then
+##             t = vbCr & SB(3) & vbCr
+##         ElseIf Len(SB(4)) Then
+##             t = vbCr & SB(4)
+##         End If
+##         s = s & t
+##     Next
+
+def mergeMatches(l):
+  res=""
+  cur=""
+  for txt,cm,sep,ofun,close,opn,misc in l:
+    if txt is not '':
+      res+=txt
+      continue
+    if misc is not '':
+      res+=misc
+      continue
+    if cm is not '':
+      cur=cm
+    elif sep is not '':
+      cur=sep
+    elif ofun is not '':
+      cur=ofun
+    elif close is not '':
+      cur=close
+    elif opn is not '':
+      cur=opn
+    if res is not '':
+      yield res
+      res=''
+    yield cur
+
+toks=list(mergeMatches(l))