changeset 37:ac3cd8de7a10

towards big rework of tokenisation
author Henry S. Thompson <ht@markup.co.uk>
date Tue, 25 Apr 2017 18:30:04 +0100
parents ae605b77d1e4
children 468a6cf8bf0b
files notes.txt refs.xsl tokenise.xsl
diffstat 3 files changed, 173 insertions(+), 121 deletions(-) [+]
line wrap: on
line diff
--- a/notes.txt	Tue Apr 25 12:24:31 2017 +0100
+++ b/notes.txt	Tue Apr 25 18:30:04 2017 +0100
@@ -1,3 +1,33 @@
+Tokenisation patterns, derived from parse.py, derived from 
+ https://sites.google.com/site/e90e50/random-topics/tool-for-parsing-formulas-in-excel
+   and
+ parser_formule_with_textbox_v01_2003.xla
+   linked to therein
+
+1 ("[^"]*") q
+  A text (delimited by double quotes) 
+2 (\{[^}]+}) m
+  A constant matrix
+3 (,) c
+  A list (function parameter) separator
+4 ([^=\-+*/();:,.$&lt;>^!]+(?:\.[^=\-+*/();:,.$&lt;>^!]+)*\() f
+  A function name followed by an opening parenthesis
+5 ([)]) p
+  A closing parenthesis
+6 (^=|\() l
+  The beginning of the formula or an opening
+         parenthesis (not part of a function)
+7 ((?:(?:'[^']+')|(?:\[[0-9]+\][^!]*)|(?:[a-zA-Z_][a-zA-Z0-9._]*)!)) n
+  A sheet name (either delimited by single quotes, or
+                bracketed number plus optional string,
+                or simple name (syntax is a _guess_)) 
+8 (\$?[A-Z]+\$?[0-9]+) s or r
+  A cell reference
+9 ([a-zA-Z_\\][a-zA-Z0-9._]*) v
+  A name (always for a variable?)
+10 (.) x
+  Single characters not matched by the previous patterns
+----------
 You can't depend on 
   <f si="..." t="shared"/>
  That is, it's _true_, but you can have a table with shared formulae
@@ -65,7 +95,10 @@
 references FIXED
  Solo local vars are recursively dereferenced
  The definition table is in workbook.xml definedNames/definedName[@name=$name]/.
-  Sheet name to filename mapping for locals is in workbook.xml sheets/sheet[@name=$sname]/@sheetId
+  Sheet name to filename mapping for locals is in workbook.xml
+    sheets/sheet[@name=$sname]/@sheetId
+    These appear in definedName, single-quoted if (iff?) the sheet name has spaces
+      (or other specials?)
  ??? Variables on l or r of ranges are just looked up: if they are complex
   no recursion is done: the _semantics_ of this case are not clear to
   me, need a real-life example... 
--- a/refs.xsl	Tue Apr 25 12:24:31 2017 +0100
+++ b/refs.xsl	Tue Apr 25 18:30:04 2017 +0100
@@ -1,125 +1,6 @@
 <?xml version='1.0'?>
-<!DOCTYPE doc SYSTEM "../../../lib/xml/xsl.dtd" >
+<!DOCTYPE xsl:stylesheet SYSTEM "../../../lib/xml/xsl.dtd" >
 <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="3.0" xmlns:s="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:e="http://markup.co.uk/excel" exclude-result-prefixes="xs s e xf" xmlns="http://markup.co.uk/excel" xmlns:xf="http://www.w3.org/2005/xpath-functions">
- <xsl:param name="sheet-number"/>
- <xsl:param name="xlDir"/>
- 
- <xsl:include href="a2n.xsl"/>
-
-  <xsl:variable name="pat1">("[^"]*")|(\{[^}]+})|(,)|([^=\-+*/();:,.$&lt;>^!]+(?:\.[^=\-+*/();:,.$&lt;>^!]+)*\()|([)])|(^=|\()|((?:'[^']+')|(?:\[[0-9]+\][^!]*))|(\$?[A-Z]+\$?[0-9]+)|([a-zA-Z_\\][a-zA-Z0-9._]*)|(.)</xsl:variable>
- <xsl:param name="pat" select="$pat1"/><!-- xsl:param for refinement debugging by passing in the pattern -->
- 
- <xsl:variable name="workbook" select="document(concat($xlDir,'/workbook.xml'))/*"/>
- <xsl:variable name="sheet-name" select="$workbook/s:sheets/s:sheet[@sheetId=$sheet-number]/@name"/>
- 
- <xsl:function name="e:lookup" as="xs:string*">
-  <xsl:param name="name" as="xs:string" required="yes"/>
-  <xsl:variable name="defn" select="$workbook/s:definedNames/s:definedName[@name=$name]"/>
-  <xsl:sequence select="let $prefix := concat($sheet-name,'!')
-                   return if ($defn and
-                              starts-with($defn,$prefix))
-                           then substring-after($defn,$prefix)
-                           else ()"/>
- </xsl:function>
-
- <xsl:function name="e:tokenise" as="array(element(*)*)*">
-  <!-- Tokenise a formula, recursively wrt variables -->
-  <xsl:param name="formula" as="xs:string" required="yes"/>
-  <!-- The row and column number of the cell whence the formula came -->
-  <xsl:param name="row" required="yes" as="xs:int"/>
-  <xsl:param name="col" required="yes" as="xs:int"/>
-  <xsl:sequence select="
-    let $tokens := analyze-string($formula,$pat)/xf:match/xf:group
-     return if ($tokens[@nr=(7,8,9)])
-             then 
-              let $n := count($tokens),
-                  $vars := for $i in (1 to $n) return
-                         let $t := $tokens[$i],
-                             $l := $tokens[$i - 1],
-                             $r := $tokens[$i + 1] return    
-                          if ($t/@nr=9 and
-                              not($l[@nr=10 and
-                                     .=(':','!')]) and
-                              not($r[@nr=10 and .=':']))
-                            then string($t)
-                            else (),
-                  $defns := for $var in $vars return e:lookup($var),
-                  $recur := for $sub in $defns 
-                              return if ($sub) then e:tokenise($sub,$row,$col)
-                                               else (),
-                  $singles := for $i in (1 to $n) return
-                            let $t := $tokens[$i],
-                                $l := $tokens[$i - 1],
-                                $r := $tokens[$i + 1] return
-                            if ($t/@nr=8 and
-                                not($l[@nr=10 and
-                                       .=(':','!')]) and
-                                not($r[@nr=10 and .=':']))
-                             then e:single($t,$row,$col,false())
-                             else (),
-                  $ranges := for $i in (1 to count($tokens)) return
-                            let $t := $tokens[$i] return
-                            if ($t[@nr=10 and .=':' and
-                                   not($i gt 2 and
-                                       $tokens[$i - 2][@nr=10 and .='!'])])
-                             then let $l := $tokens[$i - 1],
-                                      $r := $tokens[$i + 1]
-                                      return e:range(e:single($l,
-                                                             $row,$col,false()),
-                                                     e:single($r,
-                                                             $row,$col,false()))
-                             else (),
-                  $externals := for $i in (1 to count($tokens)) return
-                            let $t := $tokens[$i] return
-                            if ($t/@nr=7 and $tokens[$i+1]='!')
-                             then 
-                              let $ext := $t!='[0]',
-                                  $ref := e:single($tokens[$i + 2],
-                                                   $row,$col,$ext),
-                                  $res := if ((($i+3) le $n) and
-                                              $tokens[$i + 3][@nr=10 and .=':'])
-                                           then e:range($ref,
-                                                        e:single($tokens[$i+4],
-                                                                $row,$col,$ext))
-                                           else $ref return
-                              if ($ext)
-                               then e:external($t,$res)
-                               else $res
-                             else ()
-                  return [($singles,for $a in $recur return $a?1),
-                          ($ranges,for $a in $recur return $a?2),
-                          ($externals,for $a in $recur return $a?3)]
-             else ()"/>
- </xsl:function>
- 
- <xsl:function name="e:single" as="element(*)">
-  <xsl:param name="group" as="element(xf:group)"/>
-  <xsl:param name="row" as="xs:integer"/>
-  <xsl:param name="col" as="xs:integer"/>
-  <xsl:param name="external" as="xs:boolean"/>
-  <xsl:variable name="val" select="if ($group/@nr=9) then e:lookup($group)
-                                             else string($group)"/>
-  <xsl:choose>
-   <xsl:when test="count($val)>0 or not($external)">
-    <xsl:sequence select="e:cr($val,$row,$col)"/>
-   </xsl:when>
-   <xsl:otherwise>
-    <v><xsl:value-of select="$group"/></v>
-   </xsl:otherwise>
-  </xsl:choose>  
- </xsl:function>
- 
- <xsl:function name="e:range" as="element(e:r)">
-  <xsl:param name="l" as="element(e:s)" required="yes"/>
-  <xsl:param name="r" as="element(e:s)" required="yes"/>
-  <r><xsl:copy-of select="$l"/><xsl:copy-of select="$r"/></r>
- </xsl:function>
- 
- <xsl:function name="e:external" as="element(e:e)">
-  <xsl:param name="source" as="element(xf:group)" required="yes"/>
-  <xsl:param name="ref" as="element(*)" required="yes"/>
-  <e s="{$source}"><xsl:sequence select="$ref"/></e>
- </xsl:function>
 
  <xsl:template match="/">
   <refs sheetName="{$sheet-name}"><xsl:apply-templates select="//s:c"/></refs>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tokenise.xsl	Tue Apr 25 18:30:04 2017 +0100
@@ -0,0 +1,138 @@
+<?xml version='1.0'?>
+<!DOCTYPE xsl:stylesheet SYSTEM "../../../lib/xml/xsl.dtd" >
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="3.0" xmlns:s="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:e="http://markup.co.uk/excel" exclude-result-prefixes="xs s e xf" xmlns="http://markup.co.uk/excel" xmlns:xf="http://www.w3.org/2005/xpath-functions">
+ <xsl:param name="sheet-number"/><!-- compared with s:sheet/@sheetId in workbook.xml to find this sheet's name -->
+ <xsl:param name="xlDir"/><!-- directory from which workbook.xml is loaded -->
+ 
+ <xsl:include href="a2n.xsl"/>
+ <!-- Tokeniser regex: ten alternatives; the capture-group number (1 to 10) is what e:expand dispatches on. In group 7 the '!' follows the whole sheet-name alternation, so quoted, bracketed and plain sheet names all keep their trailing '!' inside the token. -->
+  <xsl:variable name="pat1">("[^"]*")|(\{[^}]+})|(,)|([^=\-+*/();:,.$&lt;>^!]+(?:\.[^=\-+*/();:,.$&lt;>^!]+)*\()|([)])|(^=|\()|((?:(?:'[^']+')|(?:\[[0-9]+\][^!]*)|(?:[a-zA-Z_][a-zA-Z0-9._]*))!)|(\$?[A-Z]+\$?[0-9]+)|([a-zA-Z_\\][a-zA-Z0-9._]*)|(.)</xsl:variable>
+ <xsl:param name="pat" select="$pat1"/><!-- xsl:param for refinement debugging by passing in the pattern -->
+ <!-- workbook: root element of workbook.xml; sheet-name: @name of the s:sheet whose @sheetId equals $sheet-number -->
+ <xsl:variable name="workbook" select="document(concat($xlDir,'/workbook.xml'))/*"/>
+ <xsl:variable name="sheet-name" select="$workbook/s:sheets/s:sheet[@sheetId=$sheet-number]/@name"/>
+ 
+ <xsl:function name="e:lookup" as="xs:string*">
+  <!-- Resolve a defined name via workbook.xml: yields the text after "<sheet-name>!" when the definition starts with that prefix, otherwise the empty sequence -->
+  <xsl:param name="name" as="xs:string" required="yes"/>
+  <xsl:variable name="ownSheet" select="concat($sheet-name,'!')"/>
+  <xsl:variable name="hit" select="$workbook/s:definedNames/s:definedName[@name=$name]"/>
+  <xsl:sequence select="if (not($hit and starts-with($hit,$ownSheet)))
+                          then ()
+                          else substring-after($hit,$ownSheet)"/>
+ </xsl:function>
+
+ <xsl:function name="e:tokenise" as="element(*)*">
+  <!-- Tokenise a formula, recursively wrt variables
+       Output is composed of e:* as follows:
+       c: A list (function parameter) separator
+       e: An external (variable, cell or range) reference
+       f: A function name followed by an opening parenthesis
+       l: The beginning of the formula or an opening paren
+       m: A constant matrix
+       p: A close-paren
+       q: A text (delimited by double quotes) 
+       r: A range reference
+       s: A single-cell reference
+       v: A variable name [should only occur inside e]
+       x: Amalgamated single characters not matched by anything else
+ -->
+  <xsl:param name="formula" as="xs:string" required="yes"/>
+  <!-- The row and column number of the cell whence the formula came -->
+  <xsl:param name="row" required="yes" as="xs:int"/>
+  <xsl:param name="col" required="yes" as="xs:int"/>
+  <xsl:sequence select="
+     let $tokens := analyze-string($formula,$pat)/xf:match/xf:group
+        return e:tok1($tokens,xs:int(count($tokens)),xs:int(1),$row,$col,())"/><!-- count() and the literal 1 are xs:integer, which is never narrowed to xs:int implicitly, hence the explicit casts -->
+ </xsl:function>
+ 
+ <xsl:function name="e:tok1" as="element(*)*"><!-- Walk $tokens from position $i: append each e:expand result to $soFar and recurse from the index e:expand hands back; returns $soFar once $i exceeds $n -->
+  <xsl:param name="tokens" as="element(xf:group)*" required="yes"/>
+  <xsl:param name="n" required="yes" as="xs:int"/>
+  <xsl:param name="i" required="yes" as="xs:int"/>
+  <xsl:param name="row" required="yes" as="xs:int"/>
+  <xsl:param name="col" required="yes" as="xs:int"/>
+  <xsl:param name="soFar" required="yes" as="element(*)*"/>
+  <xsl:sequence select="
+    if ($i gt $n)
+          then $soFar
+          else 
+            let $next := e:expand($tokens,$i,true(),$row,$col),
+                $j := xs:int($next?1), (: member 1 is computed with integer arithmetic, so it is xs:integer; cast back to match parameter i :)
+                $res := $next?2 return
+            e:tok1($tokens,$n,$j,$row,$col,($soFar,$res))"/>
+ </xsl:function>
+ 
+ <xsl:function name="e:expand" as="array(*)"><!-- Turn the token at $i into output. Returns an array: member 1 is the index of the next unconsumed token, member 2 the element(s) produced, which is the shape e:tok1 unpacks with ?1 and ?2 -->
+  <xsl:param name="tokens" required="yes" as="element(xf:group)*"/><!-- NOTE(review): e:amalgamate is not declared in this stylesheet, and the e:range, e:single and e:external calls below do not match the arities declared further down (2, 4 and 2); presumably the rework is unfinished, confirm the intended signatures -->
+  <xsl:param name="i" required="yes" as="xs:int"/>
+  <xsl:param name="local" required="yes" as="xs:boolean"/><!-- passed as false() for the reference that follows an external sheet prefix -->
+  <xsl:param name="row" required="yes" as="xs:int"/>
+  <xsl:param name="col" required="yes" as="xs:int"/>
+  <xsl:sequence select="
+    let $t := $tokens[$i], $ext := not($local), (: $ext was used below without ever being bound :)
+        $r := $tokens[$i + 1] return
+     if ($t/@nr=1) then e:exp1($i,'q',string($t))
+     else if ($t/@nr=2) then e:exp1($i,'m',string($t))
+     else if ($t/@nr=3) then e:exp1($i,'c',',')
+     else if ($t/@nr=4) then e:exp1($i,'f',string($t))
+     else if ($t/@nr=5) then e:exp1($i,'p',')')
+     else if ($t/@nr=6) then e:exp1($i,'l',string($t))
+     else if ($t/@nr=7)
+       then if (substring-before($t,'!')=('[0]',$sheet-name))
+              then (: it's a local reference after all :)
+               e:expand($tokens,xs:int($i+1),true(),$row,$col)
+              else let $sub := e:expand($tokens,xs:int($i+1),false(),$row,$col) return
+                    [$sub?1,e:external($sub?2)]
+     else if ($t/@nr=10) then e:amalgamate($tokens,$i+1,string($t))
+     else if ($r[@nr=10 and .=':'])
+       then (: a range, takes priority :)
+          e:range($tokens,$i,$ext,$row,$col)
+     else if ($t/@nr=8) then e:single($i,$ext,string($t))
+     else if ($t/@nr=9)
+       then if ($ext) then (: can't expand :) e:exp1($i,'v',string($t))
+       else [xs:int($i+1),e:tokenise(e:lookup(string($t)),$row,$col)] (: wrapped so this branch also yields [next index, result]; an undefined name still fails in e:tokenise :)
+     else (: shouldn't ever get here :) error((),'e:expand: no rule for this token')"/>
+ </xsl:function>
+ 
+ <xsl:function name="e:exp1" as="array(*)"><!-- Build one element named $name in the http://markup.co.uk/excel namespace with $val as its text; returns [$i+1, that element] -->
+  <xsl:param name="i" as="xs:int"/>
+  <xsl:param name="name" as="xs:string"/>
+  <xsl:param name="val" as="xs:string"/>
+  <xsl:variable name="elt" as="element()"><!-- with no as attribute the variable would be a document node wrapping the element, not the element itself -->
+   <xsl:element name="{$name}" namespace="http://markup.co.uk/excel">
+    <xsl:value-of select="$val"/>
+   </xsl:element>
+  </xsl:variable>
+  <xsl:sequence select="[$i+1,$elt]"/>
+ </xsl:function>
+ 
+ <xsl:function name="e:single" as="element(*)"><!-- A group-9 (name) token is first resolved through e:lookup, any other token uses its own text. An external reference whose lookup produced nothing is emitted as a v element holding the raw token; everything else is handed to e:cr (not declared in this file, presumably supplied by the included a2n.xsl) -->
+  <xsl:param name="group" as="element(xf:group)"/>
+  <xsl:param name="row" as="xs:integer"/>
+  <xsl:param name="col" as="xs:integer"/>
+  <xsl:param name="external" as="xs:boolean"/>
+  <xsl:variable name="target" select="if (not($group/@nr=9)) then string($group)
+                                         else e:lookup($group)"/>
+  <xsl:choose>
+   <xsl:when test="$external and empty($target)">
+    <v><xsl:value-of select="string($group)"/></v>
+   </xsl:when>
+   <xsl:otherwise>
+    <xsl:sequence select="e:cr($target,$row,$col)"/>
+   </xsl:otherwise>
+  </xsl:choose>
+ </xsl:function>
+ 
+ <xsl:function name="e:range" as="element(e:r)"><!-- Wrap copies of two single-cell references, $l first and then $r, in one r element -->
+  <xsl:param name="l" as="element(e:s)" required="yes"/>
+  <xsl:param name="r" as="element(e:s)" required="yes"/>
+  <r><xsl:sequence select="$l,$r"/></r>
+ </xsl:function>
+ 
+ <xsl:function name="e:external" as="element(e:e)"><!-- Wrap a copy of $ref in an e element whose s attribute is the string value of the $source token -->
+  <xsl:param name="source" as="element(xf:group)" required="yes"/>
+  <xsl:param name="ref" as="element(*)" required="yes"/>
+  <e><xsl:attribute name="s" select="$source"/><xsl:copy-of select="$ref"/></e>
+ </xsl:function>
+</xsl:stylesheet>