Mercurial > hg > ooxml
changeset 37:ac3cd8de7a10
towards big rework of tokenisation
author | Henry S. Thompson <ht@markup.co.uk> |
---|---|
date | Tue, 25 Apr 2017 18:30:04 +0100 |
parents | ae605b77d1e4 |
children | 468a6cf8bf0b |
files | notes.txt refs.xsl tokenise.xsl |
diffstat | 3 files changed, 173 insertions(+), 121 deletions(-) [+] |
line wrap: on
line diff
--- a/notes.txt Tue Apr 25 12:24:31 2017 +0100 +++ b/notes.txt Tue Apr 25 18:30:04 2017 +0100 @@ -1,3 +1,33 @@ +Tokenisation patterns, derived from parse.py, derived from + https://sites.google.com/site/e90e50/random-topics/tool-for-parsing-formulas-in-excel + and + parser_formule_with_textbox_v01_2003.xla + linked to therein + +1 ("[^"]*") q + A text (delimited by double quotes) +2 (\{[^}]+}) m + A constant matrix +3 (,) c + A list (function parameter) separator +4 ([^=\-+*/();:,.$<>^!]+(?:\.[^=\-+*/();:,.$<>^!]+)*\() f + A function name followed by an opening parenthesis +5 ([)]) p + A closing parenthesis +6 (^=|\() l + The beginning of the formula or an opening + parenthesis (not part of a function) +7 ((?:(?:'[^']+')|(?:\[[0-9]+\][^!]*)|(?:[a-zA-Z_][a-zA-Z0-9._]*)!)) n + A sheet name (either delimited by single quotes, or + bracketed number plus optional string, + or simple name (syntax is a _guess_)) +8 (\$?[A-Z]+\$?[0-9]+) s or r + A cell reference +9 ([a-zA-Z_\\][a-zA-Z0-9._]*) v + A name (always for a variable?) +10 (.) x + Single characters not matched by the previous patterns +---------- You can't depend on <f si="..." t="shared"/> That is, it's _true_, but you can have a table with shared formulae @@ -65,7 +95,10 @@ references FIXED Solo local vars are recursively dereferenced The definition table is in workbook.xml definedNames/definedName[@name=$name]/. - Sheet name to filename mapping for locals is in workbook.xml sheets/sheet[@name=$sname]/@sheetId + Sheet name to filename mapping for locals is in workbook.xml + sheets/sheet[@name=$sname]/@sheetId + These appear in definedName, single-quoted if (iff?) the sheet name has spaces + (or other specials?) ??? Variables on l or r of ranges are just looked up: if they are complex no recursion is done: the _semantics_ of this case are not clear to me, need a real-life example...
--- a/refs.xsl Tue Apr 25 12:24:31 2017 +0100 +++ b/refs.xsl Tue Apr 25 18:30:04 2017 +0100 @@ -1,125 +1,6 @@ <?xml version='1.0'?> -<!DOCTYPE doc SYSTEM "../../../lib/xml/xsl.dtd" > +<!DOCTYPE xsl:stylesheet SYSTEM "../../../lib/xml/xsl.dtd" > <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="3.0" xmlns:s="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:e="http://markup.co.uk/excel" exclude-result-prefixes="xs s e xf" xmlns="http://markup.co.uk/excel" xmlns:xf="http://www.w3.org/2005/xpath-functions"> - <xsl:param name="sheet-number"/> - <xsl:param name="xlDir"/> - - <xsl:include href="a2n.xsl"/> - - <xsl:variable name="pat1">("[^"]*")|(\{[^}]+})|(,)|([^=\-+*/();:,.$<>^!]+(?:\.[^=\-+*/();:,.$<>^!]+)*\()|([)])|(^=|\()|((?:'[^']+')|(?:\[[0-9]+\][^!]*))|(\$?[A-Z]+\$?[0-9]+)|([a-zA-Z_\\][a-zA-Z0-9._]*)|(.)</xsl:variable> - <xsl:param name="pat" select="$pat1"/><!-- xsl:param for refinement debugging by passing in the pattern --> - - <xsl:variable name="workbook" select="document(concat($xlDir,'/workbook.xml'))/*"/> - <xsl:variable name="sheet-name" select="$workbook/s:sheets/s:sheet[@sheetId=$sheet-number]/@name"/> - - <xsl:function name="e:lookup" as="xs:string*"> - <xsl:param name="name" as="xs:string" required="yes"/> - <xsl:variable name="defn" select="$workbook/s:definedNames/s:definedName[@name=$name]"/> - <xsl:sequence select="let $prefix := concat($sheet-name,'!') - return if ($defn and - starts-with($defn,$prefix)) - then substring-after($defn,$prefix) - else ()"/> - </xsl:function> - - <xsl:function name="e:tokenise" as="array(element(*)*)*"> - <!-- Tokenise a formula, recursively wrt variables --> - <xsl:param name="formula" as="xs:string" required="yes"/> - <!-- The row and column number of the cell whence the formula came --> - <xsl:param name="row" required="yes" as="xs:int"/> - <xsl:param name="col" required="yes" as="xs:int"/> - <xsl:sequence select=" - let $tokens := 
analyze-string($formula,$pat)/xf:match/xf:group - return if ($tokens[@nr=(7,8,9)]) - then - let $n := count($tokens), - $vars := for $i in (1 to $n) return - let $t := $tokens[$i], - $l := $tokens[$i - 1], - $r := $tokens[$i + 1] return - if ($t/@nr=9 and - not($l[@nr=10 and - .=(':','!')]) and - not($r[@nr=10 and .=':'])) - then string($t) - else (), - $defns := for $var in $vars return e:lookup($var), - $recur := for $sub in $defns - return if ($sub) then e:tokenise($sub,$row,$col) - else (), - $singles := for $i in (1 to $n) return - let $t := $tokens[$i], - $l := $tokens[$i - 1], - $r := $tokens[$i + 1] return - if ($t/@nr=8 and - not($l[@nr=10 and - .=(':','!')]) and - not($r[@nr=10 and .=':'])) - then e:single($t,$row,$col,false()) - else (), - $ranges := for $i in (1 to count($tokens)) return - let $t := $tokens[$i] return - if ($t[@nr=10 and .=':' and - not($i gt 2 and - $tokens[$i - 2][@nr=10 and .='!'])]) - then let $l := $tokens[$i - 1], - $r := $tokens[$i + 1] - return e:range(e:single($l, - $row,$col,false()), - e:single($r, - $row,$col,false())) - else (), - $externals := for $i in (1 to count($tokens)) return - let $t := $tokens[$i] return - if ($t/@nr=7 and $tokens[$i+1]='!') - then - let $ext := $t!='[0]', - $ref := e:single($tokens[$i + 2], - $row,$col,$ext), - $res := if ((($i+3) le $n) and - $tokens[$i + 3][@nr=10 and .=':']) - then e:range($ref, - e:single($tokens[$i+4], - $row,$col,$ext)) - else $ref return - if ($ext) - then e:external($t,$res) - else $res - else () - return [($singles,for $a in $recur return $a?1), - ($ranges,for $a in $recur return $a?2), - ($externals,for $a in $recur return $a?3)] - else ()"/> - </xsl:function> - - <xsl:function name="e:single" as="element(*)"> - <xsl:param name="group" as="element(xf:group)"/> - <xsl:param name="row" as="xs:integer"/> - <xsl:param name="col" as="xs:integer"/> - <xsl:param name="external" as="xs:boolean"/> - <xsl:variable name="val" select="if ($group/@nr=9) then e:lookup($group) - else 
string($group)"/> - <xsl:choose> - <xsl:when test="count($val)>0 or not($external)"> - <xsl:sequence select="e:cr($val,$row,$col)"/> - </xsl:when> - <xsl:otherwise> - <v><xsl:value-of select="$group"/></v> - </xsl:otherwise> - </xsl:choose> - </xsl:function> - - <xsl:function name="e:range" as="element(e:r)"> - <xsl:param name="l" as="element(e:s)" required="yes"/> - <xsl:param name="r" as="element(e:s)" required="yes"/> - <r><xsl:copy-of select="$l"/><xsl:copy-of select="$r"/></r> - </xsl:function> - - <xsl:function name="e:external" as="element(e:e)"> - <xsl:param name="source" as="element(xf:group)" required="yes"/> - <xsl:param name="ref" as="element(*)" required="yes"/> - <e s="{$source}"><xsl:sequence select="$ref"/></e> - </xsl:function> <xsl:template match="/"> <refs sheetName="{$sheet-name}"><xsl:apply-templates select="//s:c"/></refs>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tokenise.xsl Tue Apr 25 18:30:04 2017 +0100 @@ -0,0 +1,138 @@ +<?xml version='1.0'?> +<!DOCTYPE xsl:stylesheet SYSTEM "../../../lib/xml/xsl.dtd" > +<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="3.0" xmlns:s="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:e="http://markup.co.uk/excel" exclude-result-prefixes="xs s e xf" xmlns="http://markup.co.uk/excel" xmlns:xf="http://www.w3.org/2005/xpath-functions"> + <xsl:param name="sheet-number"/> + <xsl:param name="xlDir"/> + + <xsl:include href="a2n.xsl"/> + + <xsl:variable name="pat1">("[^"]*")|(\{[^}]+})|(,)|([^=\-+*/();:,.$<>^!]+(?:\.[^=\-+*/();:,.$<>^!]+)*\()|([)])|(^=|\()|((?:(?:'[^']+')|(?:\[[0-9]+\][^!]*)|(?:[a-zA-Z_][a-zA-Z0-9._]*)!))|(\$?[A-Z]+\$?[0-9]+)|([a-zA-Z_\\][a-zA-Z0-9._]*)|(.)</xsl:variable> + <xsl:param name="pat" select="$pat1"/><!-- xsl:param for refinement debugging by passing in the pattern --> + + <xsl:variable name="workbook" select="document(concat($xlDir,'/workbook.xml'))/*"/> + <xsl:variable name="sheet-name" select="$workbook/s:sheets/s:sheet[@sheetId=$sheet-number]/@name"/> + + <xsl:function name="e:lookup" as="xs:string*"><!-- Definition of the workbook defined name $name with the leading "SheetName!" of the current sheet stripped; empty if there is no such definedName or its value does not start with that prefix --> + <xsl:param name="name" as="xs:string" required="yes"/> + <xsl:variable name="defn" select="$workbook/s:definedNames/s:definedName[@name=$name]"/> + <xsl:sequence select="let $prefix := concat($sheet-name,'!') + return if ($defn and + starts-with($defn,$prefix)) + then substring-after($defn,$prefix) + else ()"/> + </xsl:function> + + <xsl:function name="e:tokenise" as="element(*)*"> + <!-- Tokenise a formula, recursively wrt variables + Output is composed of e:* as follows: + c: A list (function parameter) separator + e: An external (variable, cell or range) reference + f: A function name followed by an opening parenthesis + l: The beginning of the formula or an opening paren + m: A constant matrix + p: A close-paren + q: A text 
(delimited by double quotes) + r: A range reference + s: A single-cell reference + v: A variable name [should only occur inside e] + x: Amalgamated single characters not matched by anything else + --> + <xsl:param name="formula" as="xs:string" required="yes"/> + <!-- The row and column number of the cell whence the formula came --> + <xsl:param name="row" required="yes" as="xs:integer"/> + <xsl:param name="col" required="yes" as="xs:integer"/> + <xsl:sequence select=" + let $tokens := analyze-string($formula,$pat)/xf:match/xf:group + return e:tok1($tokens,count($tokens),1,$row,$col,())"/> + </xsl:function> + + <xsl:function name="e:tok1" as="element(*)*"><!-- Recursive driver: expands the token at index $i via e:expand, appends the result to $soFar and continues from the index e:expand returns; yields $soFar once $i exceeds $n --> + <xsl:param name="tokens" as="element(xf:group)*" required="yes"/> + <xsl:param name="n" required="yes" as="xs:integer"/> + <xsl:param name="i" required="yes" as="xs:integer"/> + <xsl:param name="row" required="yes" as="xs:integer"/> + <xsl:param name="col" required="yes" as="xs:integer"/> + <xsl:param name="soFar" required="yes" as="element(*)*"/> + <xsl:sequence select=" + if ($i gt $n) + then $soFar + else + let $next := e:expand($tokens,$i,true(),$row,$col), + $j := $next?1, + $res := $next?2 return + e:tok1($tokens,$n,$j,$row,$col,($soFar,$res))"/> + </xsl:function> + + <xsl:function name="e:expand" as="array(*)"><!-- Expand the token at index $i into an array [index of the next token to process, resulting e:* element(s)], the shape e:tok1 unpacks with ?1 and ?2. $local is false when the token follows a sheet-name prefix naming another sheet. NOTE(review): e:amalgamate is not defined in this stylesheet, and e:range, e:single and e:external are called here with 5, 3 and 1 arguments but are declared below with 2, 4 and 2; these calls still need reconciling (or must come from a2n.xsl, TODO confirm) --> + <xsl:param name="tokens" required="yes" as="element(xf:group)*"/> + <xsl:param name="i" required="yes" as="xs:integer"/> + <xsl:param name="local" required="yes" as="xs:boolean"/> + <xsl:param name="row" required="yes" as="xs:integer"/> + <xsl:param name="col" required="yes" as="xs:integer"/> + <xsl:sequence select=" + let $t := $tokens[$i], $ext := not($local), + $r := $tokens[$i + 1] return + if ($t/@nr=1) then e:exp1($i,'q',string($t)) + else if ($t/@nr=2) then e:exp1($i,'m',string($t)) + else if ($t/@nr=3) then e:exp1($i,'c',',') + else if ($t/@nr=4) then e:exp1($i,'f',string($t)) + else if ($t/@nr=5) then e:exp1($i,'p',')') + else if ($t/@nr=6) then e:exp1($i,'l',string($t)) + else if ($t/@nr=7) + then if 
(substring-before($t,'!')=('[0]',$sheet-name)) + then (: it's a local reference after all :) + e:expand($tokens,$i+1,true(),$row,$col) + else let $sub := e:expand($tokens,$i+1,false(),$row,$col) return + [$sub?1,e:external($sub?2)] + else if ($t/@nr=10) then e:amalgamate($tokens,$i+1,string($t)) + else if ($r[@nr=10 and .=':']) + then (: a range, takes priority :) + e:range($tokens,$i,$ext,$row,$col) + else if ($t/@nr=8) then e:single($i,$ext,string($t)) + else if ($t/@nr=9) + then if ($ext) then (: can't expand :) e:exp1($i,'v',string($t)) + else [$i+1,e:tokenise(e:lookup(string($t)),$row,$col)] + else (: shouldn't ever get here; step past the token so e:tok1 still terminates :) [$i+1,()]"/> + </xsl:function> + + <xsl:function name="e:exp1" as="array(*)"><!-- Build an e:$name element containing $val and pair it with the index of the following token: [$i+1, element] --> + <xsl:param name="i" as="xs:integer"/> + <xsl:param name="name" as="xs:string"/> + <xsl:param name="val" as="xs:string"/> + <xsl:variable name="elt" as="element()"> + <xsl:element name="{$name}" namespace="http://markup.co.uk/excel"> + <xsl:value-of select="$val"/> + </xsl:element> + </xsl:variable> + <xsl:sequence select="[$i+1,$elt]"/> + </xsl:function> + + <xsl:function name="e:single" as="element(*)"><!-- Cell reference for one matched group: an nr=9 (name) group is first dereferenced with e:lookup, then the value goes to e:cr (not defined in this file; presumably in a2n.xsl, TODO confirm). An external name with no local definition is emitted unresolved as e:v --> + <xsl:param name="group" as="element(xf:group)"/> + <xsl:param name="row" as="xs:integer"/> + <xsl:param name="col" as="xs:integer"/> + <xsl:param name="external" as="xs:boolean"/> + <xsl:variable name="val" select="if ($group/@nr=9) then e:lookup($group) + else string($group)"/> + <xsl:choose> + <xsl:when test="count($val)>0 or not($external)"> + <xsl:sequence select="e:cr($val,$row,$col)"/> + </xsl:when> + <xsl:otherwise> + <v><xsl:value-of select="$group"/></v> + </xsl:otherwise> + </xsl:choose> + </xsl:function> + + <xsl:function name="e:range" as="element(e:r)"><!-- Wrap two e:s single-cell references in an e:r range element --> + <xsl:param name="l" as="element(e:s)" required="yes"/> + <xsl:param name="r" as="element(e:s)" required="yes"/> + <r><xsl:copy-of select="$l"/><xsl:copy-of select="$r"/></r> + </xsl:function> + + <xsl:function name="e:external" as="element(e:e)"><!-- Wrap $ref in an e:e element whose s attribute carries the sheet-name token $source --> + <xsl:param name="source" as="element(xf:group)" required="yes"/> + 
<xsl:param name="ref" as="element(*)" required="yes"/> + <e s="{$source}"><xsl:sequence select="$ref"/></e> + </xsl:function> +</xsl:stylesheet>