changeset 23:bfa38afaea63

change to default ns
author Henry S. Thompson <ht@markup.co.uk>
date Thu, 06 Apr 2017 16:47:53 +0100
parents ca98c74a7cb1
children 87e0d620deea
files notes.txt refs.xsl
diffstat 2 files changed, 83 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/notes.txt	Wed Apr 05 11:57:00 2017 +0100
+++ b/notes.txt	Thu Apr 06 16:47:53 2017 +0100
@@ -45,10 +45,74 @@
 to wait for ascii.xsl or html.xsl.  But only copy type in in rect if
 there was content before.
 -----------
+Using attributes to hold space-separated lists, as in
+refs.xsl output, is risky!
+-----------
 Not handling variables as references.  Not catching external
 references to variables.  Not catching naked [n]! as external
 references.
  Fixed, but not dereferenced vars
  The definition table is in workbook.xml definedNames/definedName[@name=$name]/.
   Sheet name to filename mapping for locals is in workbook.xml sheets/sheet[@name=$sname]/@sheetId
- 
+-----------
+Switch to default namespace in order to reduce size and improve readability
+-----------
+Should put another step after refs.xsl to compute a map from
+distinct-values of all targets to all the cells which use them
+(likewise ranges).  That really does mean we should move to elts for
+each ref or range, since at this point we want to compute vector
+representation as well, so we can identify projections
+
+Slightly irritating that we'll have to serialise this as XML and then
+re-build it later...
+-----------
+ Overgenerating in kenneth_lay__19506: e.g. <e:ref c="E9" er="[1]!'.SPX' '.SPX'!"/>
+      from <f>[1]!'.SPX'</f>
+ Hmm.  This cell displays in Excel as REUTERS|IDN!.SPX
+       The indirections work as follows:
+       in workbook.xml:
+        <externalReferences>
+         <externalReference r:id="rId3"/>
+         <externalReference r:id="rId4"/>
+        </externalReferences>
+       in _rels/workbook.xml.rels
+        <Relationship Id="rId3" Target="externalLinks/externalLink1.xml" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/externalLink"/>
+       in externalLinks/externalLink1.xml
+        <ddeLink ddeService="REUTER" ddeTopic="IDN"...
+         <ddeItems>
+	  ...
+	  <ddeItem advise="1" name=".SPX">
+	  <values>
+	  <value>
+	  <val>1264.96</val>
+	  </value>
+	  </values>
+	  </ddeItem>
+       Whew!
+----------
+Tried the largest sheet from the largest .xlsx I could find:
+  fuse1k/'benjamin_rogers__1002__NYISO Price Information version 2'.xlsx
+   -rw-r--r-- 1 ht None  6273325 Apr  3 16:22 '../benjamin_rogers__1002__NYISO Price Information version 2.xlsx'
+   -rw-r--r-- 1 ht None 23221149 Jan  1  1980  xl/worksheets/sheet3.xml
+
+  > lxcount xl/worksheets/sheet3.xml | sort -k2nr
+  *Total* 1230217
+  c       596032
+  v       595876
+  f       19201
+  row     18985
+  col     106
+
+    <dimension ref="A1:DY18985"/>
+
+Blew java out of the water :-(
+  java.lang.OutOfMemoryError: Java heap space
+
+Need to try again with more memory, if I remember how...
+
+The raw result is going to have 18985 x 102 == 2 million cells ==
+(assuming average cell size of 30 bytes and row overhead of 20: (*
+18985 (+ 20 (* 102 30)))) 58,473,800 bytes, which is big but tolerable...
+----------------
+Back to ranges - 
+
--- a/refs.xsl	Wed Apr 05 11:57:00 2017 +0100
+++ b/refs.xsl	Thu Apr 06 16:47:53 2017 +0100
@@ -1,11 +1,11 @@
 <?xml version='1.0'?>
 <!DOCTYPE doc SYSTEM "../../../lib/xml/xsl.dtd" >
-<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="3.0" xmlns:s="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:e="http://markup.co.uk/excel" exclude-result-prefixes="xs s" xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:xf="http://www.w3.org/2005/xpath-functions">
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="3.0" xmlns:s="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:e="http://markup.co.uk/excel" exclude-result-prefixes="xs s e xf" xmlns="http://markup.co.uk/excel" xmlns:xf="http://www.w3.org/2005/xpath-functions">
   <xsl:variable name="pat1">("[^"]*")|(\{[^}]+})|(,)|([^=\-+*/();:,.$&lt;>^!]+(?:\.[^=\-+*/();:,.$&lt;>^!]+)*\()|([)])|(^=|\()|((?:'[^']+')|(?:\[[0-9]+\][^!]*))|(\$?[A-Z]+\$?[0-9]+)|([a-zA-Z_\\][a-zA-Z0-9._]*)|(.)</xsl:variable>
  <xsl:param name="pat" select="$pat1"/>
 
  <xsl:template match="/">
-  <e:refs><xsl:apply-templates select="//s:c"/></e:refs>
+  <refs><xsl:apply-templates select="//s:c"/></refs>
  </xsl:template>
  
  <xsl:template match="s:c[s:f]">
@@ -16,9 +16,14 @@
   <xsl:if test="$tokens[@nr=(7,8,9)]">
    <xsl:variable name="n" select="count($tokens)"/>
    <xsl:variable name="singles" select="for $i in (1 to $n)
-       return if ($tokens[$i][@nr=(8,9)] and not($tokens[$i - 1][@nr=10 and .=(':','!')]) and not($tokens[$i + 1][@nr=10 and .=':']))
+       return if ($tokens[$i][@nr=(8,9)] and
+                  not($tokens[$i - 1][@nr=10 and
+                      .=(':','!')]) and
+                  not($tokens[$i + 1][@nr=10 and .=':']))
             then translate($tokens[$i],'$','')
             else ()"/>
+   <!-- Note that we don't bother to treat external ranges as ranges,
+          since we're not going to try to detect cross-document refs -->
    <xsl:variable name="ranges" select="for $i in (1 to count($tokens))
           return if ($tokens[$i][@nr=10 and .=':' and
                                 not($i gt 2 and
@@ -30,11 +35,19 @@
             then 
               let $bit := concat($tokens[$i],'!',
                                  translate($tokens[$i + 2],'$',''))
-              return if ((($i+3) le $n) and $tokens[$i + 3][@nr=10 and .=':'])
+              return if ((($i+3) le $n) and
+                         $tokens[$i + 3][@nr=10 and .=':'])
                 then concat($bit,':',translate($tokens[$i + 4],'$',''))
                 else $bit
             else ()"/>
-   <e:ref c="{@r}" r="{$singles}" rr="{$ranges}" er="{$externals}"/></xsl:if>
+   <ref c="{@r}">
+    <!-- Assumes that space doesn't occur in variable names.
+         It might occur in external names (who knows!) but I'm assuming we're
+           never going to split the value of @er back out... -->
+    <xsl:if test="count($singles)>0"><xsl:attribute name="r"><xsl:value-of select="distinct-values($singles)"/></xsl:attribute></xsl:if>
+    <xsl:if test="count($ranges)>0"><xsl:attribute name="rr"><xsl:value-of select="distinct-values($ranges)"/></xsl:attribute></xsl:if>
+    <xsl:if test="count($externals)>0"><xsl:attribute name="er"><xsl:value-of select="distinct-values($externals)"/></xsl:attribute></xsl:if>
+   </ref></xsl:if>
  </xsl:template>
  
  <xsl:template match="s:c"/>