changeset 5867:e0f1dfaa821e

Add the non-ASCII digit support, now #'parse-integer can handle it. lisp/ChangeLog addition: 2015-03-08 Aidan Kehoe <kehoea@parhasard.net> * mule/digit.el: New file, specifying integer weights for those decimal digits specified by Unicode, and for the Latin characters in ASCII. * dumped-lisp.el (preloaded-file-list): Dump this file, so those weights are available at runtime. tests/ChangeLog addition: 2015-03-16 Aidan Kehoe <kehoea@parhasard.net> * automated/lisp-tests.el: Extend the tests for #'parse-integer, #'digit-char, #'digit-char-p substantially, testing the Unicode decimal digits in detail. src/ChangeLog addition: 2015-03-08 Aidan Kehoe <kehoea@parhasard.net> * data.c (Fparse_integer): Document a complication that arises when a leading minus sign has a digit weight but is nonetheles interpreted as a minus.
author Aidan Kehoe <kehoea@parhasard.net>
date Mon, 16 Mar 2015 00:28:18 +0000
parents 5ea790936de9
children da732079c58d
files lisp/ChangeLog lisp/dumped-lisp.el man/ChangeLog man/lispref/strings.texi src/ChangeLog src/data.c tests/ChangeLog tests/automated/lisp-tests.el
diffstat 8 files changed, 238 insertions(+), 29 deletions(-) [+]
line wrap: on
line diff
--- a/lisp/ChangeLog	Mon Mar 16 00:11:30 2015 +0000
+++ b/lisp/ChangeLog	Mon Mar 16 00:28:18 2015 +0000
@@ -91,6 +91,14 @@
 	* simple.el (append-message):
 	Update this to reflect a changed message-stack structure.
 
+2015-03-08  Aidan Kehoe  <kehoea@parhasard.net>
+ 
+ 	* mule/digit.el: New file, specifying integer weights for those
+ 	decimal digits specified by Unicode, and for the Latin characters
+ 	in ASCII.
+ 	* dumped-lisp.el (preloaded-file-list):
+ 	Dump this file, so those weights are available at runtime.
+ 
 2015-02-25  Aidan Kehoe  <kehoea@parhasard.net>
 
 	* cl-extra.el (digit-char-p): Moved to data.c.
--- a/lisp/dumped-lisp.el	Mon Mar 16 00:11:30 2015 +0000
+++ b/lisp/dumped-lisp.el	Mon Mar 16 00:28:18 2015 +0000
@@ -233,7 +233,8 @@
 	 "mule/mule-msw-init-late")
 
        (when (featurep 'mule)
-	 "mule/general-late")
+	 '("mule/digit"
+           "mule/general-late"))
 
 ;;; mule-load.el ends here
 
--- a/man/ChangeLog	Mon Mar 16 00:11:30 2015 +0000
+++ b/man/ChangeLog	Mon Mar 16 00:28:18 2015 +0000
@@ -14,6 +14,13 @@
 	Add notes about pixel_to_glyph_translation and related code.
 	(pixel_to_glyph_translation): New section.	
 
+2015-03-08  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* lispref/strings.texi (Predicates for Characters):
+	Document `digit-char-p', `digit-char' here.
+	* lispref/strings.texi (String Conversion):
+	Document `parse-integer' here.
+
 2014-03-28  Jerry James  <james@xemacs.org>
 
 	* Makefile.in: Do not build texinfo files.
--- a/man/lispref/strings.texi	Mon Mar 16 00:11:30 2015 +0000
+++ b/man/lispref/strings.texi	Mon Mar 16 00:28:18 2015 +0000
@@ -297,6 +297,22 @@
 This function returns @code{t} if @var{object} is an integer or character.
 @end defun
 
+@defun digit-char-p character &optional (radix 10) radix-table
+This function returns non-nil if @var{character} represents a digit in base
+@var{radix}. The non-nil value returned is the integer value associated with
+@var{character}. @var{radix} defaults to @code{10}.
+
+@var{radix-table}, if non-nil, specifies a character table mapping from
+characters to their digit values. See the function @code{parse-integer} in
+@ref{String Conversion}.
+@end defun
+
+@defun digit-char weight &optional (radix 10) radix-table
+This function is the inverse of @code{digit-char-p}. Given an integer
+@var{weight}, it returns a character representing @var{weight} when read with
+base @var{radix}. If no such character exists, it returns @code{nil}. 
+@end defun
+
 @node Character Codes, Text Comparison, Predicates for Characters, Strings and Characters
 @section Character Codes
 
@@ -558,6 +574,40 @@
 See also the function @code{format} in @ref{Formatting Strings}.
 @end defun
 
+@defun parse-integer string &key (start 0) end (radix 10) junk-allowed radix-table)
+@cindex parse-integer
+This function returns the fixnum or bignum value represented by @var{string}
+in base @var{radix}. If @var{start} or @var{end} are specified, they describe
+a subsequence of @var{string} to examine. @xref{Sequence Basics, Sequence
+Basics, Sequences, cl}.  It skips whitespace at the beginning and end of the
+digits seen. Leading @code{?+} and @code{?-} are accepted and have the usual
+meaning; there is no provision for recognition of a leading @code{#x} or
+@code{#o} to indicate a non-default @var{radix}, and there is no special
+provision for a trailing @code{.}.
+
+If @var{junk-allowed} is non-nil, @code{parse-integer} allows non-digit,
+non-whitespace characters before or after the number to be parsed.  Otherwise,
+it errors when presented with anything other than an optional series of
+whitespace characters, followed by a non-optional series of digit characters,
+followed by an optional series of whitespace characters.
+
+@code{parse-integer} returns two values, the second of which can be accessed
+by, e.g. @xref{Multiple values, multiple-value-bind, ,}.  If
+@var{junk-allowed} is nil, the second value is always the length of that
+subsequence of @var{string} specified.  Otherwise, it is the index of the
+first non-digit trailing character encountered.
+
+Normally, the digit values for each character encountered are those returned
+by @code{digit-char-p}. Specifying @var{radix-table} allows this to be
+overridden; if it is specified, it is a character table (@pxref{Char Tables})
+mapping from characters to fixnum digit values.  If a given character has both
+a digit value and a whitespace value, the digit value overrides. However, if a
+@code{?-} both has a digit value and is the first character examined, its
+digit value is ignored, it is treated as indicating the number is negative.
+A work-around is to prepend a character having digit value zero to positive
+integers and to double the leading @code{?-} for negative integers.
+@end defun
+
 @defun string-to-number string &optional base
 @cindex string to number
 This function returns the numeric value represented by @var{string},
--- a/src/ChangeLog	Mon Mar 16 00:11:30 2015 +0000
+++ b/src/ChangeLog	Mon Mar 16 00:28:18 2015 +0000
@@ -1,3 +1,9 @@
+2015-03-08  Aidan Kehoe  <kehoea@parhasard.net>
+  
+	* data.c (Fparse_integer):
+	Document a complication that arises when a leading minus sign has
+	a digit weight but is nonetheles interpreted as a minus.
+
 2015-03-15  Aidan Kehoe  <kehoea@parhasard.net>
 
 	* text.h: Make CHAR_CODE_LIMIT available as a #define.
--- a/src/data.c	Mon Mar 16 00:11:30 2015 +0000
+++ b/src/data.c	Mon Mar 16 00:28:18 2015 +0000
@@ -1940,6 +1940,13 @@
 possible values further, and the maximum RADIX available is always the largest
 positive value available RADIX-TABLE.
 
+If RADIX-TABLE (or `digit-fixnum-map') assigns a numeric value to `-', its
+digit value will be ignored if ?- is the first character in a string. The
+number will be treated as negative.  To work around this, double it, or assign
+another character the value zero and prefix non-negative strings with that.  A
+digit value for ?+ will be respected even if it is the first character in the
+representation of an integer.
+
 arguments: (STRING &key (START 0) end (RADIX 10) junk-allowed radix-table)
 */
        (int nargs, Lisp_Object *args))
--- a/tests/ChangeLog	Mon Mar 16 00:11:30 2015 +0000
+++ b/tests/ChangeLog	Mon Mar 16 00:28:18 2015 +0000
@@ -1,3 +1,9 @@
+2015-03-16  Aidan Kehoe  <kehoea@parhasard.net>
+
+	* automated/lisp-tests.el:
+	Extend the tests for #'parse-integer, #'digit-char, #'digit-char-p
+	substantially, testing the Unicode decimal digits in detail.
+
 2015-03-15  Aidan Kehoe  <kehoea@parhasard.net>
 
 	* automated/mule-tests.el (test-chars):
--- a/tests/automated/lisp-tests.el	Mon Mar 16 00:11:30 2015 +0000
+++ b/tests/automated/lisp-tests.el	Mon Mar 16 00:28:18 2015 +0000
@@ -3547,7 +3547,7 @@
 ;;-----------------------------------------------------
 
 (Check-Error wrong-type-argument (parse-integer 123456789))
-(Check-Error wrong-type-argument (parse-integer "123456789" :start -1))
+
 (if (featurep 'bignum)
     (progn
       (Check-Error args-out-of-range
@@ -3566,6 +3566,139 @@
              (parse-integer "123456789" :junk-allowed))
 (Check-Error invalid-keyword-argument
              (parse-integer "123456789" :no-such-keyword t))
+(Check-Error wrong-type-argument (parse-integer "123456789" :start -1))
+(Check-Error invalid-argument (parse-integer "abc"))
+(Check-Error invalid-argument (parse-integer "efz" :radix 16))
+
+(macrolet
+    ((with-digits (ascii alternate script)
+       (let ((tree-alist (list (cons 'old 'new)))
+             (text-alist (mapcar* #'cons ascii alternate)))
+         (list*
+          'progn
+          (cons
+           'progn
+           (loop for ascii-digit across ascii
+                 for non-ascii across alternate
+                 collect `(Assert (eql (digit-char-p ,non-ascii)
+                                   ,(- ascii-digit ?0))
+                           ,(concat "checking `digit-char-p', base-10, "
+                                    script))))
+          (sublis
+           tree-alist 
+           '((Assert (equal (multiple-value-list
+                                (parse-integer "	-123  "))
+                      '(-123 7)))
+             (Assert (equal (multiple-value-list
+                               (parse-integer "-3efz" :radix 16
+                                              :junk-allowed t))
+                      '(-1007 4)))
+             (Assert (equal (multiple-value-list
+                                (parse-integer "zzef9" :radix 16 :start 2))
+                      '(3833 5)))
+             (Assert (equal (multiple-value-list
+                                (parse-integer "0123456789" :radix 8
+                                               :junk-allowed t))
+                      '(342391 8)))
+             (Assert (equal (multiple-value-list
+                                (parse-integer "":junk-allowed t))
+                      '(nil 0)))
+             (Assert (equal (multiple-value-list
+                                (parse-integer "abc" :junk-allowed t))
+                      '(nil 0)))
+             (Check-Error invalid-argument (parse-integer "0123456789"
+                                            :radix 8))
+             (Check-Error invalid-argument (parse-integer "abc"))
+             (Check-Error invalid-argument (parse-integer "efz" :radix 16))
+             
+             ;; In contravention of Common Lisp, we allow both 0 and 1 as
+             ;; values for RADIX, useless as that is.
+             (Assert (equal (multiple-value-list
+                                (parse-integer "00000" :radix 1)) '(0 5))
+              "checking 1 is allowed as a value for RADIX")
+             (Assert (equal (multiple-value-list
+                                (parse-integer "" :radix 0 :junk-allowed t))
+                      '(nil 0))
+              "checking 0 is allowed as a value for RADIX"))
+             :test #'(lambda (new old)
+                       ;; This function replaces any ASCII decimal digits in
+                       ;; any string encountered in the tree with the
+                       ;; non-ASCII digits supplied in ALTERNATE.
+                       (when (and (stringp old) (find-if #'digit-char-p old))
+                         (setf (cdar tree-alist)
+                               (concatenate 'string
+                                            (sublis text-alist
+                                                    (append old nil))))
+                         t))))))
+     (repeat-parse-integer-non-ascii ()
+       (when (featurep 'mule)
+         (cons
+          'progn
+          (loop for (code-point . script)
+            in '((#x0660 . "Arabic-Indic")
+                 (#x06f0 . "Extended Arabic-Indic")
+                 (#x07c0 . "Nko")
+                 (#x0966 . "Devanagari")
+                 (#x09e6 . "Bengali")
+                 (#x0a66 . "Gurmukhi")
+                 (#x0ae6 . "Gujarati")
+                 (#x0b66 . "Oriya")
+                 (#x0be6 . "Tamil")
+                 (#x0c66 . "Telugu")
+                 (#x0ce6 . "Kannada")
+                 (#x0d66 . "Malayalam")
+                 (#x0de6 . "Sinhala Lith")
+                 (#x0e50 . "Thai")
+                 (#x0ed0 . "Lao")
+                 (#x0f20 . "Tibetan")
+                 (#x1040 . "Myanmar")
+                 (#x1090 . "Myanmar Shan")
+                 (#x17e0 . "Khmer")
+                 (#x1810 . "Mongolian")
+                 (#x1946 . "Limbu")
+                 (#x19d0 . "New Tai Lue")
+                 (#x1a80 . "Tai Tham Hora")
+                 (#x1a90 . "Tai Tham Tham")
+                 (#x1b50 . "Balinese")
+                 (#x1bb0 . "Sundanese")
+                 (#x1c40 . "Lepcha")
+                 (#x1c50 . "Ol Chiki")
+                 (#xa620 . "Vai")
+                 (#xa8d0 . "Saurashtra")
+                 (#xa900 . "Kayah Li")
+                 (#xa9d0 . "Javanese")
+                 (#xa9f0 . "Myanmar Tai Laing")
+                 (#xaa50 . "Cham")
+                 (#xabf0 . "Meetei Mayek")
+                 (#xff10 . "Fullwidth")
+                 (#x000104a0 . "Osmanya")
+                 (#x00011066 . "Brahmi")
+                 (#x000110f0 . "Sora Sompeng")
+                 (#x00011136 . "Chakma")
+                 (#x000111d0 . "Sharada")
+                 (#x000112f0 . "Khudawadi")
+                 (#x000114d0 . "Tirhuta")
+                 (#x00011650 . "Modi")
+                 (#x000116c0 . "Takri")
+                 (#x000118e0 . "Warang Citi")
+                 (#x00016a60 . "Mro")
+                 (#x00016b50 . "Pahawh Hmong")
+                 (#x0001d7ce . "Mathematical Bold")
+                 (#x0001d7d8 . "Mathematical Double-Struck")
+                 (#x0001d7e2 . "Mathematical Sans-Serif")
+                 (#x0001d7ec . "Mathematical Sans-Serif Bold")
+                 (#x0001d7f6 . "Mathematical Monospace"))
+            collect `(with-digits "0123456789"
+                      ;; All the Unicode decimal digits have contiguous code
+                      ;; point ranges as documented by the Unicode standard,
+                      ;; we can just increment.
+                      ,(concat (loop for fixnum from code-point
+                                     to (+ code-point 9)
+                                     collect (decode-char 'ucs fixnum))
+                               "")
+		      ,script))))))
+  (with-digits "0123456789" "0123456789" "ASCII")
+  (repeat-parse-integer-non-ascii))
 
 ;; Next two paragraphs of tests from GNU, thank you Leo Liu.
 (Assert (eql (digit-char-p ?3) 3))
@@ -3575,34 +3708,25 @@
 (Check-Error args-out-of-range (digit-char-p ?a 37))
 (Assert (not (digit-char-p ?a 1)))
 
-(Assert (equal (multiple-value-list (parse-integer "	-123  ")) '(-123 7)))
-(Assert (equal (multiple-value-list
-                   (parse-integer "-efz" :radix 16 :junk-allowed t))
-               '(-239 3)))
-(Assert (equal (multiple-value-list (parse-integer "zzef" :radix 16 :start 2))
-               '(239 4)))
-(Assert (equal (multiple-value-list
-                   (parse-integer "0123456789" :radix 8 :junk-allowed t))
-               '(342391 8)))
-(Assert (equal (multiple-value-list (parse-integer "" :junk-allowed t))
-               '(nil 0)))
-(Assert (equal (multiple-value-list (parse-integer "abc" :junk-allowed t))
-               '(nil 0)))
-(Check-Error invalid-argument (parse-integer "0123456789" :radix 8))
-(Check-Error invalid-argument (parse-integer "abc"))
-(Check-Error invalid-argument (parse-integer "efz" :radix 16))
+(loop for fixnum from 0 below 36
+      do (Assert (eql fixnum (digit-char-p (digit-char fixnum 36) 36))))
 
-;; We don't allow a trailing decimal point, as the Lisp reader does.
-(Check-Error invalid-argument (parse-integer "12348."))
-
-;; In contravention of Common Lisp, we allow both 0 and 1 as values for RADIX,
-;; useless as that is.
-(Assert (equal (multiple-value-list (parse-integer "00000" :radix 1)) '(0 5))
-        "checking 1 is allowed as a value for RADIX")
-(Assert (equal (multiple-value-list
-                   (parse-integer "" :radix 0 :junk-allowed t))
-               '(nil 0))
-        "checking 0 is allowed as a value for RADIX")
+(let ((max -1)
+      (min most-positive-fixnum)
+      (max-char nil) (min-char nil))
+  (map-char-table #'(lambda (key value)
+                      (if (> value max)
+                          (setf max value
+                                max-char key))
+                      (if (< value min)
+                          (setf min value
+                                min-char key))
+                      nil)
+                  digit-fixnum-map)
+  (Assert (>= 35 max) "checking base 36 is supported, `digit-fixnum-map'")
+  (Assert (<= min 2) "checking base two supported, `digit-fixnum-map'")
+  (Assert (eql (digit-char-p max-char (1+ max)) max))
+  (Assert (eql (digit-char-p min-char (1+ min)) min))
 
 (let ((binary-table
        (copy-char-table #s(char-table :type generic :default -1 :data ()))))