diff src/unicode.c @ 5776:65d65b52d608

Pass character count from coding systems to buffer insertion code. src/ChangeLog addition: 2014-01-16 Aidan Kehoe <kehoea@parhasard.net> Pass character count information from the no-conversion and unicode coding systems to the buffer insertion code, making #'find-file on large buffers a little snappier (if ERROR_CHECK_TEXT is not defined). * file-coding.c: * file-coding.c (coding_character_tell): New. * file-coding.c (conversion_coding_stream_description): New. * file-coding.c (no_conversion_convert): Update characters_seen when decoding. * file-coding.c (no_conversion_character_tell): New. * file-coding.c (lstream_type_create_file_coding): Create the no_conversion type with data. * file-coding.c (coding_system_type_create): Make the character_tell method available here. * file-coding.h: * file-coding.h (struct coding_system_methods): Add a new character_tell() method, passing charcount information from the coding systems to the buffer code, avoiding duplicate bytecount-to-charcount work especially with large buffers. * fileio.c (Finsert_file_contents_internal): Update this to pass charcount information to buffer_insert_string_1(), if that is available from the lstream code. * insdel.c: * insdel.c (buffer_insert_string_1): Add a new CCLEN argument, giving the character count of the string to insert. It can be -1 to indicate that the function should work it out itself using bytecount_to_charcount(), as it used to. * insdel.c (buffer_insert_raw_string_1): * insdel.c (buffer_insert_lisp_string_1): * insdel.c (buffer_insert_ascstring_1): * insdel.c (buffer_insert_emacs_char_1): * insdel.c (buffer_insert_from_buffer_1): * insdel.c (buffer_replace_char): Update these functions to use the new calling convention. * insdel.h: * insdel.h (buffer_insert_string): Update this header to reflect the new buffer_insert_string_1() argument. * lstream.c (Lstream_character_tell): New. 
Return the number of characters *read* and seen by the consumer so far, taking into account the unget buffer, and buffered reading. * lstream.c (Lstream_unread): Update unget_character_count here as appropriate. * lstream.c (Lstream_rewind): Reset unget_character_count here too. * lstream.h: * lstream.h (struct lstream): Provide the character_tell method, add a new field, unget_character_count, giving the number of characters ever passed to Lstream_unread(). Declare Lstream_character_tell(). Make Lstream_ungetc(), which happens to be unused, an inline function rather than a macro, in the course of updating it to modify unget_character_count. * print.c (output_string): Use the new argument to buffer_insert_string_1(). * tests.c: * tests.c (Ftest_character_tell): New test function. * tests.c (syms_of_tests): Make it available. * unicode.c: * unicode.c (struct unicode_coding_stream): * unicode.c (unicode_character_tell): New method. * unicode.c (unicode_convert): Update the character counter as appropriate. * unicode.c (coding_system_type_create_unicode): Make the character_tell method available.
author Aidan Kehoe <kehoea@parhasard.net>
date Thu, 16 Jan 2014 16:27:52 +0000
parents 3192994c49ca
children 1b2fdcc3cc5c
line wrap: on
line diff
--- a/src/unicode.c	Sun Dec 22 10:36:33 2013 +0000
+++ b/src/unicode.c	Thu Jan 16 16:27:52 2014 +0000
@@ -1707,6 +1707,7 @@
   unsigned char counter;
   unsigned char indicated_length;
   int seen_char;
+  Charcount characters_seen;
   /* encode */
   Lisp_Object current_charset;
   int current_char_boundary;
@@ -1988,6 +1989,17 @@
                          write_error_characters_as_such);
 }
 
+/* Return this stream's characters_seen count, or -1 when its COUNTER is
+   non-zero (apparently mid-sequence), so no exact count can be given. */
+static Charcount
+unicode_character_tell (struct coding_stream *str)
+{
+  if (CODING_STREAM_TYPE_DATA (str, unicode)->counter != 0)
+    return -1;
+
+  return CODING_STREAM_TYPE_DATA (str, unicode)->characters_seen;
+}
+
 static Bytecount
 unicode_convert (struct coding_stream *str, const UExtbyte *src,
 		 unsigned_char_dynarr *dst, Bytecount n)
@@ -2006,6 +2018,7 @@
       unsigned char counter = data->counter;
       unsigned char indicated_length
         = data->indicated_length;
+      Charcount characters_seen = data->characters_seen;
 
       while (n--)
 	{
@@ -2020,12 +2033,15 @@
                     {
                       /* ASCII. */
                       decode_unicode_char (c, dst, data, ignore_bom);
+                      characters_seen++;
                     }
                   else if (0 == (c & 0x40))
                     {
                       /* Highest bit set, second highest not--there's
                          something wrong. */
                       DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+                      /* This is a character in the buffer. */
+                      characters_seen++;
                     }
                   else if (0 == (c & 0x20))
                     {
@@ -2050,7 +2066,7 @@
                       /* We don't supports lengths longer than 4 in
                          external-format data. */
                       DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
-
+                      characters_seen++;
                     }
                 }
               else
@@ -2061,15 +2077,20 @@
                       indicate_invalid_utf_8(indicated_length, 
                                              counter, 
                                              ch, dst, data, ignore_bom);
+                      /* These are characters our receiver will see, not
+                         actual characters we've seen in the input. */
+                      characters_seen += (indicated_length - counter);
                       if (c & 0x80)
                         {
                           DECODE_ERROR_OCTET (c, dst, data, ignore_bom);
+                          characters_seen++;
                         }
                       else
                         {
                           /* The character just read is ASCII. Treat it as
                              such.  */
                           decode_unicode_char (c, dst, data, ignore_bom);
+                          characters_seen++;
                         }
                       ch = 0;
                       counter = 0;
@@ -2092,10 +2113,12 @@
                                                      counter, 
                                                      ch, dst, data,
                                                      ignore_bom);
+                              characters_seen += (indicated_length - counter);
                             }
                           else
                             {
                               decode_unicode_char (ch, dst, data, ignore_bom);
+                              characters_seen++;
                             }
                           ch = 0;
                         }
@@ -2242,6 +2265,7 @@
               indicate_invalid_utf_8(indicated_length, 
                                      counter, ch, dst, data, 
                                      ignore_bom);
+              characters_seen += (indicated_length - counter);
               break;
 
             case UNICODE_UTF_16:
@@ -2295,6 +2319,7 @@
 
       data->counter = counter;
       data->indicated_length = indicated_length;
+      data->characters_seen = characters_seen;
     }
   else
     {
@@ -3177,6 +3202,8 @@
   CODING_SYSTEM_HAS_METHOD (unicode, putprop);
   CODING_SYSTEM_HAS_METHOD (unicode, getprop);
 
+  CODING_SYSTEM_HAS_METHOD (unicode, character_tell);
+
   INITIALIZE_DETECTOR (utf_8);
   DETECTOR_HAS_METHOD (utf_8, detect);
   INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8);