xemacs-beta: src/lstream.c comparison

comparison src/lstream.c @ 5776:65d65b52d608

Pass character count from coding systems to buffer insertion code. src/ChangeLog addition: 2014-01-16 Aidan Kehoe <kehoea@parhasard.net> Pass character count information from the no-conversion and unicode coding systems to the buffer insertion code, making #'find-file on large buffers a little snappier (if ERROR_CHECK_TEXT is not defined). * file-coding.c: * file-coding.c (coding_character_tell): New. * file-coding.c (conversion_coding_stream_description): New. * file-coding.c (no_conversion_convert): Update characters_seen when decoding. * file-coding.c (no_conversion_character_tell): New. * file-coding.c (lstream_type_create_file_coding): Create the no_conversion type with data. * file-coding.c (coding_system_type_create): Make the character_tell method available here. * file-coding.h: * file-coding.h (struct coding_system_methods): Add a new character_tell() method, passing charcount information from the coding systems to the buffer code, avoiding duplicate bytecount-to-charcount work especially with large buffers. * fileio.c (Finsert_file_contents_internal): Update this to pass charcount information to buffer_insert_string_1(), if that is available from the lstream code. * insdel.c: * insdel.c (buffer_insert_string_1): Add a new CCLEN argument, giving the character count of the string to insert. It can be -1 to indicate that the function should work it out itself using bytecount_to_charcount(), as it used to. * insdel.c (buffer_insert_raw_string_1): * insdel.c (buffer_insert_lisp_string_1): * insdel.c (buffer_insert_ascstring_1): * insdel.c (buffer_insert_emacs_char_1): * insdel.c (buffer_insert_from_buffer_1): * insdel.c (buffer_replace_char): Update these functions to use the new calling convention. * insdel.h: * insdel.h (buffer_insert_string): Update this header to reflect the new buffer_insert_string_1() argument. * lstream.c (Lstream_character_tell): New. 
Return the number of characters *read* and seen by the consumer so far, taking into account the unget buffer, and buffered reading. * lstream.c (Lstream_unread): Update unget_character_count here as appropriate. * lstream.c (Lstream_rewind): Reset unget_character_count here too. * lstream.h: * lstream.h (struct lstream): Provide the character_tell method, add a new field, unget_character_count, giving the number of characters ever passed to Lstream_unread(). Declare Lstream_character_tell(). Make Lstream_ungetc(), which happens to be unused, an inline function rather than a macro, in the course of updating it to modify unget_character_count. * print.c (output_string): Use the new argument to buffer_insert_string_1(). * tests.c: * tests.c (Ftest_character_tell): New test function. * tests.c (syms_of_tests): Make it available. * unicode.c: * unicode.c (struct unicode_coding_stream): * unicode.c (unicode_character_tell): New method. * unicode.c (unicode_convert): Update the character counter as appropriate. * unicode.c (coding_system_type_create_unicode): Make the character_tell method available.

author	Aidan Kehoe <kehoea@parhasard.net>
date	Thu, 16 Jan 2014 16:27:52 +0000
parents	2dbefd79b3d3
children	0cb4f494a548

comparison

equal deleted inserted replaced

-:4004c3266c09
+:65d65b52d608
 Lstream_read (Lstream *lstr, void *data, Bytecount size)
 {
 return Lstream_read_1 (lstr, data, size, 0);
 }
+/* Return the number of characters that have been read from LSTR and
+   handed to its consumer so far, or -1 if the stream implementation
+   supplies no character_tell method.
+
+   The implementation's own figure is corrected in two ways: characters
+   still sitting in the unget buffer are subtracted again (they were
+   counted by Lstream_unread() but have not been re-read), and complete
+   characters still waiting in the input buffer are subtracted as well.
+   A character whose bytes are split between the consumer and one of
+   these buffers is treated as not yet seen. */
+Charcount
+Lstream_character_tell (Lstream *lstr)
+{
+  Charcount ctell = lstr->imp->character_tell ?
+    lstr->imp->character_tell (lstr) : -1;
+  if (ctell >= 0)
+    {
+      /* Our implementation's character tell code doesn't know about the
+         unget buffer, update its figure to reflect it. */
+      ctell += lstr->unget_character_count;
+      if (lstr->unget_buffer_ind > 0)
+        {
+          /* The character count should not include those characters
+             currently *in* the unget buffer, subtract that count.  */
+          Ibyte *ungot, *ungot_ptr;
+          Bytecount ii = lstr->unget_buffer_ind, impartial, sevenflen;
+          /* Reserve room for every ungot byte plus the sentinel character
+             written in front of them (presumably at most MAX_ICHAR_LEN
+             bytes).  The sum must be the allocation size; adding
+             MAX_ICHAR_LEN to the returned pointer instead would make the
+             writes below run past the end of the block. */
+          ungot_ptr = ungot
+            = alloca_ibytes (lstr->unget_buffer_ind + MAX_ICHAR_LEN);
+          /* Make sure the string starts with a valid ibyteptr, otherwise
+             validate_ibyte_string_backward could run off the beginning. */
+          sevenflen = set_itext_ichar (ungot, (Ichar) 0x7f);
+          ungot_ptr += sevenflen;
+          /* Internal format data, but in reverse order. There's not
+             actually a need to alloca here, we could work out the character
+             count directly from the reversed bytes, but the alloca approach
+             is more robust to changes in our internal format, and the unget
+             buffer is not going to blow the stack. */
+          while (ii > 0)
+            {
+              *ungot_ptr++ = lstr->unget_buffer[--ii];
+            }
+          impartial
+            = validate_ibyte_string_backward (ungot, ungot_ptr - ungot);
+          /* Move past the character we added. */
+          impartial -= sevenflen;
+          INC_IBYTEPTR (ungot);
+          if (impartial > 0 && !valid_ibyteptr_p (ungot))
+            {
+              Ibyte *newstart = ungot, *limit = ungot + impartial;
+              /* Our consumer has the start of a partial character, we
+                 have the rest.  Test the bound before looking at the
+                 byte, so that nothing at or beyond LIMIT is examined. */
+              while (newstart < limit && !valid_ibyteptr_p (newstart))
+                {
+                  newstart++, impartial--;
+                }
+              /* Remove this character from the count, since the
+                 end-consumer hasn't seen the full character. */
+              ctell--;
+              ungot = newstart;
+            }
+          else if (valid_ibyteptr_p (ungot)
+                   && rep_bytes_by_first_byte (*ungot) > impartial)
+            {
+              /* Rest of a partial character has yet to be read, its first
+                 octet has probably been unread by Lstream_read_1(). We
+                 included it in the accounting in Lstream_unread(), adjust
+                 the figure here appropriately. */
+              ctell--;
+            }
+          /* bytecount_to_charcount will throw an assertion failure if we're
+             not at the start of a character. */
+          text_checking_assert (impartial == 0 || valid_ibyteptr_p (ungot));
+          /* The character length of this text is included in
+             unget_character_count; if the bytes are still in the unget
+             buffer, then our consumers haven't seen them, and so the
+             character tell figure shouldn't reflect them. Subtract it from
+             the total.  */
+          ctell -= bytecount_to_charcount (ungot, impartial);
+        }
+      if (lstr->in_buffer_ind < lstr->in_buffer_current)
+        {
+          /* Bytes between in_buffer_ind and in_buffer_current have not
+             been handed to the consumer yet. */
+          Ibyte *inbuf = lstr->in_buffer + lstr->in_buffer_ind;
+          Bytecount partial = lstr->in_buffer_current - lstr->in_buffer_ind,
+            impartial;
+          if (!valid_ibyteptr_p (inbuf))
+            {
+              Ibyte *newstart = inbuf;
+              Ibyte *limit = lstr->in_buffer + lstr->in_buffer_current;
+              /* Our consumer has the start of a partial character, we
+                 have the rest. */
+              while (newstart < limit && !valid_ibyteptr_p (newstart))
+                {
+                  newstart++;
+                }
+              /* Remove this character from the count, since the
+                 end-consumer hasn't seen the full character. */
+              ctell--;
+              inbuf = newstart;
+              partial = limit - newstart;
+            }
+          if (valid_ibyteptr_p (inbuf))
+            {
+              /* There's at least one valid starting char in the string,
+                 validate_ibyte_string_backward won't run off the
+                 beginning. */
+              impartial =
+                validate_ibyte_string_backward (inbuf, partial);
+            }
+          else
+            {
+              /* No character start in the remaining bytes; there is
+                 nothing to subtract. */
+              impartial = 0;
+            }
+          ctell -= bytecount_to_charcount (inbuf, impartial);
+        }
+      text_checking_assert (ctell >= 0);
+    }
+  return ctell;
+}
 /* Push back SIZE bytes of DATA onto the input queue.  The next call
 to Lstream_read() with the same size will read the same bytes back.
 Note that this will be the case even if there is other pending
 unread data. */
 lstr->byte_count -= size;
 /* Bytes have to go on in reverse order -- they are reversed
 again when read back. */
 while (size--)
-lstr->unget_buffer[lstr->unget_buffer_ind++] = p[size];
+{
+lstr->unget_buffer[lstr->unget_buffer_ind++] = p[size];
+/* If we see a valid first byte, that is the last octet in a
+character, so increase the count of ungot characters. */
+lstr->unget_character_count += valid_ibyteptr_p (p + size);
+}
 }
 /* Rewind the stream to the beginning. */
 int
 if (!lstr->imp->rewinder)
 Lstream_internal_error ("lstream has no rewinder", lstr);
 if (Lstream_flush (lstr) < 0)
 return -1;
 lstr->byte_count = 0;
+lstr->unget_character_count = 0;
 return (lstr->imp->rewinder) (lstr);
 }
 int
 Lstream_seekable_p (Lstream *lstr)

Mercurial > hg > xemacs-beta

comparison src/lstream.c @ 5776:65d65b52d608