Mercurial > hg > xemacs-beta
diff src/lstream.c @ 5776:65d65b52d608
Pass character count from coding systems to buffer insertion code.
src/ChangeLog addition:
2014-01-16 Aidan Kehoe <kehoea@parhasard.net>
Pass character count information from the no-conversion and
unicode coding systems to the buffer insertion code, making
#'find-file on large buffers a little snappier (if
ERROR_CHECK_TEXT is not defined).
* file-coding.c:
* file-coding.c (coding_character_tell): New.
* file-coding.c (conversion_coding_stream_description): New.
* file-coding.c (no_conversion_convert):
Update characters_seen when decoding.
* file-coding.c (no_conversion_character_tell): New.
* file-coding.c (lstream_type_create_file_coding): Create the
no_conversion type with data.
* file-coding.c (coding_system_type_create):
Make the character_tell method available here.
* file-coding.h:
* file-coding.h (struct coding_system_methods):
Add a new character_tell() method, passing charcount information
from the coding systems to the buffer code, avoiding duplicate
bytecount-to-charcount work especially with large buffers.
* fileio.c (Finsert_file_contents_internal):
Update this to pass charcount information to
buffer_insert_string_1(), if that is available from the lstream code.
* insdel.c:
* insdel.c (buffer_insert_string_1):
Add a new CCLEN argument, giving the character count of the string
to insert. It can be -1 to indicate that te function should work
it out itself using bytecount_to_charcount(), as it used to.
* insdel.c (buffer_insert_raw_string_1):
* insdel.c (buffer_insert_lisp_string_1):
* insdel.c (buffer_insert_ascstring_1):
* insdel.c (buffer_insert_emacs_char_1):
* insdel.c (buffer_insert_from_buffer_1):
* insdel.c (buffer_replace_char):
Update these functions to use the new calling convention.
* insdel.h:
* insdel.h (buffer_insert_string):
Update this header to reflect the new buffer_insert_string_1()
argument.
* lstream.c (Lstream_character_tell): New.
Return the number of characters *read* and seen by the consumer so
far, taking into account the unget buffer, and buffered reading.
* lstream.c (Lstream_unread):
Update unget_character_count here as appropriate.
* lstream.c (Lstream_rewind):
Reset unget_character_count here too.
* lstream.h:
* lstream.h (struct lstream):
Provide the character_tell method, add a new field,
unget_character_count, giving the number of characters ever passed
to Lstream_unread().
Declare Lstream_character_tell().
Make Lstream_ungetc(), which happens to be unused, an inline
function rather than a macro, in the course of updating it to
modify unget_character_count.
* print.c (output_string):
Use the new argument to buffer_insert_string_1().
* tests.c:
* tests.c (Ftest_character_tell):
New test function.
* tests.c (syms_of_tests):
Make it available.
* unicode.c:
* unicode.c (struct unicode_coding_stream):
* unicode.c (unicode_character_tell):
New method.
* unicode.c (unicode_convert):
Update the character counter as appropriate.
* unicode.c (coding_system_type_create_unicode):
Make the character_tell method available.
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Thu, 16 Jan 2014 16:27:52 +0000 |
parents | 2dbefd79b3d3 |
children | 0cb4f494a548 |
line wrap: on
line diff
--- a/src/lstream.c Sun Dec 22 10:36:33 2013 +0000 +++ b/src/lstream.c Thu Jan 16 16:27:52 2014 +0000 @@ -735,6 +735,134 @@ return Lstream_read_1 (lstr, data, size, 0); } +Charcount +Lstream_character_tell (Lstream *lstr) +{ + Charcount ctell = lstr->imp->character_tell ? + lstr->imp->character_tell (lstr) : -1; + + if (ctell >= 0) + { + /* Our implementation's character tell code doesn't know about the + unget buffer, update its figure to reflect it. */ + ctell += lstr->unget_character_count; + + if (lstr->unget_buffer_ind > 0) + { + /* The character count should not include those characters + currently *in* the unget buffer, subtract that count. */ + Ibyte *ungot, *ungot_ptr; + Bytecount ii = lstr->unget_buffer_ind, impartial, sevenflen; + + ungot_ptr = ungot + = alloca_ibytes (lstr->unget_buffer_ind) + MAX_ICHAR_LEN; + + /* Make sure the string starts with a valid ibyteptr, otherwise + validate_ibyte_string_backward could run off the beginning. */ + sevenflen = set_itext_ichar (ungot, (Ichar) 0x7f); + ungot_ptr += sevenflen; + + /* Internal format data, but in reverse order. There's not + actually a need to alloca here, we could work out the character + count directly from the reversed bytes, but the alloca approach + is more robust to changes in our internal format, and the unget + buffer is not going to blow the stack. */ + while (ii > 0) + { + *ungot_ptr++ = lstr->unget_buffer[--ii]; + } + + impartial + = validate_ibyte_string_backward (ungot, ungot_ptr - ungot); + + /* Move past the character we added. */ + impartial -= sevenflen; + INC_IBYTEPTR (ungot); + + if (impartial > 0 && !valid_ibyteptr_p (ungot)) + { + Ibyte *newstart = ungot, *limit = ungot + impartial; + /* Our consumer has the start of a partial character, we + have the rest. */ + + while (!valid_ibyteptr_p (newstart) && newstart < limit) + { + newstart++, impartial--; + } + + /* Remove this character from the count, since the + end-consumer hasn't seen the full character. */ + ctell--; + ungot = newstart; + } + else if (valid_ibyteptr_p (ungot) + && rep_bytes_by_first_byte (*ungot) > impartial) + { + /* Rest of a partial character has yet to be read, its first + octet has probably been unread by Lstream_read_1(). We + included it in the accounting in Lstream_unread(), adjust + the figure here appropriately. */ + ctell--; + } + + /* bytecount_to_charcount will throw an assertion failure if we're + not at the start of a character. */ + text_checking_assert (impartial == 0 || valid_ibyteptr_p (ungot)); + + /* The character length of this text is included in + unget_character_count; if the bytes are still in the unget + buffer, then our consumers haven't seen them, and so the + character tell figure shouldn't reflect them. Subtract it from + the total. */ + ctell -= bytecount_to_charcount (ungot, impartial); + } + + if (lstr->in_buffer_ind < lstr->in_buffer_current) + { + Ibyte *inbuf = lstr->in_buffer + lstr->in_buffer_ind; + Bytecount partial = lstr->in_buffer_current - lstr->in_buffer_ind, + impartial; + + if (!valid_ibyteptr_p (inbuf)) + { + Ibyte *newstart = inbuf; + Ibyte *limit = lstr->in_buffer + lstr->in_buffer_current; + /* Our consumer has the start of a partial character, we + have the rest. */ + + while (newstart < limit && !valid_ibyteptr_p (newstart)) + { + newstart++; + } + + /* Remove this character from the count, since the + end-consumer hasn't seen the full character. */ + ctell--; + inbuf = newstart; + partial = limit - newstart; + } + + if (valid_ibyteptr_p (inbuf)) + { + /* There's at least one valid starting char in the string, + validate_ibyte_string_backward won't run off the + begining. */ + impartial = + validate_ibyte_string_backward (inbuf, partial); + } + else + { + impartial = 0; + } + + ctell -= bytecount_to_charcount (inbuf, impartial); + } + + text_checking_assert (ctell >= 0); + } + + return ctell; +} /* Push back SIZE bytes of DATA onto the input queue. The next call to Lstream_read() with the same size will read the same bytes back. @@ -755,7 +883,12 @@ /* Bytes have to go on in reverse order -- they are reversed again when read back. */ while (size--) - lstr->unget_buffer[lstr->unget_buffer_ind++] = p[size]; + { + lstr->unget_buffer[lstr->unget_buffer_ind++] = p[size]; + /* If we see a valid first byte, that is the last octet in a + character, so increase the count of ungot characters. */ + lstr->unget_character_count += valid_ibyteptr_p (p + size); + } } /* Rewind the stream to the beginning. */ @@ -768,6 +901,7 @@ if (Lstream_flush (lstr) < 0) return -1; lstr->byte_count = 0; + lstr->unget_character_count = 0; return (lstr->imp->rewinder) (lstr); }