Mercurial > hg > xemacs-beta
diff src/unicode.c @ 5776:65d65b52d608
Pass character count from coding systems to buffer insertion code.
src/ChangeLog addition:
2014-01-16 Aidan Kehoe <kehoea@parhasard.net>
Pass character count information from the no-conversion and
unicode coding systems to the buffer insertion code, making
#'find-file on large buffers a little snappier (if
ERROR_CHECK_TEXT is not defined).
* file-coding.c:
* file-coding.c (coding_character_tell): New.
* file-coding.c (conversion_coding_stream_description): New.
* file-coding.c (no_conversion_convert):
Update characters_seen when decoding.
* file-coding.c (no_conversion_character_tell): New.
* file-coding.c (lstream_type_create_file_coding): Create the
no_conversion type with data.
* file-coding.c (coding_system_type_create):
Make the character_tell method available here.
* file-coding.h:
* file-coding.h (struct coding_system_methods):
Add a new character_tell() method, passing charcount information
from the coding systems to the buffer code, avoiding duplicate
bytecount-to-charcount work especially with large buffers.
* fileio.c (Finsert_file_contents_internal):
Update this to pass charcount information to
buffer_insert_string_1(), if that is available from the lstream code.
* insdel.c:
* insdel.c (buffer_insert_string_1):
Add a new CCLEN argument, giving the character count of the string
to insert. It can be -1 to indicate that the function should work
it out itself using bytecount_to_charcount(), as it used to.
* insdel.c (buffer_insert_raw_string_1):
* insdel.c (buffer_insert_lisp_string_1):
* insdel.c (buffer_insert_ascstring_1):
* insdel.c (buffer_insert_emacs_char_1):
* insdel.c (buffer_insert_from_buffer_1):
* insdel.c (buffer_replace_char):
Update these functions to use the new calling convention.
* insdel.h:
* insdel.h (buffer_insert_string):
Update this header to reflect the new buffer_insert_string_1()
argument.
* lstream.c (Lstream_character_tell): New.
Return the number of characters *read* and seen by the consumer so
far, taking into account the unget buffer, and buffered reading.
* lstream.c (Lstream_unread):
Update unget_character_count here as appropriate.
* lstream.c (Lstream_rewind):
Reset unget_character_count here too.
* lstream.h:
* lstream.h (struct lstream):
Provide the character_tell method, add a new field,
unget_character_count, giving the number of characters ever passed
to Lstream_unread().
Declare Lstream_character_tell().
Make Lstream_ungetc(), which happens to be unused, an inline
function rather than a macro, in the course of updating it to
modify unget_character_count.
* print.c (output_string):
Use the new argument to buffer_insert_string_1().
* tests.c:
* tests.c (Ftest_character_tell):
New test function.
* tests.c (syms_of_tests):
Make it available.
* unicode.c:
* unicode.c (struct unicode_coding_stream):
* unicode.c (unicode_character_tell):
New method.
* unicode.c (unicode_convert):
Update the character counter as appropriate.
* unicode.c (coding_system_type_create_unicode):
Make the character_tell method available.
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Thu, 16 Jan 2014 16:27:52 +0000 |
parents | 3192994c49ca |
children | 1b2fdcc3cc5c |
line wrap: on
line diff
--- a/src/unicode.c Sun Dec 22 10:36:33 2013 +0000 +++ b/src/unicode.c Thu Jan 16 16:27:52 2014 +0000 @@ -1707,6 +1707,7 @@ unsigned char counter; unsigned char indicated_length; int seen_char; + Charcount characters_seen; /* encode */ Lisp_Object current_charset; int current_char_boundary; @@ -1988,6 +1989,17 @@ write_error_characters_as_such); } +static Charcount +unicode_character_tell (struct coding_stream *str) +{ + if (CODING_STREAM_TYPE_DATA (str, unicode)->counter == 0) + { + return CODING_STREAM_TYPE_DATA (str, unicode)->characters_seen; + } + + return -1; +} + static Bytecount unicode_convert (struct coding_stream *str, const UExtbyte *src, unsigned_char_dynarr *dst, Bytecount n) @@ -2006,6 +2018,7 @@ unsigned char counter = data->counter; unsigned char indicated_length = data->indicated_length; + Charcount characters_seen = data->characters_seen; while (n--) { @@ -2020,12 +2033,15 @@ { /* ASCII. */ decode_unicode_char (c, dst, data, ignore_bom); + characters_seen++; } else if (0 == (c & 0x40)) { /* Highest bit set, second highest not--there's something wrong. */ DECODE_ERROR_OCTET (c, dst, data, ignore_bom); + /* This is a character in the buffer. */ + characters_seen++; } else if (0 == (c & 0x20)) { @@ -2050,7 +2066,7 @@ /* We don't supports lengths longer than 4 in external-format data. */ DECODE_ERROR_OCTET (c, dst, data, ignore_bom); - + characters_seen++; } } else @@ -2061,15 +2077,20 @@ indicate_invalid_utf_8(indicated_length, counter, ch, dst, data, ignore_bom); + /* These are characters our receiver will see, not + actual characters we've seen in the input. */ + characters_seen += (indicated_length - counter); if (c & 0x80) { DECODE_ERROR_OCTET (c, dst, data, ignore_bom); + characters_seen++; } else { /* The character just read is ASCII. Treat it as such. 
*/ decode_unicode_char (c, dst, data, ignore_bom); + characters_seen++; } ch = 0; counter = 0; @@ -2092,10 +2113,12 @@ counter, ch, dst, data, ignore_bom); + characters_seen += (indicated_length - counter); } else { decode_unicode_char (ch, dst, data, ignore_bom); + characters_seen++; } ch = 0; } @@ -2242,6 +2265,7 @@ indicate_invalid_utf_8(indicated_length, counter, ch, dst, data, ignore_bom); + characters_seen += (indicated_length - counter); break; case UNICODE_UTF_16: @@ -2295,6 +2319,7 @@ data->counter = counter; data->indicated_length = indicated_length; + data->characters_seen = characters_seen; } else { @@ -3177,6 +3202,8 @@ CODING_SYSTEM_HAS_METHOD (unicode, putprop); CODING_SYSTEM_HAS_METHOD (unicode, getprop); + CODING_SYSTEM_HAS_METHOD (unicode, character_tell); + INITIALIZE_DETECTOR (utf_8); DETECTOR_HAS_METHOD (utf_8, detect); INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8);