Mercurial > hg > xemacs-beta
changeset 5776:65d65b52d608
Pass character count from coding systems to buffer insertion code.
src/ChangeLog addition:
2014-01-16 Aidan Kehoe <kehoea@parhasard.net>
Pass character count information from the no-conversion and
unicode coding systems to the buffer insertion code, making
#'find-file on large buffers a little snappier (if
ERROR_CHECK_TEXT is not defined).
* file-coding.c:
* file-coding.c (coding_character_tell): New.
* file-coding.c (conversion_coding_stream_description): New.
* file-coding.c (no_conversion_convert):
Update characters_seen when decoding.
* file-coding.c (no_conversion_character_tell): New.
* file-coding.c (lstream_type_create_file_coding): Create the
no_conversion type with data.
* file-coding.c (coding_system_type_create):
Make the character_tell method available here.
* file-coding.h:
* file-coding.h (struct coding_system_methods):
Add a new character_tell() method, passing charcount information
from the coding systems to the buffer code, avoiding duplicate
bytecount-to-charcount work especially with large buffers.
* fileio.c (Finsert_file_contents_internal):
Update this to pass charcount information to
buffer_insert_string_1(), if that is available from the lstream code.
* insdel.c:
* insdel.c (buffer_insert_string_1):
Add a new CCLEN argument, giving the character count of the string
to insert. It can be -1 to indicate that the function should work
it out itself using bytecount_to_charcount(), as it used to.
* insdel.c (buffer_insert_raw_string_1):
* insdel.c (buffer_insert_lisp_string_1):
* insdel.c (buffer_insert_ascstring_1):
* insdel.c (buffer_insert_emacs_char_1):
* insdel.c (buffer_insert_from_buffer_1):
* insdel.c (buffer_replace_char):
Update these functions to use the new calling convention.
* insdel.h:
* insdel.h (buffer_insert_string):
Update this header to reflect the new buffer_insert_string_1()
argument.
* lstream.c (Lstream_character_tell): New.
Return the number of characters *read* and seen by the consumer so
far, taking into account the unget buffer, and buffered reading.
* lstream.c (Lstream_unread):
Update unget_character_count here as appropriate.
* lstream.c (Lstream_rewind):
Reset unget_character_count here too.
* lstream.h:
* lstream.h (struct lstream):
Provide the character_tell method, add a new field,
unget_character_count, giving the number of characters ever passed
to Lstream_unread().
Declare Lstream_character_tell().
Make Lstream_ungetc(), which happens to be unused, an inline
function rather than a macro, in the course of updating it to
modify unget_character_count.
* print.c (output_string):
Use the new argument to buffer_insert_string_1().
* tests.c:
* tests.c (Ftest_character_tell):
New test function.
* tests.c (syms_of_tests):
Make it available.
* unicode.c:
* unicode.c (struct unicode_coding_stream):
* unicode.c (unicode_character_tell):
New method.
* unicode.c (unicode_convert):
Update the character counter as appropriate.
* unicode.c (coding_system_type_create_unicode):
Make the character_tell method available.
author | Aidan Kehoe <kehoea@parhasard.net> |
---|---|
date | Thu, 16 Jan 2014 16:27:52 +0000 |
parents | 4004c3266c09 |
children | ccaa851ae712 |
files | src/ChangeLog src/file-coding.c src/file-coding.h src/fileio.c src/insdel.c src/insdel.h src/lstream.c src/lstream.h src/print.c src/tests.c src/unicode.c |
diffstat | 11 files changed, 552 insertions(+), 51 deletions(-) [+] |
line wrap: on
line diff
--- a/src/ChangeLog Sun Dec 22 10:36:33 2013 +0000 +++ b/src/ChangeLog Thu Jan 16 16:27:52 2014 +0000 @@ -1,3 +1,82 @@ +2014-01-16 Aidan Kehoe <kehoea@parhasard.net> + + Pass character count information from the no-conversion and + unicode coding systems to the buffer insertion code, making + #'find-file on large buffers a little snappier (if + ERROR_CHECK_TEXT is not defined). + + * file-coding.c: + * file-coding.c (coding_character_tell): New. + * file-coding.c (conversion_coding_stream_description): New. + * file-coding.c (no_conversion_convert): + Update characters_seen when decoding. + * file-coding.c (no_conversion_character_tell): New. + * file-coding.c (lstream_type_create_file_coding): Create the + no_conversion type with data. + * file-coding.c (coding_system_type_create): + Make the character_tell method available here. + * file-coding.h: + * file-coding.h (struct coding_system_methods): + Add a new character_tell() method, passing charcount information + from the coding systems to the buffer code, avoiding duplicate + bytecount-to-charcount work especially with large buffers. + + * fileio.c (Finsert_file_contents_internal): + Update this to pass charcount information to + buffer_insert_string_1(), if that is available from the lstream code. + + * insdel.c: + * insdel.c (buffer_insert_string_1): + Add a new CCLEN argument, giving the character count of the string + to insert. It can be -1 to indicate that te function should work + it out itself using bytecount_to_charcount(), as it used to. + * insdel.c (buffer_insert_raw_string_1): + * insdel.c (buffer_insert_lisp_string_1): + * insdel.c (buffer_insert_ascstring_1): + * insdel.c (buffer_insert_emacs_char_1): + * insdel.c (buffer_insert_from_buffer_1): + * insdel.c (buffer_replace_char): + Update these functions to use the new calling convention. + * insdel.h: + * insdel.h (buffer_insert_string): + Update this header to reflect the new buffer_insert_string_1() + argument. 
+ + * lstream.c (Lstream_character_tell): New. + Return the number of characters *read* and seen by the consumer so + far, taking into account the unget buffer, and buffered reading. + + * lstream.c (Lstream_unread): + Update unget_character_count here as appropriate. + * lstream.c (Lstream_rewind): + Reset unget_character_count here too. + + * lstream.h: + * lstream.h (struct lstream): + Provide the character_tell method, add a new field, + unget_character_count, giving the number of characters ever passed + to Lstream_unread(). + Declare Lstream_character_tell(). + Make Lstream_ungetc(), which happens to be unused, an inline + function rather than a macro, in the course of updating it to + modify unget_character_count. + + * print.c (output_string): + Use the new argument to buffer_insert_string_1(). + * tests.c: + * tests.c (Ftest_character_tell): + New test function. + * tests.c (syms_of_tests): + Make it available. + * unicode.c: + * unicode.c (struct unicode_coding_stream): + * unicode.c (unicode_character_tell): + New method. + * unicode.c (unicode_convert): + Update the character counter as appropriate. + * unicode.c (coding_system_type_create_unicode): + Make the character_tell method available. + 2013-12-19 Aidan Kehoe <kehoea@parhasard.net> * text.c:
--- a/src/file-coding.c Sun Dec 22 10:36:33 2013 +0000 +++ b/src/file-coding.c Thu Jan 16 16:27:52 2014 +0000 @@ -1990,6 +1990,14 @@ return Lstream_seekable_p (str->other_end); } +static Charcount +coding_character_tell (Lstream *stream) +{ + struct coding_stream *str = CODING_STREAM_DATA (stream); + + return XCODESYSMETH_OR_GIVEN (str->codesys, character_tell, (str), -1); +} + static int coding_flusher (Lstream *stream) { @@ -2823,7 +2831,32 @@ #### Shouldn't we _call_ it that, then? And while we're at it, separate it into "to_internal" and "to_external"? */ -DEFINE_CODING_SYSTEM_TYPE (no_conversion); + + +struct no_conversion_coding_system +{ +}; + +struct no_conversion_coding_stream +{ + /* Number of characters seen when decoding. */ + Charcount characters_seen; +}; + +static const struct memory_description no_conversion_coding_system_description[] = { + { XD_END } +}; + +static const struct memory_description no_conversion_coding_stream_description_1 [] = { + { XD_INT, offsetof (struct no_conversion_coding_stream, characters_seen) }, + { XD_END } +}; + +const struct sized_memory_description no_conversion_coding_stream_description = { + sizeof (struct no_conversion_coding_stream), no_conversion_coding_stream_description_1 +}; + +DEFINE_CODING_SYSTEM_TYPE_WITH_DATA (no_conversion); /* This is used when reading in "binary" files -- i.e. 
files that may contain all 256 possible byte values and that are not to be @@ -2846,6 +2879,9 @@ DECODE_ADD_BINARY_CHAR (c, dst); } + CODING_STREAM_TYPE_DATA (str, no_conversion)->characters_seen + += orign; + if (str->eof) DECODE_OUTPUT_PARTIAL_CHAR (ch, dst); } @@ -2904,6 +2940,12 @@ return orign; } +static Charcount +no_conversion_character_tell (struct coding_stream *str) +{ + return CODING_STREAM_TYPE_DATA (str, no_conversion)->characters_seen; +} + DEFINE_DETECTOR (no_conversion); DEFINE_DETECTOR_CATEGORY (no_conversion, no_conversion); @@ -4656,6 +4698,7 @@ LSTREAM_HAS_METHOD (coding, writer); LSTREAM_HAS_METHOD (coding, rewinder); LSTREAM_HAS_METHOD (coding, seekable_p); + LSTREAM_HAS_METHOD (coding, character_tell); LSTREAM_HAS_METHOD (coding, marker); LSTREAM_HAS_METHOD (coding, flusher); LSTREAM_HAS_METHOD (coding, closer); @@ -4697,9 +4740,10 @@ dump_add_opaque_int (&coding_detector_count); dump_add_opaque_int (&coding_detector_category_count); - INITIALIZE_CODING_SYSTEM_TYPE (no_conversion, - "no-conversion-coding-system-p"); + INITIALIZE_CODING_SYSTEM_TYPE_WITH_DATA (no_conversion, + "no-conversion-coding-system-p"); CODING_SYSTEM_HAS_METHOD (no_conversion, convert); + CODING_SYSTEM_HAS_METHOD (no_conversion, character_tell); INITIALIZE_DETECTOR (no_conversion); DETECTOR_HAS_METHOD (no_conversion, detect);
--- a/src/file-coding.h Sun Dec 22 10:36:33 2013 +0000 +++ b/src/file-coding.h Thu Jan 16 16:27:52 2014 +0000 @@ -353,6 +353,9 @@ a result of the stream being rewound. Optional. */ void (*rewind_coding_stream_method) (struct coding_stream *str); + /* Return the number of characters *decoded*. Optional. */ + Charcount (*character_tell_method) (struct coding_stream *str); + /* Finalize coding stream method: Clean up the type-specific data attached to the coding stream (i.e. in struct TYPE_coding_stream). Happens when the Lstream is deleted using Lstream_delete() or is
--- a/src/fileio.c Sun Dec 22 10:36:33 2013 +0000 +++ b/src/fileio.c Thu Jan 16 16:27:52 2014 +0000 @@ -3180,6 +3180,7 @@ struct gcpro ngcpro1; Lisp_Object stream = make_filedesc_input_stream (fd, 0, total, LSTR_ALLOW_QUIT); + Charcount last_tell = -1; NGCPRO1 (stream); Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536); @@ -3187,6 +3188,7 @@ (XLSTREAM (stream), get_coding_system_for_text_file (codesys, 1), CODING_DECODE, 0); Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536); + last_tell = Lstream_character_tell (XLSTREAM (stream)); record_unwind_protect (delete_stream_unwind, stream); @@ -3196,7 +3198,7 @@ while (1) { Bytecount this_len; - Charcount cc_inserted; + Charcount cc_inserted, this_tell = last_tell; QUIT; this_len = Lstream_read (XLSTREAM (stream), read_buf, @@ -3209,12 +3211,17 @@ break; } - cc_inserted = buffer_insert_raw_string_1 (buf, cur_point, read_buf, - this_len, - !NILP (visit) - ? INSDEL_NO_LOCKING : 0); + cc_inserted + = buffer_insert_string_1 (buf, cur_point, read_buf, Qnil, + 0, this_len, last_tell >= 0 + ? (this_tell + = Lstream_character_tell (XLSTREAM + (stream))) + - last_tell : -1, + !NILP (visit) ? INSDEL_NO_LOCKING : 0); inserted += cc_inserted; cur_point += cc_inserted; + last_tell = this_tell; } if (!NILP (used_codesys)) {
--- a/src/insdel.c Sun Dec 22 10:36:33 2013 +0000 +++ b/src/insdel.c Thu Jan 16 16:27:52 2014 +0000 @@ -1039,14 +1039,15 @@ #endif } -/* Insert a string into BUF at Charbpos POS. The string data comes - from one of two sources: constant, non-relocatable data (specified - in NONRELOC), or a Lisp string object (specified in RELOC), which - is relocatable and may have extent data that needs to be copied - into the buffer. OFFSET and LENGTH specify the substring of the - data that is actually to be inserted. As a special case, if POS - is -1, insert the string at point and move point to the end of the - string. +/* Insert a string into BUF at Charbpos POS. The string data comes from one + of two sources: constant, non-relocatable data (specified in NONRELOC), + or a Lisp string object (specified in RELOC), which is relocatable and + may have extent data that needs to be copied into the buffer. OFFSET and + LENGTH specify the substring of the data that is actually to be inserted. + As a special case, if POS is -1, insert the string at point and move + point to the end of the string. CCLEN is the character count of the data + to be inserted, and can be -1 to indicate that buffer_insert_string_1 () + should work this out itself with bytecount_to_charcount(). Normally, markers at the insertion point end up before the inserted string. 
If INSDEL_BEFORE_MARKERS is set in flags, however, @@ -1061,13 +1062,12 @@ buffer_insert_string_1 (struct buffer *buf, Charbpos pos, const Ibyte *nonreloc, Lisp_Object reloc, Bytecount offset, Bytecount length, - int flags) + Charcount cclen, int flags) { /* This function can GC */ struct gcpro gcpro1; Bytebpos bytepos; Bytecount length_in_buffer; - Charcount cclen; int move_point = 0; struct buffer *mbuf; Lisp_Object bufcons; @@ -1118,14 +1118,27 @@ bytepos = charbpos_to_bytebpos (buf, pos); - /* string may have been relocated up to this point */ - if (STRINGP (reloc)) + if (cclen < 0) { - cclen = string_offset_byte_to_char_len (reloc, offset, length); - nonreloc = XSTRING_DATA (reloc); + /* string may have been relocated up to this point */ + if (STRINGP (reloc)) + { + cclen = string_offset_byte_to_char_len (reloc, offset, length); + nonreloc = XSTRING_DATA (reloc); + } + else + cclen = bytecount_to_charcount (nonreloc + offset, length); } else - cclen = bytecount_to_charcount (nonreloc + offset, length); + { + text_checking_assert (cclen > 0 && cclen + == (STRINGP (reloc) ? + string_offset_byte_to_char_len (reloc, offset, + length) + : bytecount_to_charcount (nonreloc + offset, + length))); + } + /* &&#### Here we check if the text can't fit into the format of the buffer, and if so convert it to another format (either default or 32-bit-fixed, according to some flag; if no flag, use default). */ @@ -1286,7 +1299,7 @@ { /* This function can GC */ return buffer_insert_string_1 (buf, pos, nonreloc, Qnil, 0, length, - flags); + -1, flags); } Charcount @@ -1295,8 +1308,7 @@ { /* This function can GC */ return buffer_insert_string_1 (buf, pos, 0, str, 0, - XSTRING_LENGTH (str), - flags); + XSTRING_LENGTH (str), -1, flags); } /* Insert the null-terminated string S (in external format). 
*/ @@ -1309,7 +1321,7 @@ const CIbyte *translated = GETTEXT (s); ASSERT_ASCTEXT_ASCII (s); return buffer_insert_string_1 (buf, pos, (const Ibyte *) translated, Qnil, - 0, strlen (translated), flags); + 0, strlen (translated), -1, flags); } Charcount @@ -1319,7 +1331,7 @@ /* This function can GC */ Ibyte str[MAX_ICHAR_LEN]; Bytecount len = set_itext_ichar (str, ch); - return buffer_insert_string_1 (buf, pos, str, Qnil, 0, len, flags); + return buffer_insert_string_1 (buf, pos, str, Qnil, 0, len, -1, flags); } Charcount @@ -1339,7 +1351,7 @@ /* This function can GC */ Lisp_Object str = make_string_from_buffer (buf2, pos2, length); return buffer_insert_string_1 (buf, pos, 0, str, 0, - XSTRING_LENGTH (str), flags); + XSTRING_LENGTH (str), -1, flags); } @@ -1674,7 +1686,7 @@ * backward so that it now equals the insertion point. */ buffer_insert_string_1 (buf, (movepoint ? -1 : pos), - newstr, Qnil, 0, newlen, 0); + newstr, Qnil, 0, newlen, -1, 0); } }
--- a/src/insdel.h Sun Dec 22 10:36:33 2013 +0000 +++ b/src/insdel.h Thu Jan 16 16:27:52 2014 +0000 @@ -38,7 +38,7 @@ Charcount buffer_insert_string_1 (struct buffer *buf, Charbpos pos, const Ibyte *nonreloc, Lisp_Object reloc, Bytecount offset, Bytecount length, - int flags); + Charcount clen, int flags); Charcount buffer_insert_raw_string_1 (struct buffer *buf, Charbpos pos, const Ibyte *nonreloc, Bytecount length, int flags); @@ -58,7 +58,7 @@ All of these can GC. */ #define buffer_insert_string(buf, nonreloc, reloc, offset, length) \ - buffer_insert_string_1 (buf, -1, nonreloc, reloc, offset, length, 0) + buffer_insert_string_1 (buf, -1, nonreloc, reloc, offset, length, -1, 0) #define buffer_insert_raw_string(buf, string, length) \ buffer_insert_raw_string_1 (buf, -1, string, length, 0) #define buffer_insert_ascstring(buf, s) \
--- a/src/lstream.c Sun Dec 22 10:36:33 2013 +0000 +++ b/src/lstream.c Thu Jan 16 16:27:52 2014 +0000 @@ -735,6 +735,134 @@ return Lstream_read_1 (lstr, data, size, 0); } +Charcount +Lstream_character_tell (Lstream *lstr) +{ + Charcount ctell = lstr->imp->character_tell ? + lstr->imp->character_tell (lstr) : -1; + + if (ctell >= 0) + { + /* Our implementation's character tell code doesn't know about the + unget buffer, update its figure to reflect it. */ + ctell += lstr->unget_character_count; + + if (lstr->unget_buffer_ind > 0) + { + /* The character count should not include those characters + currently *in* the unget buffer, subtract that count. */ + Ibyte *ungot, *ungot_ptr; + Bytecount ii = lstr->unget_buffer_ind, impartial, sevenflen; + + ungot_ptr = ungot + = alloca_ibytes (lstr->unget_buffer_ind) + MAX_ICHAR_LEN; + + /* Make sure the string starts with a valid ibyteptr, otherwise + validate_ibyte_string_backward could run off the beginning. */ + sevenflen = set_itext_ichar (ungot, (Ichar) 0x7f); + ungot_ptr += sevenflen; + + /* Internal format data, but in reverse order. There's not + actually a need to alloca here, we could work out the character + count directly from the reversed bytes, but the alloca approach + is more robust to changes in our internal format, and the unget + buffer is not going to blow the stack. */ + while (ii > 0) + { + *ungot_ptr++ = lstr->unget_buffer[--ii]; + } + + impartial + = validate_ibyte_string_backward (ungot, ungot_ptr - ungot); + + /* Move past the character we added. */ + impartial -= sevenflen; + INC_IBYTEPTR (ungot); + + if (impartial > 0 && !valid_ibyteptr_p (ungot)) + { + Ibyte *newstart = ungot, *limit = ungot + impartial; + /* Our consumer has the start of a partial character, we + have the rest. */ + + while (!valid_ibyteptr_p (newstart) && newstart < limit) + { + newstart++, impartial--; + } + + /* Remove this character from the count, since the + end-consumer hasn't seen the full character. 
*/ + ctell--; + ungot = newstart; + } + else if (valid_ibyteptr_p (ungot) + && rep_bytes_by_first_byte (*ungot) > impartial) + { + /* Rest of a partial character has yet to be read, its first + octet has probably been unread by Lstream_read_1(). We + included it in the accounting in Lstream_unread(), adjust + the figure here appropriately. */ + ctell--; + } + + /* bytecount_to_charcount will throw an assertion failure if we're + not at the start of a character. */ + text_checking_assert (impartial == 0 || valid_ibyteptr_p (ungot)); + + /* The character length of this text is included in + unget_character_count; if the bytes are still in the unget + buffer, then our consumers haven't seen them, and so the + character tell figure shouldn't reflect them. Subtract it from + the total. */ + ctell -= bytecount_to_charcount (ungot, impartial); + } + + if (lstr->in_buffer_ind < lstr->in_buffer_current) + { + Ibyte *inbuf = lstr->in_buffer + lstr->in_buffer_ind; + Bytecount partial = lstr->in_buffer_current - lstr->in_buffer_ind, + impartial; + + if (!valid_ibyteptr_p (inbuf)) + { + Ibyte *newstart = inbuf; + Ibyte *limit = lstr->in_buffer + lstr->in_buffer_current; + /* Our consumer has the start of a partial character, we + have the rest. */ + + while (newstart < limit && !valid_ibyteptr_p (newstart)) + { + newstart++; + } + + /* Remove this character from the count, since the + end-consumer hasn't seen the full character. */ + ctell--; + inbuf = newstart; + partial = limit - newstart; + } + + if (valid_ibyteptr_p (inbuf)) + { + /* There's at least one valid starting char in the string, + validate_ibyte_string_backward won't run off the + begining. */ + impartial = + validate_ibyte_string_backward (inbuf, partial); + } + else + { + impartial = 0; + } + + ctell -= bytecount_to_charcount (inbuf, impartial); + } + + text_checking_assert (ctell >= 0); + } + + return ctell; +} /* Push back SIZE bytes of DATA onto the input queue. 
The next call to Lstream_read() with the same size will read the same bytes back. @@ -755,7 +883,12 @@ /* Bytes have to go on in reverse order -- they are reversed again when read back. */ while (size--) - lstr->unget_buffer[lstr->unget_buffer_ind++] = p[size]; + { + lstr->unget_buffer[lstr->unget_buffer_ind++] = p[size]; + /* If we see a valid first byte, that is the last octet in a + character, so increase the count of ungot characters. */ + lstr->unget_character_count += valid_ibyteptr_p (p + size); + } } /* Rewind the stream to the beginning. */ @@ -768,6 +901,7 @@ if (Lstream_flush (lstr) < 0) return -1; lstr->byte_count = 0; + lstr->unget_character_count = 0; return (lstr->imp->rewinder) (lstr); }
--- a/src/lstream.h Sun Dec 22 10:36:33 2013 +0000 +++ b/src/lstream.h Thu Jan 16 16:27:52 2014 +0000 @@ -181,6 +181,10 @@ method. If this method is not present, the result is determined by whether a rewind method is present. */ int (*seekable_p) (Lstream *stream); + + /* Return the number of complete characters read so far. Respects + buffering and unget. Returns -1 if unknown or not implemented. */ + Charcount (*character_tell) (Lstream *stream); /* Perform any additional operations necessary to flush the data in this stream. */ int (*flusher) (Lstream *stream); @@ -250,8 +254,9 @@ similarly has to push the data on backwards. */ unsigned char *unget_buffer; /* holds characters pushed back onto input */ Bytecount unget_buffer_size; /* allocated size of buffer */ - Bytecount unget_buffer_ind; /* pointer to next buffer spot - to write a character */ + Bytecount unget_buffer_ind; /* Next buffer spot to write a character */ + + Charcount unget_character_count; /* Count of complete characters ever ungot. */ Bytecount byte_count; int flags; @@ -297,8 +302,8 @@ int Lstream_fputc (Lstream *lstr, int c); int Lstream_fgetc (Lstream *lstr); void Lstream_fungetc (Lstream *lstr, int c); -Bytecount Lstream_read (Lstream *lstr, void *data, - Bytecount size); +Bytecount Lstream_read (Lstream *lstr, void *data, Bytecount size); +Charcount Lstream_character_tell (Lstream *); int Lstream_write (Lstream *lstr, const void *data, Bytecount size); int Lstream_was_blocked_p (Lstream *lstr); @@ -353,19 +358,28 @@ reverse order they were pushed back -- most recent first. (This is necessary for consistency -- if there are a number of bytes that have been unread and I read and unread a byte, it needs to be the - first to be read again.) This is a macro and so it is very - efficient. The C argument is only evaluated once but the STREAM - argument is evaluated more than once. - */ + first to be read again.) 
*/ -#define Lstream_ungetc(stream, c) \ -/* Add to the end if it won't overflow buffer; otherwise call the \ - function equivalent */ \ - ((stream)->unget_buffer_ind >= (stream)->unget_buffer_size ? \ - Lstream_fungetc (stream, c) : \ - (void) ((stream)->byte_count--, \ - ((stream)->unget_buffer[(stream)->unget_buffer_ind++] = \ - (unsigned char) (c)))) +DECLARE_INLINE_HEADER ( +void +Lstream_ungetc (Lstream *lstr, int c) +) +{ + /* Add to the end if it won't overflow buffer; otherwise call the + function equivalent */ + if (lstr->unget_buffer_ind >= lstr->unget_buffer_size) + { + Lstream_fungetc (lstr, c); + } + else + { + lstr->byte_count--; + lstr->unget_buffer[lstr->unget_buffer_ind] = (unsigned char) (c); + lstr->unget_character_count + += valid_ibyteptr_p (lstr->unget_buffer + lstr->unget_buffer_ind); + lstr->unget_buffer_ind++; + } +} #define Lstream_data(stream) ((void *) ((stream)->data)) #define Lstream_byte_count(stream) ((stream)->byte_count)
--- a/src/print.c Sun Dec 22 10:36:33 2013 +0000 +++ b/src/print.c Thu Jan 16 16:27:52 2014 +0000 @@ -514,7 +514,7 @@ buffer_insert_string_1 (XMARKER (function)->buffer, spoint, nonreloc, reloc, offset, len, - 0); + -1, 0); Fset_marker (function, make_fixnum (spoint + cclen), Fmarker_buffer (function)); }
--- a/src/tests.c Sun Dec 22 10:36:33 2013 +0000 +++ b/src/tests.c Thu Jan 16 16:27:52 2014 +0000 @@ -558,6 +558,186 @@ return conversion_result; } +DEFUN ("test-character-tell", Ftest_character_tell, 0, 0, "", /* +Return list of results of tests of the stream character offset code. +For use by the automated test suite. See tests/automated/c-tests. + +Each element is a list (DESCRIPTION, STATUS, REASON). +DESCRIPTION is a string describing the test. +STATUS is a symbol, either t (pass) or nil (fail). +REASON is nil or a string describing the failure (not required). +*/ + ()) +{ + Extbyte ext_unix[]= "\n\nfoo\nbar\n\nf\372b\343\340\nfoo\nbar\n"; + /* Previous string in UTF-8. */ + Extbyte ext_utf_8_unix[] + = "\n\nfoo\nbar\n\nf\303\272b\303\243\303\240\nfoo\nbar\n"; + Charcount ext_utf_8_unix_char_len = 25; + Ibyte shortbuf[13], longbuf[512]; + Lisp_Object stream = + make_fixed_buffer_input_stream (ext_unix, sizeof (ext_unix) - 1); + Lisp_Object result = Qnil, string = Qnil; + Charcount count; + Bytecount bytecount; + struct gcpro gcpro1, gcpro2, gcpro3; + +#define CHARACTER_TELL_ASSERT(assertion, description, failing_case) \ + do \ + { \ + if (assertion) \ + result = Fcons (list3 (build_cistring (description), \ + Qt, Qnil), result); \ + else \ + result = Fcons (list3 (build_cistring (description), \ + Qnil, build_ascstring (failing_case)), \ + result); \ + } \ + while (0) + + GCPRO3 (stream, result, string); + + Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536); + stream = make_coding_input_stream + (XLSTREAM (stream), Ffind_coding_system (intern ("no-conversion-unix")), + CODING_DECODE, 0); + Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536); + + bytecount = Lstream_read (XLSTREAM (stream), longbuf, sizeof (longbuf)); + + CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream)) + == sizeof (ext_unix) -1, + "basic character tell, no-conversion-unix", + "basic character tell failed"); + + string = 
build_extstring (ext_unix, + Ffind_coding_system (intern + ("no-conversion-unix"))); + + CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream)) + == string_char_length (string), + "repeat basic character tell, no-conversion-unix", + "repeat basic character tell failed with string"); + + count = Lstream_character_tell (XLSTREAM (stream)); + + Lstream_unread (XLSTREAM (stream), "r\n", 2); + + /* This should give the same result as before the unread. */ + CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream)) + == count, "checking post-unread character tell", + "post-unread character tell failed"); + bytecount += Lstream_read (XLSTREAM (stream), longbuf + bytecount, + sizeof (longbuf) - bytecount); + + CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream)) + == count + 2, + "checking post-unread+read character tell", + "post-unread+read character tell failed"); + + /* This seems to be buggy for my purposes. */ + /* Lstream_rewind (XLSTREAM (stream)); */ + Lstream_close (XLSTREAM (stream)); + Lstream_delete (XLSTREAM (stream)); + + stream = make_fixed_buffer_input_stream (ext_unix, sizeof (ext_unix) - 1); + Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536); + Lstream_unset_character_mode (XLSTREAM (stream)); + stream = make_coding_input_stream + (XLSTREAM (stream), Ffind_coding_system (intern ("no-conversion-unix")), + CODING_DECODE, 0); + Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536); + Lstream_unset_character_mode (XLSTREAM (stream)); + + bytecount = Lstream_read (XLSTREAM (stream), shortbuf, sizeof (shortbuf)); + + CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream)) + /* This should be equal to sizeof (shortbuf) on + non-mule. 
*/ + == sizeof (shortbuf) - !(byte_ascii_p (0xff)), + "character tell with short read, no-conversion-unix", + "short read character tell failed"); + + Lstream_close (XLSTREAM (stream)); + Lstream_delete (XLSTREAM (stream)); + + stream + = make_fixed_buffer_input_stream (ext_utf_8_unix, + sizeof (ext_utf_8_unix) - 1); + Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536); + stream = make_coding_input_stream + (XLSTREAM (stream), Ffind_coding_system (intern ("utf-8-unix")), + CODING_DECODE, 0); + Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536); + + bytecount = Lstream_read (XLSTREAM (stream), longbuf, sizeof (longbuf)); + + CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream)) + == ext_utf_8_unix_char_len, + "utf-8 character tell, utf-8-unix", + "utf-8 character tell failed"); + + string = build_extstring (ext_utf_8_unix, + Ffind_coding_system (intern + ("utf-8-unix"))); + + CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream)) + == string_char_length (string), + "repeat utf-8 character tell, utf-8-unix", + "repeat utf-8 character tell failed with string"); + + count = Lstream_character_tell (XLSTREAM (stream)); + + Lstream_unread (XLSTREAM (stream), "r\n", 2); + + /* This should give the same result as before the unread. */ + CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream)) + == count, "checking post-unread utf-8 tell", + "post-unread utf-8 tell failed"); + bytecount += Lstream_read (XLSTREAM (stream), longbuf + bytecount, + sizeof (longbuf) - bytecount); + + CHARACTER_TELL_ASSERT (Lstream_character_tell (XLSTREAM (stream)) + == count + 2, + "checking post-unread+read utf-8 tell", + "post-unread+read utf-8 tell failed"); + + /* This seems to be buggy for my purposes. 
*/ + /* Lstream_rewind (XLSTREAM (stream)); */ + Lstream_close (XLSTREAM (stream)); + Lstream_delete (XLSTREAM (stream)); + + stream = make_fixed_buffer_input_stream (ext_utf_8_unix, sizeof (ext_utf_8_unix) - 1); + Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536); + Lstream_set_character_mode (XLSTREAM (stream)); + + stream = make_coding_input_stream + (XLSTREAM (stream), Ffind_coding_system (intern ("utf-8-unix")), + CODING_DECODE, 0); + Lstream_set_buffering (XLSTREAM (stream), LSTREAM_BLOCKN_BUFFERED, 65536); + Lstream_set_character_mode (XLSTREAM (stream)); + + bytecount = Lstream_read (XLSTREAM (stream), shortbuf, sizeof (shortbuf)); + + CHARACTER_TELL_ASSERT + (bytecount == (sizeof (shortbuf) - 1), + "utf-8 Lstream_read, character mode, checking partial char not read", + "partial char appars to have been read when it shouldn't"); + + CHARACTER_TELL_ASSERT + (Lstream_character_tell (XLSTREAM (stream)) + /* This is shorter, because it's in the middle of a character. */ + == sizeof (shortbuf) - 1, + "utf-8 tell with short read, character mode, utf-8-unix", + "utf-8 read character tell, character mode failed"); + + Lstream_close (XLSTREAM (stream)); + Lstream_delete (XLSTREAM (stream)); + + UNGCPRO; + return result; +} + /* Hash Table testing */ @@ -724,6 +904,7 @@ Vtest_function_list = Qnil; TESTS_DEFSUBR (Ftest_data_format_conversion); + TESTS_DEFSUBR (Ftest_character_tell); TESTS_DEFSUBR (Ftest_hash_tables); TESTS_DEFSUBR (Ftest_store_void_in_lisp); /* Add other test functions here with TESTS_DEFSUBR */
--- a/src/unicode.c Sun Dec 22 10:36:33 2013 +0000 +++ b/src/unicode.c Thu Jan 16 16:27:52 2014 +0000 @@ -1707,6 +1707,7 @@ unsigned char counter; unsigned char indicated_length; int seen_char; + Charcount characters_seen; /* encode */ Lisp_Object current_charset; int current_char_boundary; @@ -1988,6 +1989,17 @@ write_error_characters_as_such); } +static Charcount +unicode_character_tell (struct coding_stream *str) +{ + if (CODING_STREAM_TYPE_DATA (str, unicode)->counter == 0) + { + return CODING_STREAM_TYPE_DATA (str, unicode)->characters_seen; + } + + return -1; +} + static Bytecount unicode_convert (struct coding_stream *str, const UExtbyte *src, unsigned_char_dynarr *dst, Bytecount n) @@ -2006,6 +2018,7 @@ unsigned char counter = data->counter; unsigned char indicated_length = data->indicated_length; + Charcount characters_seen = data->characters_seen; while (n--) { @@ -2020,12 +2033,15 @@ { /* ASCII. */ decode_unicode_char (c, dst, data, ignore_bom); + characters_seen++; } else if (0 == (c & 0x40)) { /* Highest bit set, second highest not--there's something wrong. */ DECODE_ERROR_OCTET (c, dst, data, ignore_bom); + /* This is a character in the buffer. */ + characters_seen++; } else if (0 == (c & 0x20)) { @@ -2050,7 +2066,7 @@ /* We don't supports lengths longer than 4 in external-format data. */ DECODE_ERROR_OCTET (c, dst, data, ignore_bom); - + characters_seen++; } } else @@ -2061,15 +2077,20 @@ indicate_invalid_utf_8(indicated_length, counter, ch, dst, data, ignore_bom); + /* These are characters our receiver will see, not + actual characters we've seen in the input. */ + characters_seen += (indicated_length - counter); if (c & 0x80) { DECODE_ERROR_OCTET (c, dst, data, ignore_bom); + characters_seen++; } else { /* The character just read is ASCII. Treat it as such. 
*/ decode_unicode_char (c, dst, data, ignore_bom); + characters_seen++; } ch = 0; counter = 0; @@ -2092,10 +2113,12 @@ counter, ch, dst, data, ignore_bom); + characters_seen += (indicated_length - counter); } else { decode_unicode_char (ch, dst, data, ignore_bom); + characters_seen++; } ch = 0; } @@ -2242,6 +2265,7 @@ indicate_invalid_utf_8(indicated_length, counter, ch, dst, data, ignore_bom); + characters_seen += (indicated_length - counter); break; case UNICODE_UTF_16: @@ -2295,6 +2319,7 @@ data->counter = counter; data->indicated_length = indicated_length; + data->characters_seen = characters_seen; } else { @@ -3177,6 +3202,8 @@ CODING_SYSTEM_HAS_METHOD (unicode, putprop); CODING_SYSTEM_HAS_METHOD (unicode, getprop); + CODING_SYSTEM_HAS_METHOD (unicode, character_tell); + INITIALIZE_DETECTOR (utf_8); DETECTOR_HAS_METHOD (utf_8, detect); INITIALIZE_DETECTOR_CATEGORY (utf_8, utf_8);