comparison src/lstream.c @ 5776:65d65b52d608

Pass character count from coding systems to buffer insertion code. src/ChangeLog addition: 2014-01-16 Aidan Kehoe <kehoea@parhasard.net> Pass character count information from the no-conversion and unicode coding systems to the buffer insertion code, making #'find-file on large buffers a little snappier (if ERROR_CHECK_TEXT is not defined). * file-coding.c: * file-coding.c (coding_character_tell): New. * file-coding.c (conversion_coding_stream_description): New. * file-coding.c (no_conversion_convert): Update characters_seen when decoding. * file-coding.c (no_conversion_character_tell): New. * file-coding.c (lstream_type_create_file_coding): Create the no_conversion type with data. * file-coding.c (coding_system_type_create): Make the character_tell method available here. * file-coding.h: * file-coding.h (struct coding_system_methods): Add a new character_tell() method, passing charcount information from the coding systems to the buffer code, avoiding duplicate bytecount-to-charcount work especially with large buffers. * fileio.c (Finsert_file_contents_internal): Update this to pass charcount information to buffer_insert_string_1(), if that is available from the lstream code. * insdel.c: * insdel.c (buffer_insert_string_1): Add a new CCLEN argument, giving the character count of the string to insert. It can be -1 to indicate that the function should work it out itself using bytecount_to_charcount(), as it used to. * insdel.c (buffer_insert_raw_string_1): * insdel.c (buffer_insert_lisp_string_1): * insdel.c (buffer_insert_ascstring_1): * insdel.c (buffer_insert_emacs_char_1): * insdel.c (buffer_insert_from_buffer_1): * insdel.c (buffer_replace_char): Update these functions to use the new calling convention. * insdel.h: * insdel.h (buffer_insert_string): Update this header to reflect the new buffer_insert_string_1() argument. * lstream.c (Lstream_character_tell): New. 
Return the number of characters *read* and seen by the consumer so far, taking into account the unget buffer, and buffered reading. * lstream.c (Lstream_unread): Update unget_character_count here as appropriate. * lstream.c (Lstream_rewind): Reset unget_character_count here too. * lstream.h: * lstream.h (struct lstream): Provide the character_tell method, add a new field, unget_character_count, giving the number of characters ever passed to Lstream_unread(). Declare Lstream_character_tell(). Make Lstream_ungetc(), which happens to be unused, an inline function rather than a macro, in the course of updating it to modify unget_character_count. * print.c (output_string): Use the new argument to buffer_insert_string_1(). * tests.c: * tests.c (Ftest_character_tell): New test function. * tests.c (syms_of_tests): Make it available. * unicode.c: * unicode.c (struct unicode_coding_stream): * unicode.c (unicode_character_tell): New method. * unicode.c (unicode_convert): Update the character counter as appropriate. * unicode.c (coding_system_type_create_unicode): Make the character_tell method available.
author Aidan Kehoe <kehoea@parhasard.net>
date Thu, 16 Jan 2014 16:27:52 +0000
parents 2dbefd79b3d3
children 0cb4f494a548
comparison
equal deleted inserted replaced
5775:4004c3266c09 5776:65d65b52d608
733 Lstream_read (Lstream *lstr, void *data, Bytecount size) 733 Lstream_read (Lstream *lstr, void *data, Bytecount size)
734 { 734 {
735 return Lstream_read_1 (lstr, data, size, 0); 735 return Lstream_read_1 (lstr, data, size, 0);
736 } 736 }
737 737
738 Charcount
739 Lstream_character_tell (Lstream *lstr)
740 {
741 Charcount ctell = lstr->imp->character_tell ?
742 lstr->imp->character_tell (lstr) : -1;
743
744 if (ctell >= 0)
745 {
746 /* Our implementation's character tell code doesn't know about the
747 unget buffer, update its figure to reflect it. */
748 ctell += lstr->unget_character_count;
749
750 if (lstr->unget_buffer_ind > 0)
751 {
752 /* The character count should not include those characters
753 currently *in* the unget buffer, subtract that count. */
754 Ibyte *ungot, *ungot_ptr;
755 Bytecount ii = lstr->unget_buffer_ind, impartial, sevenflen;
756
757 ungot_ptr = ungot
758 = alloca_ibytes (lstr->unget_buffer_ind) + MAX_ICHAR_LEN;
759
760 /* Make sure the string starts with a valid ibyteptr, otherwise
761 validate_ibyte_string_backward could run off the beginning. */
762 sevenflen = set_itext_ichar (ungot, (Ichar) 0x7f);
763 ungot_ptr += sevenflen;
764
765 /* Internal format data, but in reverse order. There's not
766 actually a need to alloca here, we could work out the character
767 count directly from the reversed bytes, but the alloca approach
768 is more robust to changes in our internal format, and the unget
769 buffer is not going to blow the stack. */
770 while (ii > 0)
771 {
772 *ungot_ptr++ = lstr->unget_buffer[--ii];
773 }
774
775 impartial
776 = validate_ibyte_string_backward (ungot, ungot_ptr - ungot);
777
778 /* Move past the character we added. */
779 impartial -= sevenflen;
780 INC_IBYTEPTR (ungot);
781
782 if (impartial > 0 && !valid_ibyteptr_p (ungot))
783 {
784 Ibyte *newstart = ungot, *limit = ungot + impartial;
785 /* Our consumer has the start of a partial character, we
786 have the rest. */
787
788 while (!valid_ibyteptr_p (newstart) && newstart < limit)
789 {
790 newstart++, impartial--;
791 }
792
793 /* Remove this character from the count, since the
794 end-consumer hasn't seen the full character. */
795 ctell--;
796 ungot = newstart;
797 }
798 else if (valid_ibyteptr_p (ungot)
799 && rep_bytes_by_first_byte (*ungot) > impartial)
800 {
801 /* Rest of a partial character has yet to be read, its first
802 octet has probably been unread by Lstream_read_1(). We
803 included it in the accounting in Lstream_unread(), adjust
804 the figure here appropriately. */
805 ctell--;
806 }
807
808 /* bytecount_to_charcount will throw an assertion failure if we're
809 not at the start of a character. */
810 text_checking_assert (impartial == 0 || valid_ibyteptr_p (ungot));
811
812 /* The character length of this text is included in
813 unget_character_count; if the bytes are still in the unget
814 buffer, then our consumers haven't seen them, and so the
815 character tell figure shouldn't reflect them. Subtract it from
816 the total. */
817 ctell -= bytecount_to_charcount (ungot, impartial);
818 }
819
820 if (lstr->in_buffer_ind < lstr->in_buffer_current)
821 {
822 Ibyte *inbuf = lstr->in_buffer + lstr->in_buffer_ind;
823 Bytecount partial = lstr->in_buffer_current - lstr->in_buffer_ind,
824 impartial;
825
826 if (!valid_ibyteptr_p (inbuf))
827 {
828 Ibyte *newstart = inbuf;
829 Ibyte *limit = lstr->in_buffer + lstr->in_buffer_current;
830 /* Our consumer has the start of a partial character, we
831 have the rest. */
832
833 while (newstart < limit && !valid_ibyteptr_p (newstart))
834 {
835 newstart++;
836 }
837
838 /* Remove this character from the count, since the
839 end-consumer hasn't seen the full character. */
840 ctell--;
841 inbuf = newstart;
842 partial = limit - newstart;
843 }
844
845 if (valid_ibyteptr_p (inbuf))
846 {
847 /* There's at least one valid starting char in the string,
848 validate_ibyte_string_backward won't run off the
849 begining. */
850 impartial =
851 validate_ibyte_string_backward (inbuf, partial);
852 }
853 else
854 {
855 impartial = 0;
856 }
857
858 ctell -= bytecount_to_charcount (inbuf, impartial);
859 }
860
861 text_checking_assert (ctell >= 0);
862 }
863
864 return ctell;
865 }
738 866
739 /* Push back SIZE bytes of DATA onto the input queue. The next call 867 /* Push back SIZE bytes of DATA onto the input queue. The next call
740 to Lstream_read() with the same size will read the same bytes back. 868 to Lstream_read() with the same size will read the same bytes back.
741 Note that this will be the case even if there is other pending 869 Note that this will be the case even if there is other pending
742 unread data. */ 870 unread data. */
753 lstr->byte_count -= size; 881 lstr->byte_count -= size;
754 882
755 /* Bytes have to go on in reverse order -- they are reversed 883 /* Bytes have to go on in reverse order -- they are reversed
756 again when read back. */ 884 again when read back. */
757 while (size--) 885 while (size--)
758 lstr->unget_buffer[lstr->unget_buffer_ind++] = p[size]; 886 {
887 lstr->unget_buffer[lstr->unget_buffer_ind++] = p[size];
888 /* If we see a valid first byte, that is the last octet in a
889 character, so increase the count of ungot characters. */
890 lstr->unget_character_count += valid_ibyteptr_p (p + size);
891 }
759 } 892 }
760 893
761 /* Rewind the stream to the beginning. */ 894 /* Rewind the stream to the beginning. */
762 895
763 int 896 int
766 if (!lstr->imp->rewinder) 899 if (!lstr->imp->rewinder)
767 Lstream_internal_error ("lstream has no rewinder", lstr); 900 Lstream_internal_error ("lstream has no rewinder", lstr);
768 if (Lstream_flush (lstr) < 0) 901 if (Lstream_flush (lstr) < 0)
769 return -1; 902 return -1;
770 lstr->byte_count = 0; 903 lstr->byte_count = 0;
904 lstr->unget_character_count = 0;
771 return (lstr->imp->rewinder) (lstr); 905 return (lstr->imp->rewinder) (lstr);
772 } 906 }
773 907
774 int 908 int
775 Lstream_seekable_p (Lstream *lstr) 909 Lstream_seekable_p (Lstream *lstr)