Inconsistent behaviour of BufferLines



Hi,

I have notices some inconsistencies in the way BufferLines class behaves. Please see the attached test case for an example of a buggy behaviour. I expect len(x), iter(x) and iter(x[0:len(x)] to return the same length, however, due to the current implementation of the __getitem__ method iter(x) returns 1 element more than expected.
This bug affects comparison results when one of the files is empty for instance.
There are 2 possible fixes (both marked with FIXME comment and commented out in the attached file):
1) change > operator in __getitem__ to >=, >= makes more sense here anyway, but I guess > is used for some reason
2) implement __iter__ method

Regards,
Piotr

import gtk
import sys

def get_iter_at_line_or_eof(buf, line):
    if line >= buf.get_line_count():
        return buf.get_end_iter()
    return buf.get_iter_at_line(line)


class BufferLines(object):
    """gtk.TextBuffer shim with line-based access and optional filtering

    This class allows a gtk.TextBuffer to be treated as a list of lines of
    possibly-filtered text. If no filter is given, the raw output from the
    gtk.TextBuffer is used.

    The logic here (and in places in FileDiff) requires that Python's
    unicode splitlines() implementation and gtk.TextBuffer agree on where
    linebreaks occur. Happily, this is usually the case.
    """

    def __init__(self, buf, textfilter=None):
        self.buf = buf
        if textfilter is not None:
            self.textfilter = textfilter
        else:
            self.textfilter = lambda x: x

    def __getslice__(self, lo, hi):
        # FIXME: If we ask for arbitrary slices past the end of the buffer,
        # this will return the last line.
        start = get_iter_at_line_or_eof(self.buf, lo)
        end = get_iter_at_line_or_eof(self.buf, hi)
        txt = unicode(self.buf.get_text(start, end, False), 'utf8')

        filter_txt = self.textfilter(txt)
        lines = filter_txt.splitlines()
        ends = filter_txt.splitlines(True)

        # The last line in a gtk.TextBuffer is guaranteed never to end in a
        # newline. As splitlines() discards an empty line at the end, we need
        # to artificially add a line if the requested slice is past the end of
        # the buffer, and the last line in the slice ended in a newline.
        if hi >= self.buf.get_line_count() and \
           (len(lines) == 0 or len(lines[-1]) != len(ends[-1])):
            lines.append(u"")
            ends.append(u"")

        hi = self.buf.get_line_count() if hi == sys.maxint else hi
        if hi - lo != len(lines):
            # These codepoints are considered line breaks by Python, but not
            # by GtkTextStore.
            additional_breaks = set((u'\x0c', u'\x85'))
            i = 0
            while i < len(ends):
                line, end = lines[i], ends[i]
                # It's possible that the last line in a file would end in a
                # line break character, which requires no joining.
                if end and end[-1] in additional_breaks and \
                   (not line or line[-1] not in additional_breaks):
                    assert len(ends) >= i + 1
                    lines[i:i + 2] = [line + end[-1] + lines[i + 1]]
                    ends[i:i + 2] = [end + ends[i + 1]]
                i += 1

        return lines

    def __getitem__(self, i):
# FIXME
# Either change > to >=
#        if i >= len(self):
        if i > len(self):
            raise IndexError
        line_start = get_iter_at_line_or_eof(self.buf, i)
        line_end = line_start.copy()
        if not line_end.ends_line():
            line_end.forward_to_line_end()
        txt = self.buf.get_text(line_start, line_end, False)
        return unicode(self.textfilter(txt), 'utf8')
# FIXME
# or add __iter__ method
    #def __iter__(self):
    #    for l in self[0:len(self)]:
    #        yield l
        
    def __len__(self):
        return self.buf.get_line_count()

def main():
    textbuffer = gtk.TextBuffer()
    bl = BufferLines(textbuffer)
    print "len(bl) =", len(bl)
    l = 0
    for it in iter(bl):
        l += 1
    print "iter(bl) =", l
    l = 0
    for it in iter(bl[0:len(bl)]):
        l += 1
    print "iter(bl[0:len(bl)]) =", l
    
if __name__ == "__main__":
    main()


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]