[extensions-web] Add a new file diff view



commit bc4b1bf31a78d85b2043dda2a33d19a199e3920d
Author: Jasper St. Pierre <jstpierre mecheye net>
Date:   Mon Nov 14 21:13:39 2011 -0500

    Add a new file diff view
    
    A lot of code is stolen from ReviewBoard. It is licensed under MIT, and
    the copyright belongs to the ReviewBoard team.

 sweettooth/review/diffutils.py | 1133 ++++++++++++++++++++++++++++++++++++++++
 sweettooth/review/diffview.py  |   98 ++++
 sweettooth/review/htmldiff.py  |   70 +++
 sweettooth/review/urls.py      |    1 +
 sweettooth/review/views.py     |   78 +++-
 5 files changed, 1371 insertions(+), 9 deletions(-)
---
diff --git a/sweettooth/review/diffutils.py b/sweettooth/review/diffutils.py
new file mode 100644
index 0000000..1db83a1
--- /dev/null
+++ b/sweettooth/review/diffutils.py
@@ -0,0 +1,1133 @@
+
+# The code in this file is adapted from ReviewBoard, MIT licensed
+# https://github.com/reviewboard/reviewboard
+# Copyright 2011 Review Board Team
+
+import re
+from difflib import SequenceMatcher
+
+NEWLINES_RE = re.compile(r'\r?\n')
+
+class MyersDiffer:
+    """
+    An implementation of Eugene Myers's O(ND) Diff algorithm based on GNU diff.
+    """
+    SNAKE_LIMIT = 20
+
+    DISCARD_NONE = 0
+    DISCARD_FOUND = 1
+    DISCARD_CANCEL = 2
+
+    # The Myers diff algorithm effectively turns the diff problem into a graph
+    # search.  It works by finding the "shortest middle snake," which splits
+    # the search space in two and lets each half be solved recursively.
+
+    class DiffData:
+        def __init__(self, data):
+            self.data = data
+            self.length = len(data)
+            self.modified = {}
+            self.undiscarded = []
+            self.undiscarded_lines = 0
+            self.real_indexes = []
+
+    def __init__(self, a, b, ignore_space=False):
+        if type(a) != type(b):
+            raise TypeError('a and b must be sequences of the same type')
+
+        self.a = a
+        self.b = b
+        self.code_table = {}
+        self.last_code = 0
+        self.a_data = self.b_data = None
+        self.ignore_space = ignore_space
+        self.minimal_diff = False
+        self.interesting_line_regexes = []
+        self.interesting_lines = [{}, {}]
+        self.interesting_line_table = {}
+
+        # SMS State
+        self.max_lines = 0
+        self.fdiag = None
+        self.bdiag = None
+
+    def ratio(self):
+        self._gen_diff_data()
+        a_equals = self.a_data.length - len(self.a_data.modified)
+        b_equals = self.b_data.length - len(self.b_data.modified)
+
+        return 1.0 * (a_equals + b_equals) / \
+                     (self.a_data.length + self.b_data.length)
+
+    def add_interesting_line_regex(self, name, regex):
+        """Registers a regular expression used to look for interesting lines.
+
+        All interesting lines found that match the regular expression will
+        be stored and tagged with the given name. Callers can use
+        get_interesting_lines to get the results.
+        """
+        self.interesting_line_regexes.append((name, regex))
+        self.interesting_lines[0][name] = []
+        self.interesting_lines[1][name] = []
+
+    def get_interesting_lines(self, name, is_modified_file):
+        """Returns the interesting lines tagged with the given name."""
+        if is_modified_file:
+            index = 1
+        else:
+            index = 0
+
+        return self.interesting_lines[index].get(name, [])
+
+    def get_opcodes(self):
+        """
+        Generator that returns opcodes representing the contents of the
+        diff.
+
+        The resulting opcodes are in the format of
+        (tag, i1, i2, j1, j2)
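+
+        Tags follow difflib's SequenceMatcher: "equal", "insert", "delete"
+        or "replace". For example, diffing ["a", "b", "c"] against
+        ["a", "x", "c"] yields ("equal", 0, 1, 0, 1),
+        ("replace", 1, 2, 1, 2) and ("equal", 2, 3, 2, 3).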
+        """
+        self._gen_diff_data()
+
+        a_line = b_line = 0
+        last_group = None
+
+        # Go through the entire set of lines on both the old and new files
+        while a_line < self.a_data.length or b_line < self.b_data.length:
+            a_start = a_line
+            b_start = b_line
+
+            if a_line < self.a_data.length and \
+               not self.a_data.modified.get(a_line, False) and \
+               b_line < self.b_data.length and \
+               not self.b_data.modified.get(b_line, False):
+                # Equal
+                a_changed = b_changed = 1
+                tag = "equal"
+                a_line += 1
+                b_line += 1
+            else:
+                # Deleted, inserted or replaced
+
+                # Count every old line that's been modified, and the
+                # remainder of old lines if we've reached the end of the new
+                # file.
+                while a_line < self.a_data.length and \
+                      (b_line >= self.b_data.length or \
+                       self.a_data.modified.get(a_line, False)):
+                    a_line += 1
+
+                # Count every new line that's been modified, and the
+                # remainder of new lines if we've reached the end of the old
+                # file.
+                while b_line < self.b_data.length and \
+                      (a_line >= self.a_data.length or \
+                       self.b_data.modified.get(b_line, False)):
+                    b_line += 1
+
+                a_changed = a_line - a_start
+                b_changed = b_line - b_start
+
+                assert a_start < a_line or b_start < b_line
+                assert a_changed != 0 or b_changed != 0
+
+                if a_changed == 0 and b_changed > 0:
+                    tag = "insert"
+                elif a_changed > 0 and b_changed == 0:
+                    tag = "delete"
+                elif a_changed > 0 and b_changed > 0:
+                    tag = "replace"
+
+                    if a_changed != b_changed:
+                        if a_changed > b_changed:
+                            a_line -= a_changed - b_changed
+                        elif a_changed < b_changed:
+                            b_line -= b_changed - a_changed
+
+                        a_changed = b_changed = min(a_changed, b_changed)
+
+            if last_group and last_group[0] == tag:
+                last_group = (tag,
+                              last_group[1], last_group[2] + a_changed,
+                              last_group[3], last_group[4] + b_changed)
+            else:
+                if last_group:
+                    yield last_group
+
+                last_group = (tag, a_start, a_start + a_changed,
+                              b_start, b_start + b_changed)
+
+
+        if not last_group:
+            last_group = ("equal", 0, self.a_data.length, 0, self.b_data.length)
+
+        yield last_group
+
+    def _gen_diff_data(self):
+        """
+        Generate all the diff data needed to return opcodes or the diff ratio.
+        This is only called once during the lifetime of a MyersDiffer instance.
+        """
+        if self.a_data and self.b_data:
+            return
+
+        self.a_data = self.DiffData(self._gen_diff_codes(self.a, False))
+        self.b_data = self.DiffData(self._gen_diff_codes(self.b, True))
+
+        self._discard_confusing_lines()
+
+        self.max_lines = self.a_data.undiscarded_lines + \
+                         self.b_data.undiscarded_lines + 3
+
+        vector_size = self.a_data.undiscarded_lines + \
+                      self.b_data.undiscarded_lines + 3
+        self.fdiag = [0] * vector_size
+        self.bdiag = [0] * vector_size
+        self.downoff = self.upoff = self.b_data.undiscarded_lines + 1
+
+        self._lcs(0, self.a_data.undiscarded_lines,
+                  0, self.b_data.undiscarded_lines,
+                  self.minimal_diff)
+        self._shift_chunks(self.a_data, self.b_data)
+        self._shift_chunks(self.b_data, self.a_data)
+
+    def _gen_diff_codes(self, lines, is_modified_file):
+        """
+        Converts all unique lines of text into unique numbers. Comparing
+        lists of numbers is faster than comparing lists of strings.
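+
+        For example, on a fresh differ the lines ["foo", "bar", "foo"]
+        become the codes [1, 2, 1]: each distinct line is assigned the
+        next unused integer.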
+        """
+        codes = []
+
+        linenum = 0
+
+        if is_modified_file:
+            interesting_lines = self.interesting_lines[1]
+        else:
+            interesting_lines = self.interesting_lines[0]
+
+        for line in lines:
+            # TODO: Handle ignoring/trimming spaces, ignoring casing, and
+            #       special hooks
+
+            raw_line = line
+            stripped_line = line.lstrip()
+
+            if self.ignore_space:
+                # We still want to show lines that contain only whitespace.
+                if len(stripped_line) > 0:
+                    line = stripped_line
+
+            interesting_line_name = None
+
+            try:
+                code = self.code_table[line]
+                interesting_line_name = \
+                    self.interesting_line_table.get(code, None)
+            except KeyError:
+                # This is a new, unrecorded line, so mark it and store it.
+                self.last_code += 1
+                code = self.last_code
+                self.code_table[line] = code
+
+                # Check to see if this is an interesting line that the caller
+                # wants recorded.
+                if stripped_line:
+                    for name, regex in self.interesting_line_regexes:
+                        if regex.match(raw_line):
+                            interesting_line_name = name
+                            self.interesting_line_table[code] = name
+                            break
+
+            if interesting_line_name:
+                interesting_lines[interesting_line_name].append((linenum,
+                                                                 raw_line))
+
+            codes.append(code)
+
+            linenum += 1
+
+        return codes
+
+    def _find_sms(self, a_lower, a_upper, b_lower, b_upper, find_minimal):
+        """
+        Finds the Shortest Middle Snake.
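+
+        The SMS is the point where the forward and reverse searches through
+        the edit graph meet; _lcs() splits the problem there and recurses
+        on both halves.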
+        """
+        down_vector = self.fdiag # The vector for the (0, 0) to (x, y) search
+        up_vector   = self.bdiag # The vector for the (u, v) to (N, M) search
+
+        down_k = a_lower - b_lower # The k-line to start the forward search
+        up_k   = a_upper - b_upper # The k-line to start the reverse search
+        odd_delta = (down_k - up_k) % 2 != 0
+
+        down_vector[self.downoff + down_k] = a_lower
+        up_vector[self.upoff + up_k] = a_upper
+
+        dmin = a_lower - b_upper
+        dmax = a_upper - b_lower
+
+        down_min = down_max = down_k
+        up_min   = up_max   = up_k
+
+        cost = 0
+        max_cost = max(256, self._very_approx_sqrt(self.max_lines * 4))
+
+        while True:
+            cost += 1
+            big_snake = False
+
+            if down_min > dmin:
+                down_min -= 1
+                down_vector[self.downoff + down_min - 1] = -1
+            else:
+                down_min += 1
+
+            if down_max < dmax:
+                down_max += 1
+                down_vector[self.downoff + down_max + 1] = -1
+            else:
+                down_max -= 1
+
+            # Extend the forward path
+            for k in xrange(down_max, down_min - 1, -2):
+                tlo = down_vector[self.downoff + k - 1]
+                thi = down_vector[self.downoff + k + 1]
+
+                if tlo >= thi:
+                    x = tlo + 1
+                else:
+                    x = thi
+
+                y = x - k
+                old_x = x
+
+                # Find the end of the furthest reaching forward D-path in
+                # diagonal k
+                while x < a_upper and y < b_upper and \
+                      self.a_data.undiscarded[x] == self.b_data.undiscarded[y]:
+                    x += 1
+                    y += 1
+
+                if odd_delta and up_min <= k <= up_max and \
+                   up_vector[self.upoff + k] <= x:
+                    return x, y, True, True
+
+                if x - old_x > self.SNAKE_LIMIT:
+                    big_snake = True
+
+                down_vector[self.downoff + k] = x
+
+            # Extend the reverse path
+            if up_min > dmin:
+                up_min -= 1
+                up_vector[self.upoff + up_min - 1] = self.max_lines
+            else:
+                up_min += 1
+
+            if up_max < dmax:
+                up_max += 1
+                up_vector[self.upoff + up_max + 1] = self.max_lines
+            else:
+                up_max -= 1
+
+            for k in xrange(up_max, up_min - 1, -2):
+                tlo = up_vector[self.upoff + k - 1]
+                thi = up_vector[self.upoff + k + 1]
+
+                if tlo < thi:
+                    x = tlo
+                else:
+                    x = thi - 1
+
+                y = x - k
+                old_x = x
+
+                while x > a_lower and y > b_lower and \
+                      self.a_data.undiscarded[x - 1] == \
+                      self.b_data.undiscarded[y - 1]:
+                    x -= 1
+                    y -= 1
+
+                if not odd_delta and down_min <= k <= down_max and \
+                   x <= down_vector[self.downoff + k]:
+                    return x, y, True, True
+
+                if old_x - x > self.SNAKE_LIMIT:
+                    big_snake = True
+
+                up_vector[self.upoff + k] = x
+
+            if find_minimal:
+                continue
+
+            # Heuristics courtesy of GNU diff.
+            #
+            # We check occasionally for a diagonal that made lots of progress
+            # compared with the edit distance. If we have one, find the one
+            # that made the most progress and return it.
+            #
+            # This gives us better, more dense chunks, instead of lots of
+            # small ones often starting with replaces. It also makes the output
+            # closer to that of GNU diff, which more people would expect.
+
+            if cost > 200 and big_snake:
+                ret_x, ret_y, best = \
+                    self._find_diagonal(down_min, down_max, down_k, 0,
+                                        self.downoff, down_vector,
+                                        lambda x: x - a_lower,
+                                        lambda x: a_lower + self.SNAKE_LIMIT <=
+                                                  x < a_upper,
+                                        lambda y: b_lower + self.SNAKE_LIMIT <=
+                                                  y < b_upper,
+                                        lambda i,k: i - k,
+                                        1, cost)
+
+                if best > 0:
+                    return ret_x, ret_y, True, False
+
+                ret_x, ret_y, best = \
+                    self._find_diagonal(up_min, up_max, up_k, best, self.upoff,
+                                        up_vector,
+                                        lambda x: a_upper - x,
+                                        lambda x: a_lower < x <= a_upper -
+                                                  self.SNAKE_LIMIT,
+                                        lambda y: b_lower < y <= b_upper -
+                                                  self.SNAKE_LIMIT,
+                                        lambda i,k: i + k,
+                                        0, cost)
+
+                if best > 0:
+                    return ret_x, ret_y, False, True
+
+            # XXX: the max-cost bailout below is disabled; keep searching
+            # instead of settling for the halfway point between best results.
+            continue
+
+            # If we've reached or gone past the max cost, just give up now
+            # and report the halfway point between our best results.
+            if cost >= max_cost:
+                fx_best = bx_best = 0
+
+                # Find the forward diagonal that maximized x + y
+                fxy_best = -1
+                for d in xrange(down_max, down_min - 1, -2):
+                    x = min(down_vector[self.downoff + d], a_upper)
+                    y = x - d
+
+                    if b_upper < y:
+                        x = b_upper + d
+                        y = b_upper
+
+                    if fxy_best < x + y:
+                        fxy_best = x + y
+                        fx_best = x
+
+                # Find the backward diagonal that minimizes x + y
+                bxy_best = self.max_lines
+                for d in xrange(up_max, up_min - 1, -2):
+                    x = max(a_lower, up_vector[self.upoff + d])
+                    y = x - d
+
+                    if y < b_lower:
+                        x = b_lower + d
+                        y = b_lower
+
+                    if x + y < bxy_best:
+                        bxy_best = x + y
+                        bx_best = x
+
+                # Use the better of the two diagonals
+                if a_upper + b_upper - bxy_best < \
+                   fxy_best - (a_lower + b_lower):
+                    return fx_best, fxy_best - fx_best, True, False
+                else:
+                    return bx_best, bxy_best - bx_best, False, True
+
+
+        raise Exception("This line should never be reached.")
+
+    def _find_diagonal(self, minimum, maximum, k, best, diagoff, vector,
+                       vdiff_func, check_x_range, check_y_range,
+                       discard_index, k_offset, cost):
+        for d in xrange(maximum, minimum - 1, -2):
+            dd = d - k
+            x = vector[diagoff + d]
+            y = x - d
+            v = vdiff_func(x) * 2 + dd
+
+            if v > 12 * (cost + abs(dd)):
+                if v > best and \
+                   check_x_range(x) and check_y_range(y):
+                    # We found a sufficient diagonal.
+                    k = k_offset
+                    x_index = discard_index(x, k)
+                    y_index = discard_index(y, k)
+
+                    while self.a_data.undiscarded[x_index] == \
+                          self.b_data.undiscarded[y_index]:
+                        if k == self.SNAKE_LIMIT - 1 + k_offset:
+                            return x, y, v
+
+                        k += 1
+        return 0, 0, 0
+
+    def _lcs(self, a_lower, a_upper, b_lower, b_upper, find_minimal):
+        """
+        The divide-and-conquer implementation of the Longest Common
+        Subsequence (LCS) algorithm.
+        """
+        # Fast walkthrough equal lines at the start
+        while a_lower < a_upper and b_lower < b_upper and \
+              self.a_data.undiscarded[a_lower] == \
+              self.b_data.undiscarded[b_lower]:
+            a_lower += 1
+            b_lower += 1
+
+        while a_upper > a_lower and b_upper > b_lower and \
+              self.a_data.undiscarded[a_upper - 1] == \
+              self.b_data.undiscarded[b_upper - 1]:
+            a_upper -= 1
+            b_upper -= 1
+
+        if a_lower == a_upper:
+            # Inserted lines.
+            while b_lower < b_upper:
+                self.b_data.modified[self.b_data.real_indexes[b_lower]] = True
+                b_lower += 1
+        elif b_lower == b_upper:
+            # Deleted lines
+            while a_lower < a_upper:
+                self.a_data.modified[self.a_data.real_indexes[a_lower]] = True
+                a_lower += 1
+        else:
+            # Find the middle snake and length of an optimal path for A and B
+            x, y, low_minimal, high_minimal = \
+                self._find_sms(a_lower, a_upper, b_lower, b_upper,
+                               find_minimal)
+
+            self._lcs(a_lower, x, b_lower, y, low_minimal)
+            self._lcs(x, a_upper, y, b_upper, high_minimal)
+
+    def _shift_chunks(self, data, other_data):
+        """
+        Shifts the inserts/deletes of identical lines in order to join
+        the changes together a bit more. This has the effect of cleaning
+        up the diff.
+
+        Oftentimes, a generated diff will have identical lines (say, a
+        blank line) both before and after a chunk. The default algorithm
+        anchors the change at the front of that range, leaving both
+        identical lines at the end, which does not always produce the
+        best-looking diff. Since the lines are identical, we can shift the
+        chunk so that one appears before the change and one after it,
+        rather than both after.
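+
+        For example, when a new paragraph is inserted just before an
+        existing blank line, the raw diff may anchor the change at the
+        blank line; shifting slides the chunk so the blank line stays as
+        context and the inserted lines form one contiguous block.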
+        """
+        i = j = 0
+        i_end = data.length
+
+        while True:
+            # Scan forward in order to find the start of a run of changes.
+            while i < i_end and not data.modified.get(i, False):
+                i += 1
+
+                while other_data.modified.get(j, False):
+                    j += 1
+
+            if i == i_end:
+                return
+
+            start = i
+
+            # Find the end of these changes
+            i += 1
+            while data.modified.get(i, False):
+                i += 1
+
+            while other_data.modified.get(j, False):
+                j += 1
+
+            while True:
+                run_length = i - start
+
+                # Move the changed chunks back as long as the previous
+                # unchanged line matches the last changed line.
+                # This merges with the previous changed chunks.
+                while start != 0 and data.data[start - 1] == data.data[i - 1]:
+                    start -= 1
+                    i -= 1
+
+                    data.modified[start] = True
+                    data.modified[i] = False
+
+                    while data.modified.get(start - 1, False):
+                        start -= 1
+
+                    j -= 1
+                    while other_data.modified.get(j, False):
+                        j -= 1
+
+                # Record the end of the changed run at the last point where
+                # it corresponds to a changed run in the other data set.
+                # If it's equal to i_end, then we didn't find a corresponding
+                # point.
+                if other_data.modified.get(j - 1, False):
+                    corresponding = i
+                else:
+                    corresponding = i_end
+
+                # Move the changed region forward as long as the first
+                # changed line is the same as the following unchanged line.
+                while i != i_end and data.data[start] == data.data[i]:
+                    data.modified[start] = False
+                    data.modified[i] = True
+
+                    start += 1
+                    i += 1
+
+                    while data.modified.get(i, False):
+                        i += 1
+
+                    j += 1
+                    while other_data.modified.get(j, False):
+                        j += 1
+                        corresponding = i
+
+                if run_length == i - start:
+                    break
+
+            # Move the fully-merged run back to a corresponding run in the
+            # other data set, if we can.
+            while corresponding < i:
+                start -= 1
+                i -= 1
+
+                data.modified[start] = True
+                data.modified[i] = False
+
+                j -= 1
+                while other_data.modified.get(j, False):
+                    j -= 1
+
+    def _discard_confusing_lines(self):
+        def build_discard_list(data, discards, counts):
+            many = 5 * self._very_approx_sqrt(data.length / 64)
+
+            for i, item in enumerate(data.data):
+                if item != 0:
+                    num_matches = counts[item]
+
+                    if num_matches == 0:
+                        discards[i] = self.DISCARD_FOUND
+                    elif num_matches > many:
+                        discards[i] = self.DISCARD_CANCEL
+
+        def scan_run(discards, i, length, index_func):
+            consec = 0
+
+            for j in xrange(length):
+                index = index_func(i, j)
+                discard = discards[index]
+
+                if j >= 8 and discard == self.DISCARD_FOUND:
+                    break
+
+                if discard == self.DISCARD_FOUND:
+                    consec += 1
+                else:
+                    consec = 0
+
+                    if discard == self.DISCARD_CANCEL:
+                        discards[index] = self.DISCARD_NONE
+
+                if consec == 3:
+                    break
+
+        def check_discard_runs(data, discards):
+            i = 0
+            while i < data.length:
+                # Cancel the provisional discards that are not in the middle
+                # of a run of discards
+                if discards[i] == self.DISCARD_CANCEL:
+                    discards[i] = self.DISCARD_NONE
+                elif discards[i] == self.DISCARD_FOUND:
+                    # We found a provisional discard
+                    provisional = 0
+
+                    # Find the end of this run of discardable lines and count
+                    # how many are provisionally discardable.
+                    j = i
+                    while j < data.length:
+                        if discards[j] == self.DISCARD_NONE:
+                            break
+                        elif discards[j] == self.DISCARD_CANCEL:
+                            provisional += 1
+                        j += 1
+
+                    # Cancel the provisional discards at the end and shrink
+                    # the run.
+                    while j > i and discards[j - 1] == self.DISCARD_CANCEL:
+                        j -= 1
+                        discards[j] = 0
+                        provisional -= 1
+
+                    length = j - i
+
+                    # If 1/4 of the lines are provisional, cancel discarding
+                    # all the provisional lines in the run.
+                    if provisional * 4 > length:
+                        while j > i:
+                            j -= 1
+                            if discards[j] == self.DISCARD_CANCEL:
+                                discards[j] = self.DISCARD_NONE
+                    else:
+                        minimum = 1 + self._very_approx_sqrt(length / 4)
+                        j = 0
+                        consec = 0
+                        while j < length:
+                            if discards[i + j] != self.DISCARD_CANCEL:
+                                consec = 0
+                            else:
+                                consec += 1
+                                if minimum == consec:
+                                    j -= consec
+                                elif minimum < consec:
+                                    discards[i + j] = self.DISCARD_NONE
+
+                            j += 1
+
+                        scan_run(discards, i, length, lambda x,y: x + y)
+                        i += length - 1
+                        scan_run(discards, i, length, lambda x,y: x - y)
+
+                i += 1
+
+        def discard_lines(data, discards):
+            j = 0
+            for i, item in enumerate(data.data):
+                if self.minimal_diff or discards[i] == self.DISCARD_NONE:
+                    data.undiscarded[j] = item
+                    data.real_indexes[j] = i
+                    j += 1
+                else:
+                    data.modified[i] = True
+
+            data.undiscarded_lines = j
+
+
+        self.a_data.undiscarded = [0] * self.a_data.length
+        self.b_data.undiscarded = [0] * self.b_data.length
+        self.a_data.real_indexes = [0] * self.a_data.length
+        self.b_data.real_indexes = [0] * self.b_data.length
+        a_discarded = [0] * self.a_data.length
+        b_discarded = [0] * self.b_data.length
+        a_code_counts = [0] * (1 + self.last_code)
+        b_code_counts = [0] * (1 + self.last_code)
+
+        for item in self.a_data.data:
+            a_code_counts[item] += 1
+
+        for item in self.b_data.data:
+            b_code_counts[item] += 1
+
+        build_discard_list(self.a_data, a_discarded, b_code_counts)
+        build_discard_list(self.b_data, b_discarded, a_code_counts)
+
+        check_discard_runs(self.a_data, a_discarded)
+        check_discard_runs(self.b_data, b_discarded)
+
+        discard_lines(self.a_data, a_discarded)
+        discard_lines(self.b_data, b_discarded)
+
+
+    def _very_approx_sqrt(self, i):
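+        # Roughly sqrt(i), rounded down to a power of two:
+        # e.g. _very_approx_sqrt(64) == 8 and _very_approx_sqrt(63) == 4.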
+        result = 1
+        i /= 4
+        while i > 0:
+            i /= 4
+            result *= 2
+
+        return result
+
+ALPHANUM_RE = re.compile(r'\w')
+WHITESPACE_RE = re.compile(r'\s')
+
+def get_line_changed_regions(oldline, newline):
+    if oldline is None or newline is None:
+        return (None, None)
+
+    # Use the SequenceMatcher directly. It seems to give us better results
+    # for this. We should investigate steps to move to the new differ.
+    differ = SequenceMatcher(None, oldline, newline)
+
+    # This thresholds our results -- we don't want to show inter-line diffs if
+    # most of the line has changed, unless those lines are very short.
+
+    # FIXME: just a plain, linear threshold is pretty crummy here.  Short
+    # changes in a short line get lost.  I haven't yet thought of a fancy
+    # nonlinear test.
+    if differ.ratio() < 0.6:
+        return (None, None)
+
+    oldchanges = []
+    newchanges = []
+    back = (0, 0)
+
+    for tag, i1, i2, j1, j2 in differ.get_opcodes():
+        if tag == "equal":
+            if (i2 - i1 < 3) or (j2 - j1 < 3):
+                back = (j2 - j1, i2 - i1)
+            continue
+
+        oldstart, oldend = i1 - back[0], i2
+        newstart, newend = j1 - back[1], j2
+
+        if oldchanges != [] and oldstart <= oldchanges[-1][1] < oldend:
+            oldchanges[-1] = (oldchanges[-1][0], oldend)
+        elif not oldline[oldstart:oldend].isspace():
+            oldchanges.append((oldstart, oldend))
+
+        if newchanges != [] and newstart <= newchanges[-1][1] < newend:
+            newchanges[-1] = (newchanges[-1][0], newend)
+        elif not newline[newstart:newend].isspace():
+            newchanges.append((newstart, newend))
+
+        back = (0, 0)
+
+    return (oldchanges, newchanges)
+
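+# For reference, get_line_changed_regions("foo bar baz", "foo quux baz")
+# returns ([(4, 7)], [(4, 8)]): the changed regions covering "bar"/"quux".
+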
+def get_chunks(a, b):
+    def diff_line(vlinenum, oldlinenum, newlinenum, oldline, newline):
+        # This function accesses the variable meta, defined in an outer context.
+        if oldline and newline and oldline != newline:
+            oldregion, newregion = get_line_changed_regions(oldline, newline)
+        else:
+            oldregion = newregion = []
+
+        is_whitespace = (oldlinenum, newlinenum) in meta['whitespace_lines']
+
+        result = [vlinenum,
+                  oldlinenum, newlinenum,
+                  oldregion, newregion,
+                  is_whitespace]
+
+        if oldlinenum and oldlinenum in meta.get('moved', {}):
+            destination = meta["moved"][oldlinenum]
+            result.append(destination)
+        elif newlinenum and newlinenum in meta.get('moved', {}):
+            destination = meta["moved"][newlinenum]
+            result.append(destination)
+
+        return result
+
+    def new_chunk(lines, start, end, collapsable=False,
+                  tag='equal', meta=None):
+        if not meta:
+            meta = {}
+
+        return {
+            'lines': lines[start:end],
+            'numlines': end - start,
+            'change': tag,
+            'collapsable': collapsable,
+            'meta': meta,
+        }
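+
+    # Each chunk yielded below looks like, e.g.:
+    #   {'lines': [...], 'numlines': 3, 'change': 'insert',
+    #    'collapsable': False, 'meta': {}}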
+
+    a_num_lines = len(a)
+    b_num_lines = len(b)
+
+    linenum = 1
+
+    ignore_space = True
+
+    differ = MyersDiffer(a, b, ignore_space=ignore_space)
+
+    context_num_lines = 3
+    collapse_threshold = 2 * context_num_lines + 3
+
+    for tag, i1, i2, j1, j2, meta in opcodes_with_metadata(differ):
+        numlines = max(i2-i1, j2-j1)
+
+        lines = map(diff_line,
+                    xrange(linenum, linenum + numlines),
+                    xrange(i1 + 1, i2 + 1), xrange(j1 + 1, j2 + 1),
+                    a[i1:i2], b[j1:j2])
+
+        if tag == 'equal' and numlines > collapse_threshold:
+            last_range_start = numlines - context_num_lines
+
+            if linenum == 1:
+                yield new_chunk(lines, 0, last_range_start, True)
+                yield new_chunk(lines, last_range_start, numlines)
+            else:
+                yield new_chunk(lines, 0, context_num_lines)
+
+                if i2 == a_num_lines and j2 == b_num_lines:
+                    yield new_chunk(lines, context_num_lines, numlines, True)
+                else:
+                    yield new_chunk(lines, context_num_lines,
+                                    last_range_start, True)
+                    yield new_chunk(lines, last_range_start, numlines)
+        else:
+            yield new_chunk(lines, 0, numlines, False, tag, meta)
+
+        linenum += numlines
+
+def is_valid_move_range(lines):
+    """Determines if a move range is valid and should be included.
+
+    This performs some tests to try to eliminate trivial changes that
+    shouldn't have moves associated.
+
+    Specifically, a move range is valid if it has at least one line
+    with alpha-numeric characters and is at least 4 characters long when
+    stripped.
+    """
+    for line in lines:
+        line = line.strip()
+
+        if len(line) >= 4 and ALPHANUM_RE.search(line):
+            return True
+
+    return False
+
+
+def opcodes_with_metadata(differ):
+    """Returns opcodes from the differ with extra metadata.
+
+    This is a wrapper around a differ's get_opcodes function, which returns
+    extra metadata along with each range. That metadata includes information
+    on moved blocks of code and whitespace-only lines.
+
+    This returns a list of opcodes as tuples in the form of
+    (tag, i1, i2, j1, j2, meta).
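+
+    For example, a "replace" chunk covering only line 1 of both files,
+    where the lines differ only in whitespace, carries
+    meta["whitespace_lines"] == [(1, 1)] and, since every line in the
+    chunk qualifies, meta["whitespace_chunk"] == True.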
+    """
+    groups = []
+    removes = {}
+    inserts = []
+
+    for tag, i1, i2, j1, j2 in differ.get_opcodes():
+        meta = {
+            # True if this chunk is only whitespace.
+            "whitespace_chunk": False,
+
+            # List of tuples (x,y), with whitespace changes.
+            "whitespace_lines": [],
+        }
+
+        if tag == 'replace':
+            # replace groups are good for whitespace only changes.
+            assert (i2 - i1) == (j2 - j1)
+
+            for i, j in zip(xrange(i1, i2), xrange(j1, j2)):
+                if (WHITESPACE_RE.sub("", differ.a[i]) ==
+                    WHITESPACE_RE.sub("", differ.b[j])):
+                    # Both original lines are equal when removing all
+                    # whitespace, so include their original line number in
+                    # the meta dict.
+                    meta["whitespace_lines"].append((i + 1, j + 1))
+
+            # If all lines are considered to have only whitespace change,
+            # the whole chunk is considered a whitespace-only chunk.
+            if len(meta["whitespace_lines"]) == (i2 - i1):
+                meta["whitespace_chunk"] = True
+
+        group = (tag, i1, i2, j1, j2, meta)
+        groups.append(group)
+
+        # Store delete/insert ranges for later lookup. We will be building
+        # keys that in most cases will be unique for the particular block
+        # of text being inserted/deleted. There is a chance of collision,
+        # so we store a list of matching groups under that key.
+        #
+        # Later, we will loop through the keys and attempt to find insert
+        # keys/groups that match remove keys/groups.
+        if tag == 'delete':
+            for i in xrange(i1, i2):
+                line = differ.a[i].strip()
+
+                if line:
+                    removes.setdefault(line, []).append((i, group))
+        elif tag == 'insert':
+            inserts.append(group)
+
+    # We now need to figure out all the moved locations.
+    #
+    # At this point, we know all the inserted groups, and all the individually
+    # deleted lines. We'll be going through and finding consecutive groups
+    # of matching inserts/deletes that represent a move block.
+    #
+    # The algorithm will be documented as we go in the code.
+    #
+    # We start by looping through all the inserted groups.
+    for itag, ii1, ii2, ij1, ij2, imeta in inserts:
+        # Store some state on the range we'll be working with inside this
+        # insert group.
+        #
+        # i_move_cur is the current location inside the insert group
+        # (from ij1 through ij2).
+        #
+        # i_move_range is the current range of consecutive lines that we'll
+        # use for a move. Each line in this range has a corresponding
+        # consecutive delete line.
+        #
+        # r_move_ranges represents deleted move ranges. The key is a
+        # string in the form of "{i1}-{i2}-{j1}-{j2}", with those positions
+        # taken from the remove group for the line. The value
+        # is an array of tuples of (r_start, r_end, r_group). These values
+        # are used to quickly locate deleted lines we've found that match
+        # the inserted lines, so we can assemble ranges later.
+        i_move_cur = ij1
+        i_move_range = (i_move_cur, i_move_cur)
+        r_move_ranges = {} # key -> [(start, end, group)]
+
+        # Loop through every location from ij1 through ij2 until we've
+        # reached the end.
+        while i_move_cur <= ij2:
+            try:
+                iline = differ.b[i_move_cur].strip()
+            except IndexError:
+                iline = None
+
+            if iline is not None and iline in removes:
+                # The inserted line at this location has a corresponding
+                # removed line.
+                #
+                # If there's already some information on removed line ranges
+                # for this particular move block we're processing then we'll
+                # update the range.
+                #
+                # The way we do that is to find each removed line that
+                # matches this inserted line, and for each of those find
+                # out if there's an existing move range that the found
+                # removed line immediately follows. If there is, we update
+                # the existing range.
+                #
+                # If there isn't any move information for this line, we'll
+                # simply add it to the move ranges.
+                for ri, rgroup in removes.get(iline, []):
+                    key = "%s-%s-%s-%s" % rgroup[1:5]
+
+                    if r_move_ranges:
+                        for i, r_move_range in \
+                            enumerate(r_move_ranges.get(key, [])):
+                            # If the remove information for the line is next in
+                            # the sequence for this calculated move range...
+                            if ri == r_move_range[1] + 1:
+                                r_move_ranges[key][i] = (r_move_range[0], ri,
+                                                         rgroup)
+                                break
+                    else:
+                        # We don't have any move ranges yet, so it's time to
+                        # build one based on any removed lines we find that
+                        # match the inserted line.
+                        r_move_ranges[key] = [(ri, ri, rgroup)]
+
+                # On to the next line in the sequence...
+                i_move_cur += 1
+            else:
+                # We've reached the very end of the insert group. See if
+                # we have anything that looks like a move.
+                if r_move_ranges:
+                    r_move_range = None
+
+                    # Go through every range of lines we've found and
+                    # find the longest.
+                    #
+                    # The longest move range wins. If we find two ranges that
+                    # are equal, though, we'll ignore both. The idea is that
+                    # if we have two identical moves, then it's probably
+                    # common enough code that we don't want to show the move.
+                    # An example might be some standard part of a comment
+                    # block, with no real changes in content.
+                    #
+                    # Note that with the current approach, finding duplicate
+                    # moves doesn't cause us to reset the winning range
+                    # to the second-highest identical match. We may want to
+                    # do that down the road, but it means additional state,
+                    # and this is hopefully uncommon enough to not be a real
+                    # problem.
+                    for ranges in r_move_ranges.itervalues():
+                        for r1, r2, rgroup in ranges:
+                            if not r_move_range:
+                                r_move_range = (r1, r2, rgroup)
+                            else:
+                                len1 = r_move_range[2] - r_move_range[1]
+                                len2 = r2 - r1
+
+                                if len1 < len2:
+                                    r_move_range = (r1, r2, rgroup)
+                                elif len1 == len2:
+                                    # If there are two that are the same, it
+                                    # may be common code that we don't want to
+                                    # see moves for. Comments, for example.
+                                    r_move_range = None
+
+                    # If we have a move range, see if it's one we want to
+                    # include or filter out. Some moves are not impressive
+                    # enough to display. For example, a small portion of a
+                    # comment, or whitespace-only changes.
+                    if (r_move_range and
+                        is_valid_move_range(
+                            differ.a[r_move_range[0]:r_move_range[1]])):
+
+                        # Rebuild the insert and remove ranges based on
+                        # where we are now and which range we won.
+                        #
+                        # The new ranges will be actual lists of positions,
+                        # rather than a beginning and end. These will be
+                        # provided to the renderer.
+                        #
+                        # The ranges expected by the renderers are 1-based,
+                        # whereas our calculations for this algorithm are
+                        # 0-based, so we add 1 to the numbers.
+                        #
+                        # The upper boundaries passed to the range() function
+                        # must actually be one higher than the value we want.
+                        # So, for r_move_range, we actually increment by 2.
+                        # We only increment i_move_cur by one, because
+                        # i_move_cur already factored in the + 1 by being
+                        # at the end of the while loop.
+                        # Grab the meta dict of the winning remove group
+                        # before r_move_range is overwritten below. (Using
+                        # the loop variable rgroup here would pick up
+                        # whichever group happened to be iterated last.)
+                        rmeta = r_move_range[2][-1]
+
+                        i_move_range = range(i_move_range[0] + 1,
+                                             i_move_cur + 1)
+                        r_move_range = range(r_move_range[0] + 1,
+                                             r_move_range[1] + 2)
+
+                        rmeta.setdefault('moved', {}).update(
+                            dict(zip(r_move_range, i_move_range)))
+                        imeta.setdefault('moved', {}).update(
+                            dict(zip(i_move_range, r_move_range)))
+
+                # Reset the state for the next range.
+                i_move_cur += 1
+                i_move_range = (i_move_cur, i_move_cur)
+                r_move_ranges = {}
+
+    return groups
+
+def split_lines(raw):
+    if raw and raw[-1] != '\n':
+        raw += '\n'
+
+    lines = NEWLINES_RE.split(raw or '')
+
+    # Remove the trailing newline, now that we've split this. This will
+    # prevent a duplicate line number at the end of the diff.
+    del lines[-1]
+
+    return lines
+
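+# For reference: split_lines("a\r\nb\n") == split_lines("a\nb") == ['a', 'b'];
+# the forced trailing newline avoids a spurious empty final entry.
+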
+def _test(oldfile, newfile):
+    old, new = open(oldfile, 'r'), open(newfile, 'r')
+    a, b = split_lines(old.read()), split_lines(new.read())
+
+    chunks = list(get_chunks(a, b))
+    old.close()
+    new.close()
+
+    return chunks
+
+def main():
+    import pprint
+    import sys
+
+    pprint.pprint(_test(sys.argv[1], sys.argv[2]))
+
+if __name__ == "__main__":
+    main()
diff --git a/sweettooth/review/diffview.py b/sweettooth/review/diffview.py
new file mode 100644
index 0000000..b6fc431
--- /dev/null
+++ b/sweettooth/review/diffview.py
@@ -0,0 +1,98 @@
+
+import pygments.formatters
+
+from review.diffutils import get_chunks, split_lines
+
+# Stolen from ReviewBoard
+# See the top of diffutils.py for details
+class NoWrapperHtmlFormatter(pygments.formatters.HtmlFormatter):
+    """An HTML Formatter for Pygments that don't wrap items in a div."""
+
+    def _wrap_div(self, inner):
+        # Method called by the formatter to wrap the contents of inner.
+        # Inner is a list of tuples containing formatted code. If the first item
+        # in the tuple is zero, then it's a wrapper, so we should ignore it.
+        for tup in inner:
+            if tup[0]:
+                yield tup
+
+REGION = u'<span class="%s">%%s</span>'
+
+UNCHANGED_REGION = REGION % (u'inline unchanged',)
+CHANGED_REGION = REGION % (u'inline changed',)
+
+EQUAL_REGION = REGION % (u'line equal',)
+INSERTED_REGION = REGION % (u'line inserted',)
+DELETED_REGION = REGION % (u'line deleted',)
+REPLACED_REGION = REGION % (u'line replaced',)
+
+def get_line_region_markup(line, regions):
+    if regions == []:
+        # The line markup is already escaped (or pygments-highlighted)
+        # by the caller, so it is used as-is here.
+        return [UNCHANGED_REGION % line]
+
+    parts = []
+    last_end = 0
+    for start, end in regions:
+        parts.append(UNCHANGED_REGION % line[last_end:start])
+        parts.append(CHANGED_REGION % line[start:end])
+        last_end = end
+
+    parts.append(UNCHANGED_REGION % line[last_end:len(line)])
+    return parts
+
+def get_equal_markup(chunk, old, new):
+    lines = chunk['lines']
+    start = lines[0]
+    end = lines[-1]
+    contents = old[start[1]-1:end[1]]
+    markup = [EQUAL_REGION % (L,) for L in contents]
+    return markup, markup
+
+def get_inserted_markup(chunk, old, new):
+    lines = chunk['lines']
+    start = lines[0]
+    end = lines[-1]
+    contents = new[start[2]-1:end[2]]
+    return None, [INSERTED_REGION % (L,) for L in contents]
+
+def get_deleted_markup(chunk, old, new):
+    lines = chunk['lines']
+    start = lines[0]
+    end = lines[-1]
+    contents = old[start[1]-1:end[1]]
+    return [DELETED_REGION % (L,) for L in contents], None
+
+def get_replaced_markup(chunk, old, new):
+    lines = chunk['lines']
+
+    oldlines, newlines = [], []
+    for line in lines:
+        oldcontent = old[line[1] - 1]
+        newcontent = new[line[2] - 1]
+        oldregion = line[3]
+        newregion = line[4]
+        oldlines.append(REPLACED_REGION % \
+                            (''.join(get_line_region_markup(oldcontent, oldregion)),))
+        newlines.append(REPLACED_REGION % \
+                            (''.join(get_line_region_markup(newcontent, newregion)),))
+
+    return oldlines, newlines
+
+OPCODES = dict(equal   = get_equal_markup,
+               insert  = get_inserted_markup,
+               delete  = get_deleted_markup,
+               replace = get_replaced_markup)
+
+def get_chunks_html(old, new):
+    oldchunks, newchunks = [], []
+    for chunk in get_chunks(old, new):
+        opcode = chunk['change']
+        oldchunk, newchunk = OPCODES[opcode](chunk, old, new)
+
+        if oldchunk is not None:
+            oldchunks.extend(oldchunk)
+
+        if newchunk is not None:
+            newchunks.extend(newchunk)
+
+    return oldchunks, newchunks
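
A minimal sketch of how the diffview helpers fit together (hypothetical
inputs; the lines are assumed to be pre-escaped or pygments-highlighted
HTML, as the callers below arrange):

    from review.diffview import get_chunks_html

    old = ['<span>a</span>', '<span>b</span>']
    new = ['<span>a</span>', '<span>c</span>']
    old_html, new_html = get_chunks_html(old, new)
    # old_html[0] and new_html[0] come out wrapped in "line equal" spans;
    # old_html[1] and new_html[1] in "line replaced" spans with the inline
    # changed regions marked.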
diff --git a/sweettooth/review/htmldiff.py b/sweettooth/review/htmldiff.py
new file mode 100644
index 0000000..af5ee62
--- /dev/null
+++ b/sweettooth/review/htmldiff.py
@@ -0,0 +1,70 @@
+
+# ./review $ PYTHONPATH=.. python htmldiff.py old new > diff.html
+
+import sys
+
+import pygments.util
+import pygments.lexers
+import pygments.formatters
+
+from django.utils.html import escape
+
+from review.diffview import get_chunks_html, split_lines, NoWrapperHtmlFormatter
+
+FORMATTER = NoWrapperHtmlFormatter(style="borland", cssclass="code")
+
+def highlight_file(filename, raw):
+
+    try:
+        lexer = pygments.lexers.guess_lexer_for_filename(filename, raw)
+    except pygments.util.ClassNotFound:
+        # released pygments doesn't yet have .json
+        # so hack around it here.
+        if filename.endswith('.json'):
+            lexer = pygments.lexers.get_lexer_by_name('js')
+        else:
+            lexer = pygments.lexers.get_lexer_by_name('text')
+
+    return pygments.highlight(raw, lexer, FORMATTER)
+
+def htmldiff(oldfile, newfile):
+    old, new = open(oldfile, 'r'), open(newfile, 'r')
+    oldcontent, newcontent = old.read(), new.read()
+    old.close()
+    new.close()
+
+    oldmarkup = highlight_file(oldfile, oldcontent)
+    newmarkup = highlight_file(newfile, newcontent)
+
+    oldlines = split_lines(oldmarkup)
+    newlines = split_lines(newmarkup)
+
+    old_htmls, new_htmls = get_chunks_html(oldlines, newlines)
+    old_html, new_html = '\n'.join(old_htmls), '\n'.join(new_htmls)
+
+    return """
+<!doctype html>
+<html>
+  <head>
+    <style>
+    %s
+.code { white-space: pre; }
+.code .deleted { background-color: red; }
+.code .inserted { background-color: green; }
+.code .inline.changed { background-color: lightgreen; }
+    </style>
+  </head>
+  <body>
+    <table width="100%%"><tr>
+      <td width="50%%"><code class="code">%s</code></td>
+      <td width="50%%"><code class="code">%s</code></td>
+    </tr></table>
+  </body>
+</html>
+""" % (FORMATTER.get_style_defs(), old_html, new_html)
+
+def main():
+    print htmldiff(sys.argv[1], sys.argv[2])
+
+if __name__ == "__main__":
+    main()
diff --git a/sweettooth/review/urls.py b/sweettooth/review/urls.py
index 2ef6c17..1017530 100644
--- a/sweettooth/review/urls.py
+++ b/sweettooth/review/urls.py
@@ -12,6 +12,7 @@ urlpatterns = patterns('',
         name='review-list'),
     url(r'^ajax/get-files/(?P<pk>\d+)', views.ajax_get_files_view, name='review-ajax-files'),
     url(r'^ajax/get-file-list/(?P<pk>\d+)', views.ajax_get_file_list_view, name='review-ajax-file-list'),
+    url(r'^ajax/get-file-diff/(?P<pk>\d+)', views.ajax_get_file_diff_view, name='review-ajax-file-diff'),
     url(r'^submit/(?P<pk>\d+)', views.submit_review_view, name='review-submit'),
 
     url(r'^(?P<pk>\d+)', views.review_version_view, name='review-version'),
diff --git a/sweettooth/review/views.py b/sweettooth/review/views.py
index 0794a36..bdaaf0b 100644
--- a/sweettooth/review/views.py
+++ b/sweettooth/review/views.py
@@ -11,10 +11,12 @@ from django.conf import settings
 from django.core.mail import send_mail
 from django.core.urlresolvers import reverse
 from django.contrib import messages
-from django.http import HttpResponseForbidden
+from django.http import HttpResponseForbidden, Http404
 from django.shortcuts import redirect
 from django.template.loader import render_to_string
+from django.utils.html import escape
 
+from review.diffview import get_chunks_html, split_lines, NoWrapperHtmlFormatter
 from review.models import CodeReview, ChangeStatusLog, get_all_reviewers
 from extensions import models
 
@@ -30,7 +32,8 @@ IMAGE_TYPES = {
     '.svg':  'image/svg+xml',
 }
 
-FORMATTER = pygments.formatters.HtmlFormatter(style="borland", cssclass="code")
+code_formatter = pygments.formatters.HtmlFormatter(style="borland", cssclass="code")
+diff_formatter = NoWrapperHtmlFormatter(style="borland")
 
 def can_review_extension(user, extension):
     if user == extension.creator:
@@ -50,7 +53,7 @@ def can_approve_extension(user, extension):
 
     return False
 
-def highlight_file(filename, raw):
+def highlight_file(filename, raw, formatter):
     try:
         lexer = pygments.lexers.guess_lexer_for_filename(filename, raw)
     except pygments.util.ClassNotFound:
@@ -61,7 +64,7 @@ def highlight_file(filename, raw):
         else:
             lexer = pygments.lexers.get_lexer_by_name('text')
 
-    return pygments.highlight(raw, lexer, FORMATTER)
+    return pygments.highlight(raw, lexer, formatter)
 
 def html_for_file(filename, raw):
     base, extension = os.path.splitext(filename)
@@ -76,9 +79,36 @@ def html_for_file(filename, raw):
                     num_lines=0)
 
     else:
-        return dict(html=highlight_file(filename, raw),
+        return dict(html=highlight_file(filename, raw, code_formatter),
                     num_lines=len(raw.strip().splitlines()))
 
+def get_zipfiles(version):
+    extension = version.extension
+
+    new_zipfile = version.get_zipfile('r')
+    old_zipfile = extension.latest_version.get_zipfile('r')
+
+    return old_zipfile, new_zipfile
+
+def get_diff(old_zipfile, new_zipfile, filename, highlight):
+    old, new = old_zipfile.open(filename, 'r'), new_zipfile.open(filename, 'r')
+    oldcontent, newcontent = old.read(), new.read()
+    old.close()
+    new.close()
+
+    if highlight:
+        oldmarkup = highlight_file(filename, oldcontent, diff_formatter)
+        newmarkup = highlight_file(filename, newcontent, diff_formatter)
+    else:
+        oldmarkup = escape(oldcontent)
+        newmarkup = escape(newcontent)
+
+    oldlines = split_lines(oldmarkup)
+    newlines = split_lines(newmarkup)
+
+    old_htmls, new_htmls = get_chunks_html(oldlines, newlines)
+    return '\n'.join(old_htmls), '\n'.join(new_htmls)
+
 @ajax_view
 @model_view(models.ExtensionVersion)
 def ajax_get_file_list_view(request, obj):
@@ -87,10 +117,7 @@ def ajax_get_file_list_view(request, obj):
     if not can_review_extension(request.user, extension):
         return HttpResponseForbidden()
 
-    version, extension = obj, obj.extension
-
-    new_zipfile = version.get_zipfile('r')
-    old_zipfile = extension.latest_version.get_zipfile('r')
+    old_zipfile, new_zipfile = get_zipfiles(version)
 
     new_filelist = set(new_zipfile.namelist())
     old_filelist = set(old_zipfile.namelist())
@@ -105,6 +132,39 @@ def ajax_get_file_list_view(request, obj):
 
 @ajax_view
 @model_view(models.ExtensionVersion)
+def ajax_get_file_diff_view(request, obj):
+    version, extension = obj, obj.extension
+
+    if not can_review_extension(request.user, extension):
+        return HttpResponseForbidden()
+
+    old_zipfile, new_zipfile = get_zipfiles(version)
+
+    filename = request.GET['filename']
+    # Query parameters arrive as strings, so treat any value other than
+    # "false" as a request for syntax highlighting.
+    highlight = request.GET.get('highlight', 'true') != 'false'
+
+    new_filelist = set(new_zipfile.namelist())
+    old_filelist = set(old_zipfile.namelist())
+
+    diff = None
+
+    if filename in old_filelist:
+        if filename in new_filelist:
+            operation = 'both'
+            diff = get_diff(old_zipfile, new_zipfile,
+                            filename, highlight)
+        else:
+            operation = 'deleted'
+    elif filename in new_filelist:
+        operation = 'added'
+    else:
+        raise Http404()
+
+    return dict(operation=operation,
+                diff=diff)
+
 @ajax_view
 @model_view(models.ExtensionVersion)
 def ajax_get_files_view(request, obj):
     if not can_review_extension(request.user, obj.extension):
         return HttpResponseForbidden()

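For anyone poking at the new endpoint, a minimal sketch with Django's test
client (the /review/ URL prefix, the JSON serialization performed by the
ajax_view decorator, and the credentials are all assumptions here):

    from django.test.client import Client

    c = Client()
    c.login(username='reviewer', password='secret')  # hypothetical account
    resp = c.get('/review/ajax/get-file-diff/1',
                 {'filename': 'extension.js'})
    # Expected payload: {"operation": "both", "diff": [old_html, new_html]}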

