[meld] Inline highlighting performance improvements



commit f387feb5a9240b2f0538ac51215e62d6c3d8ea1d
Author: Piotr Piastucki <leech miranda gmail com>
Date:   Sun May 20 10:01:03 2012 +1000

    Inline highlighting performance improvements
    
    The default preprocessing in MyersSequenceMatcher removes all items
    that do not appear in the other file. It works pretty well for lines
    which are long and unique. However, it fails for character-based
    matching, as both files usually use the same alphabet.
    
    This patch adds a new matcher class called InlineMyersSequenceMatcher
    which provides additional optimization based on the assumption that a
    single character-based chunk must be at least 3 characters long.

 meld/filediff.py |    2 +-
 meld/matchers.py |   73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 1 deletions(-)
---
diff --git a/meld/filediff.py b/meld/filediff.py
index 1eaaee0..4f5f7ab 100644
--- a/meld/filediff.py
+++ b/meld/filediff.py
@@ -60,7 +60,7 @@ class CachedSequenceMatcher(object):
             self.cache[(text1, textn)][1] = time.time()
             return self.cache[(text1, textn)][0]
         except KeyError:
-            matcher = matchers.MyersSequenceMatcher(None, text1, textn)
+            matcher = matchers.InlineMyersSequenceMatcher(None, text1, textn)
             opcodes = matcher.get_opcodes()
             self.cache[(text1, textn)] = [opcodes, time.time()]
             return opcodes
diff --git a/meld/matchers.py b/meld/matchers.py
index 9182b4e..f5ed1bd 100644
--- a/meld/matchers.py
+++ b/meld/matchers.py
@@ -288,3 +288,76 @@ class MyersSequenceMatcher(difflib.SequenceMatcher):
         self.build_matching_blocks(lastsnake, snakes)
         self.postprocess()
         yield 1
+
+class InlineMyersSequenceMatcher(MyersSequenceMatcher):
+    
+    def preprocess_discard_nonmatching_lines(self, a, b):
+        aindex = self.aindex = {}
+        bindex = self.bindex = {}
+        n = len(a)
+        m = len(b)
+        if m > 2 and n > 2:
+            a2 = []
+            b2 = []
+            aset = set()
+            bset = set()
+            for i in range(n - 2):
+                aset.add((a[i], a[i+1], a[i+2]))
+            for i in range(m - 2):
+                bset.add((b[i], b[i+1], b[i+2]))
+            j = 0
+            c_2 = None
+            c_1 = None
+            matched_2 = False
+            matched_1 = False
+            for i, c in enumerate(b):
+                if (c_2, c_1, c) in aset:
+                    if not matched_2:
+                        b2.append(c_2)
+                        bindex[j] = i - 2
+                        j += 1
+                    if not matched_1:
+                        b2.append(c_1)
+                        bindex[j] = i - 1
+                        j += 1
+                    b2.append(c)
+                    bindex[j] = i
+                    j += 1
+                    matched_2 = matched_1 = True
+                else:
+                    matched_2 = matched_1
+                    matched_1 = False
+                c_2 = c_1
+                c_1 = c
+
+            k = 0
+            c_2 = None
+            c_1 = None
+            matched_2 = False
+            matched_1 = False
+            for i, c in enumerate(a):
+                if (c_2, c_1, c) in bset:
+                    if not matched_2:
+                        a2.append(c_2)
+                        aindex[k] = i - 2
+                        k += 1
+                    if not matched_1:
+                        a2.append(c_1)
+                        aindex[k] = i - 1
+                        k += 1
+                    a2.append(c)
+                    aindex[k] = i
+                    k += 1
+                    matched_2 = matched_1 = True
+                else:
+                    matched_2 = matched_1
+                    matched_1 = False
+                c_2 = c_1
+                c_1 = c
+            # Only keep the reduced sequences when enough elements (characters,
+            # for inline matching) were discarded to offset the indexing cost.
+            self.lines_discarded = m - j > 10 or n - k > 10
+            if self.lines_discarded:
+                a = a2
+                b = b2
+        return (a, b)



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]