[meld] Inline highlighting performance improvements
- From: Kai Willadsen <kaiw@src.gnome.org>
- To: commits-list@gnome.org
- Cc:
- Subject: [meld] Inline highlighting performance improvements
- Date: Sun, 20 May 2012 00:34:17 +0000 (UTC)
commit f387feb5a9240b2f0538ac51215e62d6c3d8ea1d
Author: Piotr Piastucki <leech.miranda@gmail.com>
Date: Sun May 20 10:01:03 2012 +1000
    Inline highlighting performance improvements

    The default preprocessing in MyersSequenceMatcher removes all items
    that do not appear in the other file. This works well for line-based
    matching, where lines are long and mostly unique, but it is largely
    ineffective for character-based matching: both files usually draw on
    the same alphabet, so almost no characters can be discarded.

    This patch adds a new matcher class, InlineMyersSequenceMatcher,
    which provides an additional optimization based on the assumption
    that a matching character-based chunk must be at least 3 characters
    long: any character that is not part of such a shared 3-character
    run can be discarded up front.
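
To make the alphabet point concrete, here is a small standalone sketch
(illustrative strings and names, not code from this patch): per-item
discarding keeps almost everything at character granularity, while
requiring a shared 3-character run prunes far more aggressively.

    # Per-character discard, as the parent class does per line: almost
    # every character of one string also occurs somewhere in the other.
    a = "the quick brown fox jumps over the lazy dog"
    b = "the quick red fox walks around the sleepy dog"
    kept = [c for c in a if c in set(b)]
    print(len(kept), "of", len(a), "characters survive")

    # The 3-character-run requirement is far more selective:
    trigrams_b = set(b[i:i + 3] for i in range(len(b) - 2))
    starts = [i for i in range(len(a) - 2) if a[i:i + 3] in trigrams_b]
    print(len(starts), "3-character windows of a also occur in b")
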
meld/filediff.py | 2 +-
meld/matchers.py | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 74 insertions(+), 1 deletions(-)
---
diff --git a/meld/filediff.py b/meld/filediff.py
index 1eaaee0..4f5f7ab 100644
--- a/meld/filediff.py
+++ b/meld/filediff.py
@@ -60,7 +60,7 @@ class CachedSequenceMatcher(object):
             self.cache[(text1, textn)][1] = time.time()
             return self.cache[(text1, textn)][0]
         except KeyError:
-            matcher = matchers.MyersSequenceMatcher(None, text1, textn)
+            matcher = matchers.InlineMyersSequenceMatcher(None, text1, textn)
             opcodes = matcher.get_opcodes()
             self.cache[(text1, textn)] = [opcodes, time.time()]
             return opcodes
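
The filediff.py change is a drop-in swap: InlineMyersSequenceMatcher keeps
the difflib.SequenceMatcher-style interface of its parent, so callers like
the one above are unchanged. A usage sketch (the strings are arbitrary):

    from meld import matchers

    matcher = matchers.InlineMyersSequenceMatcher(
        None, "the quick brown fox", "the quick red fox")
    # get_opcodes() yields the usual 'equal'/'replace'/'insert'/'delete'
    # opcodes, indexed into the original strings.
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        print(tag, i1, i2, j1, j2)
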
diff --git a/meld/matchers.py b/meld/matchers.py
index 9182b4e..f5ed1bd 100644
--- a/meld/matchers.py
+++ b/meld/matchers.py
@@ -288,3 +288,76 @@ class MyersSequenceMatcher(difflib.SequenceMatcher):
         self.build_matching_blocks(lastsnake, snakes)
         self.postprocess()
         yield 1
+
+class InlineMyersSequenceMatcher(MyersSequenceMatcher):
+
+    def preprocess_discard_nonmatching_lines(self, a, b):
+        aindex = self.aindex = {}
+        bindex = self.bindex = {}
+        n = len(a)
+        m = len(b)
+        if m > 2 and n > 2:
+            a2 = []
+            b2 = []
+            aset = set()
+            bset = set()
+            for i in range(n - 2):
+                aset.add((a[i], a[i+1], a[i+2]))
+            for i in range(m - 2):
+                bset.add((b[i], b[i+1], b[i+2]))
+            j = 0
+            c_2 = None
+            c_1 = None
+            matched_2 = False
+            matched_1 = False
+            for i, c in enumerate(b):
+                if (c_2, c_1, c) in aset:
+                    if not matched_2:
+                        b2.append(c_2)
+                        bindex[j] = i - 2
+                        j += 1
+                    if not matched_1:
+                        b2.append(c_1)
+                        bindex[j] = i - 1
+                        j += 1
+                    b2.append(c)
+                    bindex[j] = i
+                    j += 1
+                    matched_2 = matched_1 = True
+                else:
+                    matched_2 = matched_1
+                    matched_1 = False
+                c_2 = c_1
+                c_1 = c
+
+            k = 0
+            c_2 = None
+            c_1 = None
+            matched_2 = False
+            matched_1 = False
+            for i, c in enumerate(a):
+                if (c_2, c_1, c) in bset:
+                    if not matched_2:
+                        a2.append(c_2)
+                        aindex[k] = i - 2
+                        k += 1
+                    if not matched_1:
+                        a2.append(c_1)
+                        aindex[k] = i - 1
+                        k += 1
+                    a2.append(c)
+                    aindex[k] = i
+                    k += 1
+                    matched_2 = matched_1 = True
+                else:
+                    matched_2 = matched_1
+                    matched_1 = False
+                c_2 = c_1
+                c_1 = c
+            # We only use the optimised result if it's worthwhile. The constant
+            # is a heuristic for how many discarded characters are 'worthwhile'.
+            self.lines_discarded = m - j > 10 or n - k > 10
+            if self.lines_discarded:
+                a = a2
+                b = b2
+        return (a, b)
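
For reference, here is a standalone replica of the b-side scan above
(illustrative function name; the real method also runs the symmetric
a-side pass). The matched_1/matched_2 flags stop a character from being
emitted twice when consecutive 3-character windows both match, and the
index map records each kept character's position in the original
sequence, presumably so that opcodes computed on the filtered sequences
can be translated back during postprocessing.

    def discard_without_shared_trigram(seq, other):
        # All 3-character runs occurring in the other sequence.
        trigrams = set(zip(other, other[1:], other[2:]))
        kept, index = [], {}
        c_2 = c_1 = None
        matched_2 = matched_1 = False
        for i, c in enumerate(seq):
            if (c_2, c_1, c) in trigrams:
                if not matched_2:        # c_2 not yet emitted by an
                    kept.append(c_2)     # earlier overlapping window
                    index[len(kept) - 1] = i - 2
                if not matched_1:
                    kept.append(c_1)
                    index[len(kept) - 1] = i - 1
                kept.append(c)
                index[len(kept) - 1] = i
                matched_2 = matched_1 = True
            else:
                matched_2, matched_1 = matched_1, False
            c_2, c_1 = c_1, c
        return kept, index

    print(discard_without_shared_trigram("abXcdef", "zzcdezz"))
    # -> (['c', 'd', 'e'], {0: 3, 1: 4, 2: 5})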