[meld] Simplified version of the k-mer version of the Myers matcher
- From: Kai Willadsen <kaiw src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [meld] Simplified version of the k-mer version of the Myers matcher
- Date: Sun, 20 May 2012 00:34:22 +0000 (UTC)
commit 81cc819bca57e442a3a219e42f98fd0a58a74af4
Author: Kai Willadsen <kai willadsen gmail com>
Date: Mon May 7 09:50:38 2012 +1000
Simplified version of the k-mer version of the Myers matcher
meld/matchers.py | 97 +++++++++++++++++-------------------------------------
1 files changed, 30 insertions(+), 67 deletions(-)
---
diff --git a/meld/matchers.py b/meld/matchers.py
index f5ed1bd..fc40f3e 100644
--- a/meld/matchers.py
+++ b/meld/matchers.py
@@ -292,72 +292,35 @@ class MyersSequenceMatcher(difflib.SequenceMatcher):
class InlineMyersSequenceMatcher(MyersSequenceMatcher):
def preprocess_discard_nonmatching_lines(self, a, b):
- aindex = self.aindex = {}
- bindex = self.bindex = {}
- n = len(a)
- m = len(b)
- if m > 2 and n > 2:
- a2 = []
- b2 = []
- aset = set()
- bset = set()
- for i in range(n - 2):
- aset.add((a[i], a[i+1], a[i+2]))
- for i in range(m - 2):
- bset.add((b[i], b[i+1], b[i+2]))
- j = 0
- c_2 = None
- c_1 = None
- matched_2 = False
- matched_1 = False
- for i, c in enumerate(b):
- if (c_2, c_1, c) in aset:
- if not matched_2:
- b2.append(c_2)
- bindex[j] = i - 2
- j += 1
- if not matched_1:
- b2.append(c_1)
- bindex[j] = i - 1
- j += 1
- b2.append(c)
- bindex[j] = i
- j += 1
- matched_2 = matched_1 = True
- else:
- matched_2 = matched_1
- matched_1 = False
- c_2 = c_1
- c_1 = c
- k = 0
- c_2 = None
- c_1 = None
- matched_2 = False
- matched_1 = False
- for i, c in enumerate(a):
- if (c_2, c_1, c) in bset:
- if not matched_2:
- a2.append(c_2)
- aindex[k] = i - 2
- k += 1
- if not matched_1:
- a2.append(c_1)
- aindex[k] = i - 1
- k += 1
- a2.append(c)
- aindex[k] = i
- k += 1
- matched_2 = matched_1 = True
- else:
- matched_2 = matched_1
- matched_1 = False
- c_2 = c_1
- c_1 = c
- # We only use the optimised result if it's worthwhile. The constant
- # represents a heuristic of how many lines constitute 'worthwhile'.
- self.lines_discarded = m - j > 10 or n - k > 10
- if self.lines_discarded:
- a = a2
- b = b2
+ if len(a) <= 2 and len(b) <= 2:
+ self.aindex = []
+ self.bindex = []
+ return (a, b)
+
+ def index_matching_kmers(a, b):
+ aset = set([a[i:i+3] for i in range(len(a) - 2)])
+ matches, index = [], []
+ next_poss_match = 0
+ # Start from where we can get a valid triple
+ for i in range(2, len(b)):
+ if b[i - 2:i + 1] not in aset:
+ continue
+ # Make sure we don't re-record matches from overlapping kmers
+ for j in range(max(next_poss_match, i - 2), i + 1):
+ matches.append(b[j])
+ index.append(j)
+ next_poss_match = i + 1
+ return matches, index
+
+ indexed_b, self.bindex = index_matching_kmers(a, b)
+ indexed_a, self.aindex = index_matching_kmers(b, a)
+
+ # We only use the optimised result if it's worthwhile. The constant
+ # represents a heuristic of how many lines constitute 'worthwhile'.
+ self.lines_discarded = len(b) - len(indexed_b) > 10 or \
+ len(a) - len(indexed_a) > 10
+ if self.lines_discarded:
+ a = indexed_a
+ b = indexed_b
return (a, b)
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]