[meld] Simplified version of the k-mer version of the Myers matcher



commit 81cc819bca57e442a3a219e42f98fd0a58a74af4
Author: Kai Willadsen <kai willadsen gmail com>
Date:   Mon May 7 09:50:38 2012 +1000

    Simplified version of the k-mer version of the Myers matcher

 meld/matchers.py |   97 +++++++++++++++++-------------------------------------
 1 files changed, 30 insertions(+), 67 deletions(-)
---
diff --git a/meld/matchers.py b/meld/matchers.py
index f5ed1bd..fc40f3e 100644
--- a/meld/matchers.py
+++ b/meld/matchers.py
@@ -292,72 +292,35 @@ class MyersSequenceMatcher(difflib.SequenceMatcher):
 class InlineMyersSequenceMatcher(MyersSequenceMatcher):
     
     def preprocess_discard_nonmatching_lines(self, a, b):
-        aindex = self.aindex = {}
-        bindex = self.bindex = {}
-        n = len(a)
-        m = len(b)
-        if m > 2 and n > 2:
-            a2 = []
-            b2 = []
-            aset = set()
-            bset = set()
-            for i in range(n - 2):
-                aset.add((a[i], a[i+1], a[i+2]))
-            for i in range(m - 2):
-                bset.add((b[i], b[i+1], b[i+2]))
-            j = 0
-            c_2 = None
-            c_1 = None
-            matched_2 = False
-            matched_1 = False
-            for i, c in enumerate(b):
-                if (c_2, c_1, c) in aset:
-                    if not matched_2:
-                        b2.append(c_2)
-                        bindex[j] = i - 2
-                        j += 1
-                    if not matched_1:
-                        b2.append(c_1)
-                        bindex[j] = i - 1
-                        j += 1
-                    b2.append(c)
-                    bindex[j] = i
-                    j += 1
-                    matched_2 = matched_1 = True
-                else:
-                    matched_2 = matched_1
-                    matched_1 = False
-                c_2 = c_1
-                c_1 = c
 
-            k = 0
-            c_2 = None
-            c_1 = None
-            matched_2 = False
-            matched_1 = False
-            for i, c in enumerate(a):
-                if (c_2, c_1, c) in bset:
-                    if not matched_2:
-                        a2.append(c_2)
-                        aindex[k] = i - 2
-                        k += 1
-                    if not matched_1:
-                        a2.append(c_1)
-                        aindex[k] = i - 1
-                        k += 1
-                    a2.append(c)
-                    aindex[k] = i
-                    k += 1
-                    matched_2 = matched_1 = True
-                else:
-                    matched_2 = matched_1
-                    matched_1 = False
-                c_2 = c_1
-                c_1 = c
-            # We only use the optimised result if it's worthwhile. The constant
-            # represents a heuristic of how many lines constitute 'worthwhile'.
-            self.lines_discarded = m - j > 10 or n - k > 10
-            if self.lines_discarded:
-                a = a2
-                b = b2
+        if len(a) <= 2 and len(b) <= 2:
+            self.aindex = []
+            self.bindex = []
+            return (a, b)
+
+        def index_matching_kmers(a, b):
+            aset = set([a[i:i+3] for i in range(len(a) - 2)])
+            matches, index = [], []
+            next_poss_match = 0
+            # Start from where we can get a valid triple
+            for i in range(2, len(b)):
+                if b[i - 2:i + 1] not in aset:
+                    continue
+                # Make sure we don't re-record matches from overlapping kmers
+                for j in range(max(next_poss_match, i - 2), i + 1):
+                    matches.append(b[j])
+                    index.append(j)
+                next_poss_match = i + 1
+            return matches, index
+
+        indexed_b, self.bindex = index_matching_kmers(a, b)
+        indexed_a, self.aindex = index_matching_kmers(b, a)
+
+        # We only use the optimised result if it's worthwhile. The constant
+        # represents a heuristic of how many lines constitute 'worthwhile'.
+        self.lines_discarded = len(b) - len(indexed_b) > 10 or \
+                               len(a) - len(indexed_a) > 10
+        if self.lines_discarded:
+            a = indexed_a
+            b = indexed_b
         return (a, b)



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]