[kupfer] relevance: rewrite formatCommonSubstrings

From: Ulrik Sverdrup <usverdrup src gnome org>
To: svn-commits-list gnome org
Cc:
Subject: [kupfer] relevance: rewrite formatCommonSubstrings
Date: Sun, 13 Sep 2009 01:11:33 +0000 (UTC)
commit 1988d322ece5acc4c54570edb5ed7477dbabeef3
Author: Ulrik Sverdrup <ulrik sverdrup gmail com>
Date:   Sun Sep 13 00:22:28 2009 +0200

    relevance: rewrite formatCommonSubstrings

 kupfer/relevance.py |  101 ++++++++++++++++++++++++---------------------------
 1 files changed, 48 insertions(+), 53 deletions(-)
---
diff --git a/kupfer/relevance.py b/kupfer/relevance.py
index 36494a6..3ec01e1 100644
--- a/kupfer/relevance.py
+++ b/kupfer/relevance.py
@@ -24,64 +24,59 @@ from __future__ import division
 
 """
 This module provides relevance matching and formatting of related strings
-based on the relevance.  It is borrowed from Gnome-Do with modifications
-to fit nicely into python.
-
->>> import relevance
->>> print relevance.score('hi there dude', 'hi dude')
-0.745628698225
->>> relevance.formatCommonSubstrings('hi there dude', 'hi dude')
-'<b>hi </b>there <b>dude</b>'
+based on the relevance.  It originates in Gnome-Do.
+
+Module updated by Ulrik to use python unicode functions, proper python
+idoms and looping, replacing costly operations in python with better ones,
+all to clean up and dramatically speed up the code.
 """
 
-def formatCommonSubstrings(main, other, format = '<b>%s</b>'):
+def formatCommonSubstrings(s, query, format_clean=None, format_match=None):
     """
-    Creates a new string using @format to highlight matching substrings
-    of @other in @main.
-    
-    Returns: a formatted str
-    
-    >>> formatCommonSubstrings('hi there dude', 'hi dude')
-    '<b>hi </b>there <b>dude</b>'
+    Creates a new string highlighting matching substrings.
+
+    Returns: a formatted string
+
+    >>> formatCommonSubstrings('hi there dude', 'hidude',
+    ...                        format_match=lambda m: "<b>%s</b>" % m)
+    '<b>hi</b> there <b>dude</b>'
+
+    >>> formatCommonSubstrings('parallelism', 'lsm', format_match=str.upper)
+    'paralleLiSM'
     """
-    length = 0
-    result = ''
-    match_pos = last_main_cut = 0
-    lower_main = main.lower()
-    other = other.lower()
-    
-    for pos in range(len(other)):
-        matchedTermination = False
-        for length in range(1, 1 + len(other) - pos + 1):
-            tmp_match_pos  = lower_main.find(other[pos:pos + length])
-            if tmp_match_pos < 0:
-                length -= 1
-                matchedTermination = False
-                break
-            else:
-                matchedTermination = True
-                match_pos = tmp_match_pos
-        if matchedTermination:
-            length -= 1
-        if 0 < length:
-            # There is a match starting at match_pos with positive length
-            skipped = main[last_main_cut:match_pos - last_main_cut]
-            matched = main[match_pos:match_pos + length]
-            if len(skipped) + len(matched) < len(main):
-                remainder = formatCommonSubstrings(
-                    main[match_pos + length:],
-                    other[pos + length:],
-                    format)
-            else:
-                remainder = ''
-            result = '%s%s%s' % (skipped, format % matched, remainder)
+    format_clean = format_clean or (lambda x: x)
+    format_match = format_match or (lambda x: x)
+
+    if not query:
+        return format_clean(s)
+
+    ls = s.lower()
+
+    # find overall range of match
+    first, last = _findBestMatch(ls, query)
+
+    if first == -1:
+        return format_clean(s)
+
+    # find longest perfect match, put in slc
+    for slc in xrange(len(query), 0, -1):
+        if query[:slc] == ls[first:first+slc]:
             break
-    
-    if result == '':
-        # No matches
-        result = main
-    
-    return result
+    key, nextkey = query[:slc], query[slc:]
+
+    head = s[:first]
+    match = s[first: first+slc]
+    matchtail = s[first+slc: last]
+    tail = s[last:]
+
+    # we use s[0:0], which is "" or u""
+    return s[0:0].join((
+            format_clean(head),
+            format_match(match),
+            formatCommonSubstrings(matchtail, nextkey,
+                                   format_clean, format_match),
+            format_clean(tail),
+            ))
 
 def score(s, query):
     """
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]