fantasdic r380 - in trunk: . lib lib/fantasdic lib/fantasdic/sources lib/fantasdic/text



Author: mblondel
Date: Tue Jan  6 17:13:11 2009
New Revision: 380
URL: http://svn.gnome.org/viewvc/fantasdic?rev=380&view=rev

Log:
    * lib/fantasdic/text/porter_stemming.rb: Support for stemming (e.g.
    stemming => stem).
    * lib/fantasdic/text/metaphone.rb: Algorithm to compare the
    pronunciation of two strings.
    * lib/fantasdic/text/double_metaphone.rb: Same.
    * lib/fantasdic/text/soundex.rb: Same.
    * lib/fantasdic/text/levenshtein.rb: Levenshtein distance.

    All those files are imported from http://rubyforge.org/projects/text.
    License is either public domain or Ruby license (GPL compatible)

    * lib/fantasdic/sources/dictd_file.rb: Support for the above methods + 
    Regexp.
    * lib/fantasdic/sources/stardict_file.rb: Same.

    * lib/fantasdic/file_source.rb: Some factoring.

    * lib/fantasdic.rb: Necessary "require"s.



Added:
   trunk/lib/fantasdic/text/
   trunk/lib/fantasdic/text/double_metaphone.rb
   trunk/lib/fantasdic/text/levenshtein.rb
   trunk/lib/fantasdic/text/metaphone.rb
   trunk/lib/fantasdic/text/porter_stemming.rb
   trunk/lib/fantasdic/text/soundex.rb
Modified:
   trunk/ChangeLog
   trunk/lib/fantasdic.rb
   trunk/lib/fantasdic/file_source.rb
   trunk/lib/fantasdic/sources/dictd_file.rb
   trunk/lib/fantasdic/sources/stardict_file.rb

Modified: trunk/lib/fantasdic.rb
==============================================================================
--- trunk/lib/fantasdic.rb	(original)
+++ trunk/lib/fantasdic.rb	Tue Jan  6 17:13:11 2009
@@ -134,13 +134,22 @@
 
 require 'fantasdic/config'
 require 'fantasdic/version'
-require 'fantasdic/preferences'
+
 require 'fantasdic/net/sockssocket'
 require 'fantasdic/net/dict'
-require 'fantasdic/utils'
-require 'fantasdic/command_line'
+
+require 'fantasdic/text/porter_stemming'
+require 'fantasdic/text/levenshtein'
+require 'fantasdic/text/soundex'
+require 'fantasdic/text/metaphone'
+require 'fantasdic/text/double_metaphone'
+
 require 'fantasdic/ui'
+
+require 'fantasdic/preferences'
+require 'fantasdic/command_line'
+require 'fantasdic/utils'
 require 'fantasdic/binary_search'
 require 'fantasdic/dictzip'
 require 'fantasdic/source_base'
-require 'fantasdic/file_source'
\ No newline at end of file
+require 'fantasdic/file_source'

Modified: trunk/lib/fantasdic/file_source.rb
==============================================================================
--- trunk/lib/fantasdic/file_source.rb	(original)
+++ trunk/lib/fantasdic/file_source.rb	Tue Jan  6 17:13:11 2009
@@ -18,6 +18,108 @@
 module Fantasdic
 module Source
 
+class DictionaryIndex < File
+    include FileBinarySearch
+
+    MAX_LEV_DISTANCE = 2
+
+    def match_exact(word)
+        match_binary_search(word) do |s1, s2|
+            s1 <=> s2
+        end
+    end
+
+    def match_prefix(word)
+        match_binary_search(word) do |s1, s2|
+            if s1 =~ /^#{s2}/
+                0
+            else
+                s1 <=> s2
+            end
+        end
+    end
+
+    def match_suffix(word)
+        get_word_list.find_all do |curr_word, offset, len|
+            curr_word =~ /#{word}$/
+        end
+    end
+
+    def match_substring(word)
+        get_word_list.find_all do |curr_word, offset, len|
+            curr_word.include?(word)
+        end
+    end
+
+    def match_word(word)
+        match_substring(word).find_all do |curr_word, offset, len|
+            ret = false
+            curr_word.split(" ").each do |single_word|
+                if single_word == word
+                    ret = true
+                    break
+                end
+            end
+            ret
+        end         
+    end
+
+    def match_stem(word)
+        match_prefix(word.stem)
+    end
+
+    def match_lev(word)
+        get_word_list.find_all do |curr_word, offset, len|
+            word.levenshtein(curr_word) < MAX_LEV_DISTANCE
+        end        
+    end
+
+    def match_soundex(word)
+        soundex = word.soundex
+        get_word_list.find_all do |curr_word, offset, len|
+            soundex == curr_word.soundex
+        end   
+    end
+
+    def match_metaphone(word)
+        metaphone = word.metaphone
+        get_word_list.find_all do |curr_word, offset, len|
+            metaphone == curr_word.metaphone
+        end   
+    end
+
+    def match_metaphone2(word)
+        def is_equal?(pair1, pair2)
+            pair1.each do |snd1|
+                next if not snd1
+                pair2.each do |snd2|
+                    next if not snd2
+                    return true if snd1 == snd2
+                end
+            end
+            return false
+        end
+
+        pair1 = word.double_metaphone
+        get_word_list.find_all do |curr_word, offset, len|
+            is_equal?(pair1, curr_word.double_metaphone)
+        end   
+    end
+
+    def match_regexp(regexp)
+        begin
+            r = Regexp.new(regexp)
+        rescue RegexpError
+            []
+        else
+            get_word_list.find_all do |curr_word, offset, len|
+                curr_word =~ r
+            end             
+        end
+    end
+
+end
+
 class FileSource < Base
 
     class ConfigWidget < Base::ConfigWidget

Modified: trunk/lib/fantasdic/sources/dictd_file.rb
==============================================================================
--- trunk/lib/fantasdic/sources/dictd_file.rb	(original)
+++ trunk/lib/fantasdic/sources/dictd_file.rb	Tue Jan  6 17:13:11 2009
@@ -20,8 +20,7 @@
 module Fantasdic
 module Source
 
-class DictdIndex < File
-    include FileBinarySearch
+class DictdIndex < DictionaryIndex
 
     B64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".
           split(//)
@@ -113,22 +112,6 @@
         end        
     end
 
-    def match_exact(word)
-        match_binary_search(word) do |s1, s2|
-            s1 <=> s2
-        end
-    end
-
-    def match_prefix(word)
-        match_binary_search(word) do |s1, s2|
-            if s1 =~ /^#{s2}/
-                0
-            else
-                s1 <=> s2
-            end
-        end
-    end
-
     def match_suffix(word)
         word = Regexp.escape(word)
         self.grep(/#{word}\t/).map do |line|
@@ -147,22 +130,6 @@
         end        
     end
 
-    def match_word(word)
-        word = Regexp.escape(word)
-        self.grep(/#{word}/).map do |line|
-            DictdIndex.get_fields(line)
-        end.find_all do |curr_word, offset, len|
-            ret = false
-            curr_word.split(" ").each do |single_word|
-                if single_word == word
-                    ret = true
-                    break
-                end
-            end
-            ret
-        end         
-    end
-
     def get_word_list
         self.rewind
         self.lines.map { |line| DictdIndex.get_fields(line) }
@@ -182,9 +149,19 @@
     STRATEGIES_DESC = {
         "define" => "Results match with the word exactly.",
         "prefix" => "Results match with the beginning of the word.",
-        "word" => "Results have one word that match with the word.",
+        "word" => "Results have one word that matches with the word.",
         "substring" => "Results have a portion that contains the word.",
-        "suffix" => "Results match with the end of the word."
+        "suffix" => "Results match with the end of the word.",
+        "stem" => "Results share the same root as the word.",
+        "lev" => "Results are close to the word according to the " + \
+                 "levenshtein distance.",
+        "soundex" => "Results have similar pronunciation according " + \
+                     "to the soundex algorithm.",
+        "metaphone" => "Results have similar pronunciation according " + \
+                       "to the metaphone algorithm.",
+        "metaphone2" => "Results have similar pronunciation according " + \
+                       "to the double metaphone algorithm.",
+        "regexp" => "Results match the regular expression."
     }
 
     class ConfigWidget < FileSource::ConfigWidget
@@ -252,14 +229,12 @@
     end
 
     def match(db, strat, word)
-        matches = []
-
-        dictd_file_open do |index_file, dict_file|
-            matches = case strat
-                when "prefix", "suffix", "substring", "word"
-                    index_file.send("match_#{strat}", word)
-                else
-                    []
+        matches = dictd_file_open do |index_file, dict_file|
+            meth = "match_#{strat}"
+            if index_file.respond_to? meth
+                index_file.send(meth, word)
+            else
+                []
             end.map do |match, offset, len|
                 match
             end

Modified: trunk/lib/fantasdic/sources/stardict_file.rb
==============================================================================
--- trunk/lib/fantasdic/sources/stardict_file.rb	(original)
+++ trunk/lib/fantasdic/sources/stardict_file.rb	Tue Jan  6 17:13:11 2009
@@ -56,7 +56,7 @@
 
 end
 
-class StardictIndex < File
+class StardictIndex < DictionaryIndex
 
     OFFSET_INT_SIZE = 4
     LEN_INT_SIZE = 4
@@ -106,47 +106,6 @@
         found_offsets.map { |offset| self.get_fields(offset) }
     end
 
-    def match_exact(word)
-        match_binary_search(word) do |s1, s2|
-            s1 <=> s2
-        end
-    end
-
-    def match_prefix(word)
-        match_binary_search(word) do |s1, s2|
-            if s1 =~ /^#{s2}/
-                0
-            else
-                s1 <=> s2
-            end
-        end
-    end
-
-    def match_suffix(word)
-        get_word_list.find_all do |curr_word, offset, len|
-            curr_word =~ /#{word}$/
-        end
-    end
-
-    def match_substring(word)
-        get_word_list.find_all do |curr_word, offset, len|
-            curr_word.include?(word)
-        end
-    end
-
-    def match_word(word)
-        match_substring(word).find_all do |curr_word, offset, len|
-            ret = false
-            curr_word.split(" ").each do |single_word|
-                if single_word == word
-                    ret = true
-                    break
-                end
-            end
-            ret
-        end         
-    end
-
     # Returns the offsets of the beginning of each entry in the index
     def get_index_offsets
         self.rewind
@@ -199,9 +158,19 @@
     STRATEGIES_DESC = {
         "define" => "Results match with the word exactly.",
         "prefix" => "Results match with the beginning of the word.",
-        "word" => "Results have one word that match with the word.",
+        "word" => "Results have one word that matches with the word.",
         "substring" => "Results have a portion that contains the word.",
-        "suffix" => "Results match with the end of the word."
+        "suffix" => "Results match with the end of the word.",
+        "stem" => "Results share the same root as the word.",
+        "lev" => "Results are close to the word according to the " + \
+                 "levenshtein distance.",
+        "soundex" => "Results have similar pronunciation according " + \
+                     "to the soundex algorithm.",
+        "metaphone" => "Results have similar pronunciation according " + \
+                       "to the metaphone algorithm.",
+        "metaphone2" => "Results have similar pronunciation according " + \
+                       "to the double metaphone algorithm.",
+        "regexp" => "Results match the regular expression."
     }
 
     class ConfigWidget < FileSource::ConfigWidget
@@ -254,14 +223,12 @@
     end
 
     def match(db, strat, word)
-        matches = []
-
-        stardict_file_open do |index_file, dict_file, file_info|
-            matches = case strat
-                when "prefix", "suffix", "substring", "word"
-                    index_file.send("match_#{strat}", word)
-                else
-                    []
+        matches = stardict_file_open do |index_file, dict_file, file_info|
+            meth = "match_#{strat}"
+            if index_file.respond_to? meth
+                index_file.send(meth, word)
+            else
+                []
             end.map do |match, offset, len|
                 match
             end

Added: trunk/lib/fantasdic/text/double_metaphone.rb
==============================================================================
--- (empty file)
+++ trunk/lib/fantasdic/text/double_metaphone.rb	Tue Jan  6 17:13:11 2009
@@ -0,0 +1,360 @@
+#
+# Ruby implementation of the Double Metaphone algorithm by Lawrence Philips,
+# originally published in the June 2000 issue of C/C++ Users Journal.
+#
+# Based on Stephen Woodbridge's PHP version - http://swoodbridge.com/DoubleMetaPhone/
+#
+# Author: Tim Fletcher (twoggle gmail com)
+#
+
+module Text # :nodoc:
+module Metaphone
+
+  # Returns the primary and secondary double metaphone tokens
+  # (the secondary will be nil if equal to the primary).
+  def double_metaphone(str)
+    primary, secondary, current = [], [], 0
+    original, length, last = "#{str}     ".upcase, str.length, str.length - 1
+    if /^GN|KN|PN|WR|PS$/ =~ original[0, 2]
+      current += 1
+    end
+    if 'X' == original[0, 1]
+      primary << :S
+      secondary << :S
+      current += 1
+    end
+    while primary.length < 4 || secondary.length < 4
+      break if current > str.length
+      a, b, c = double_metaphone_lookup(original, current, length, last)
+      primary << a if a
+      secondary << b if b
+      current += c if c
+    end
+    primary, secondary = primary.to_s[0, 4], secondary.to_s[0, 4]
+    return primary, (primary == secondary ? nil : secondary)
+  end
+
+
+  private
+
+  def slavo_germanic?(str)
+    /W|K|CZ|WITZ/ =~ str
+  end
+
+  def vowel?(str)
+    /^A|E|I|O|U|Y$/ =~ str
+  end
+
+  def double_metaphone_lookup(str, pos, length, last)
+    case str[pos, 1]
+      when /^A|E|I|O|U|Y$/
+        if 0 == pos
+          return :A, :A, 1
+        else
+          return nil, nil, 1
+        end
+      when 'B'
+        return :P, :P, ('B' == str[pos + 1, 1] ? 2 : 1)
+      when 'Ç' 
+        return :S, :S, 1
+      when 'C'
+        if pos > 1 &&
+          !vowel?(str[pos - 2, 1]) &&
+          'ACH' == str[pos - 1, 3] &&
+          str[pos + 2, 1] != 'I' && (
+            str[pos + 2, 1] != 'E' ||
+            str[pos - 2, 6] =~ /^(B|M)ACHER$/
+          ) then
+          return :K, :K, 2
+        elsif 0 == pos && 'CAESAR' == str[pos, 6]
+          return :S, :S, 2
+        elsif 'CHIA' == str[pos, 4]
+          return :K, :K, 2
+        elsif 'CH' == str[pos, 2]
+          if pos > 0 && 'CHAE' == str[pos, 4]
+            return :K, :X, 2
+          elsif 0 == pos && (
+              ['HARAC', 'HARIS'].include?(str[pos + 1, 5]) ||
+              ['HOR', 'HYM', 'HIA', 'HEM'].include?(str[pos + 1, 3])
+            ) && str[0, 5] != 'CHORE' then
+            return :K, :K, 2
+          elsif ['VAN ','VON '].include?(str[0, 4]) ||
+                'SCH' == str[0, 3] ||
+                ['ORCHES','ARCHIT','ORCHID'].include?(str[pos - 2, 6]) ||
+                ['T','S'].include?(str[pos + 2, 1]) || (
+                  ((0 == pos) || ['A','O','U','E'].include?(str[pos - 1, 1])) &&
+                  ['L','R','N','M','B','H','F','V','W',' '].include?(str[pos + 2, 1])
+                ) then
+            return :K, :K, 2
+          elsif pos > 0
+            return ('MC' == str[0, 2] ? 'K' : 'X'), 'K', 2
+          else
+            return :X, :X, 2
+          end
+        elsif 'CZ' == str[pos, 2] && 'WICZ' != str[pos - 2, 4]
+          return :S, :X, 2
+        elsif 'CIA' == str[pos + 1, 3]
+          return :X, :X, 3
+        elsif 'CC' == str[pos, 2] && !(1 == pos && 'M' == str[0, 1])
+          if /^I|E|H$/ =~ str[pos + 2, 1] && 'HU' != str[pos + 2, 2]
+            if (1 == pos && 'A' == str[pos - 1, 1]) ||
+              /^UCCE(E|S)$/ =~ str[pos - 1, 5] then
+              return :KS, :KS, 3
+            else
+              return :X, :X, 3
+            end
+          else
+            return :K, :K, 2
+          end
+        elsif /^C(K|G|Q)$/ =~ str[pos, 2]
+          return :K, :K, 2
+        elsif /^C(I|E|Y)$/ =~ str[pos, 2]
+          return :S, (/^CI(O|E|A)$/ =~ str[pos, 3] ? :X : :S), 2
+        else
+          if /^ (C|Q|G)$/ =~ str[pos + 1, 2]
+            return :K, :K, 3
+          else     
+            return :K, :K, (/^C|K|Q$/ =~ str[pos + 1, 1] && !(['CE','CI'].include?(str[pos + 1, 2])) ? 2 : 1)
+          end
+        end
+      when 'D'
+        if 'DG' == str[pos, 2]
+          if /^I|E|Y$/ =~ str[pos + 2, 1]
+            return :J, :J, 3
+          else
+            return :TK, :TK, 2
+          end
+        else
+          return :T, :T, (/^D(T|D)$/ =~ str[pos, 2] ? 2 : 1)
+        end
+      when 'F'
+        return :F, :F, ('F' == str[pos + 1, 1] ? 2 : 1)
+      when 'G'
+        if 'H' == str[pos + 1, 1]
+          if pos > 0 && !vowel?(str[pos - 1, 1])
+            return :K, :K, 2
+          elsif 0 == pos
+            if 'I' == str[pos + 2, 1]
+              return :J, :J, 2
+            else
+              return :K, :K, 2
+            end
+          elsif (pos > 1 && /^B|H|D$/ =~ str[pos - 2, 1]) ||
+                (pos > 2 && /^B|H|D$/ =~ str[pos - 3, 1]) ||
+                (pos > 3 && /^B|H$/   =~ str[pos - 4, 1])
+            return nil, nil, 2
+          else
+            if (pos > 2 && 'U' == str[pos - 1, 1] && /^C|G|L|R|T$/ =~ str[pos - 3, 1])
+              return :F, :F, 2
+            elsif pos > 0 && 'I' != str[pos - 1, 1]
+              return :K, :K, 2
+            else
+              return nil, nil, 2
+            end
+          end
+        elsif 'N' == str[pos + 1, 1]
+          if 1 == pos && vowel?(str[0, 1]) && !slavo_germanic?(str)
+            return :KN, :N, 2
+          else
+            if 'EY' != str[pos + 2, 2] && 'Y' != str[pos + 1, 1] && !slavo_germanic?(str)
+              return :N, :KN, 2
+            else
+              return :KN, :KN, 2
+            end
+          end
+        elsif 'LI' == str[pos + 1, 2] && !slavo_germanic?(str)
+          return :KL, :L, 2
+        elsif 0 == pos && ('Y' == str[pos + 1, 1] || /^(E(S|P|B|L|Y|I|R)|I(B|L|N|E))$/ =~ str[pos + 1, 2])
+          return :K, :J, 2
+        elsif (('ER' == str[pos + 1, 2] || 'Y' == str[pos + 1, 1]) &&
+               /^(D|R|M)ANGER$/ !~ str[0, 6] &&
+               /^E|I$/ !~ str[pos - 1, 1] &&
+               /^(R|O)GY$/ !~ str[pos - 1, 3])
+          return :K, :J, 2
+        elsif /^E|I|Y$/ =~ str[pos + 1, 1] || /^(A|O)GGI$/ =~ str[pos - 1, 4]
+          if (/^V(A|O)N $/ =~ str[0, 4] || 'SCH' == str[0, 3]) || 'ET' == str[pos + 1, 2]
+            return :K, :K, 2
+          else
+            if 'IER ' == str[pos + 1, 4]
+              return :J, :J, 2
+            else
+              return :J, :K, 2
+            end
+          end
+        elsif 'G' == str[pos + 1, 1]
+          return :K, :K, 2
+        else
+          return :K, :K, 1
+        end
+      when 'H'
+        if (0 == pos || vowel?(str[pos - 1, 1])) && vowel?(str[pos + 1, 1])
+          return :H, :H, 2
+        else
+          return nil, nil, 1
+        end
+      when 'J'
+        if 'JOSE' == str[pos, 4] || 'SAN ' == str[0, 4]
+          if (0 == pos && ' ' == str[pos + 4, 1]) || 'SAN ' == str[0, 4]
+            return :H, :H, 1
+          else
+            return :J, :H, 1
+          end
+        else
+          current = ('J' == str[pos + 1, 1] ? 2 : 1)
+          
+          if 0 == pos && 'JOSE' != str[pos, 4]
+            return :J, :A, current
+          else
+            if vowel?(str[pos - 1, 1]) && !slavo_germanic?(str) && /^A|O$/ =~ str[pos + 1, 1]
+              return :J, :H, current
+            else
+              if last == pos
+                return :J, nil, current
+              else
+                if /^L|T|K|S|N|M|B|Z$/ !~ str[pos + 1, 1] && /^S|K|L$/ !~ str[pos - 1, 1]
+                  return :J, :J, current
+                else
+                  return nil, nil, current
+                end
+              end
+            end
+          end
+        end
+      when 'K'
+        return :K, :K, ('K' == str[pos + 1, 1] ? 2 : 1)
+      when 'L'
+        if 'L' == str[pos + 1, 1]
+          if (((length - 3) == pos && /^(ILL(O|A)|ALLE)$/ =~ str[pos - 1, 4]) ||
+              ((/^(A|O)S$/ =~ str[last - 1, 2] || /^A|O$/ =~ str[last, 1]) && 'ALLE' == str[pos - 1, 4]))
+            return :L, nil, 2
+          else
+            return :L, :L, 2
+          end
+        else
+          return :L, :L, 1
+        end
+      when 'M'
+        if ('UMB' == str[pos - 1, 3] &&
+            ((last - 1) == pos || 'ER' == str[pos + 2, 2])) || 'M' == str[pos + 1, 1]
+          return :M, :M, 2
+        else
+          return :M, :M, 1
+        end
+      when 'N'
+        return :N, :N, ('N' == str[pos + 1, 1] ? 2 : 1)
+      when 'Ñ' 
+        return :N, :N, 1
+      when 'P'
+        if 'H' == str[pos + 1, 1]
+          return :F, :F, 2
+        else
+          return :P, :P, (/^P|B$/ =~ str[pos + 1, 1] ? 2 : 1)
+        end
+      when 'Q'
+        return :K, :K, ('Q' == str[pos + 1, 1] ? 2 : 1)
+      when 'R'
+        current = ('R' == str[pos + 1, 1] ? 2 : 1)
+        
+        if last == pos && !slavo_germanic?(str) && 'IE' == str[pos - 2, 2] && /^M(E|A)$/ !~ str[pos - 4, 2]
+          return nil, :R, current
+        else
+          return :R, :R, current
+        end
+      when 'S'
+        if /^(I|Y)SL$/ =~ str[pos - 1, 3]
+          return nil, nil, 1
+        elsif 0 == pos && 'SUGAR' == str[pos, 5]
+          return :X, :S, 1
+        elsif 'SH' == str[pos, 2]
+          if /^H(EIM|OEK|OLM|OLZ)$/ =~ str[pos + 1, 4]
+            return :S, :S, 2
+          else
+            return :X, :X, 2
+          end
+        elsif /^SI(O|A)$/ =~ str[pos, 3] || 'SIAN' == str[pos, 4]
+          return :S, (slavo_germanic?(str) ? :S : :X), 3
+        elsif (0 == pos && /^M|N|L|W$/ =~ str[pos + 1, 1]) || 'Z' == str[pos + 1, 1]
+          return :S, :X, ('Z' == str[pos + 1, 1] ? 2 : 1)
+        elsif 'SC' == str[pos, 2]
+          if 'H' == str[pos + 2, 1]
+            if /^OO|ER|EN|UY|ED|EM$/ =~ str[pos + 3, 2]
+              return (/^E(R|N)$/ =~ str[pos + 3, 2] ? :X : :SK), :SK, 3
+            else
+              return :X, ((0 == pos && !vowel?(str[3, 1]) && ('W' != str[pos + 3, 1])) ? :S : :X), 3
+            end
+          elsif /^I|E|Y$/ =~ str[pos + 2, 1]
+            return :S, :S, 3
+          else
+            return :SK, :SK, 3
+          end
+        else
+          return (last == pos && /^(A|O)I$/ =~ str[pos - 2, 2] ? nil : 'S'), 'S', (/^S|Z$/ =~ str[pos + 1, 1] ? 2 : 1)
+        end
+      when 'T'
+        if 'TION' == str[pos, 4]
+          return :X, :X, 3
+        elsif /^T(IA|CH)$/ =~ str[pos, 3]
+          return :X, :X, 3
+        elsif 'TH' == str[pos, 2] || 'TTH' == str[pos, 3]
+          if /^(O|A)M$/ =~ str[pos + 2, 2] || /^V(A|O)N $/ =~ str[0, 4] || 'SCH' == str[0, 3]
+            return :T, :T, 2
+          else
+            return 0, :T, 2
+          end
+        else
+          return :T, :T, (/^T|D$/ =~ str[pos + 1, 1] ? 2 : 1)
+        end
+      when 'V'
+        return :F, :F, ('V' == str[pos + 1, 1] ? 2 : 1)
+      when 'W'
+        if 'WR' == str[pos, 2]
+          return :R, :R, 2
+        end
+        pri, sec = nil, nil
+
+        if 0 == pos && (vowel?(str[pos + 1, 1]) || 'WH' == str[pos, 2])
+          pri = :A
+          sec = vowel?(str[pos + 1, 1]) ? :F : :A
+        end
+
+        if (last == pos && vowel?(str[pos - 1, 1])) || 'SCH' == str[0, 3] ||
+            /^EWSKI|EWSKY|OWSKI|OWSKY$/ =~ str[pos - 1, 5]
+          return pri, "#{sec}F".intern, 1
+        elsif /^WI(C|T)Z$/ =~ str[pos, 4]
+          return "#{pri}TS".intern, "#{sec}FX".intern, 4
+        else
+          return pri, sec, 1
+        end
+      when 'X'
+        current = (/^C|X$/ =~ str[pos + 1, 1] ? 2 : 1)
+
+        if !(last == pos && (/^(I|E)AU$/ =~ str[pos - 3, 3] || /^(A|O)U$/ =~ str[pos - 2, 2]))
+          return :KS, :KS, current
+        else
+          return nil, nil, current
+        end
+      when 'Z'
+        if 'H' == str[pos + 1, 1]
+          return :J, :J, 2
+        else
+          current = ('Z' == str[pos + 1, 1] ? 2 : 1)
+
+          if /^Z(O|I|A)$/ =~ str[pos + 1, 2] || (slavo_germanic?(str) && (pos > 0 && 'T' != str[pos - 1, 1]))
+            return :S, :TS, current
+          else
+            return :S, :S, current
+          end
+        end
+      else
+        return nil, nil, 1
+    end
+  end # def double_metaphone_lookup
+
+  extend self
+
+end # module Metaphone
+end # module Text
+
+class String
+  def double_metaphone; Text::Metaphone.double_metaphone(self); end
+end

Added: trunk/lib/fantasdic/text/levenshtein.rb
==============================================================================
--- (empty file)
+++ trunk/lib/fantasdic/text/levenshtein.rb	Tue Jan  6 17:13:11 2009
@@ -0,0 +1,69 @@
+#
+# Levenshtein distance algorithm implementation for Ruby, with UTF-8 support.
+#
+# The Levenshtein distance is a measure of how similar two strings s and t are,
+# calculated as the number of deletions/insertions/substitutions needed to
+# transform s into t. The greater the distance, the more the strings differ.
+#
+# The Levenshtein distance is also sometimes referred to as the
+# easier-to-pronounce-and-spell 'edit distance'.
+#
+# Author: Paul Battley (pbattley gmail com)
+#
+
+module Text # :nodoc:
+module Levenshtein
+
+  # Calculate the Levenshtein distance between two strings +str1+ and +str2+.
+  # +str1+ and +str2+ should be ASCII, UTF-8, or a one-byte-per character encoding such
+  # as ISO-8859-*.
+  #
+  # The strings will be treated as UTF-8 if $KCODE is set appropriately (i.e. 'u').
+  # Otherwise, the comparison will be performed byte-by-byte. There is no specific support 
+  # for Shift-JIS or EUC strings.
+  #
+  # When using Unicode text, be aware that this algorithm does not perform normalisation. 
+  # If there is a possibility of different normalised forms being used, normalisation
+  # should be performed beforehand.
+  #
+  def distance(str1, str2)
+    if $KCODE =~ /^U/i
+      unpack_rule = 'U*'
+    else
+      unpack_rule = 'C*'
+    end
+    s = str1.unpack(unpack_rule)
+    t = str2.unpack(unpack_rule)
+    n = s.length
+    m = t.length
+    return m if (0 == n)
+    return n if (0 == m)
+  
+    d = (0..m).to_a
+    x = nil
+
+    (0...n).each do |i|
+      e = i+1
+      (0...m).each do |j|
+        cost = (s[i] == t[j]) ? 0 : 1
+        x = [
+          d[j+1] + 1, # insertion
+          e + 1,      # deletion
+          d[j] + cost # substitution
+        ].min
+        d[j] = e
+        e = x
+      end
+      d[m] = x
+    end
+
+    return x
+  end
+
+  extend self
+end
+end
+
+class String
+  def levenshtein(str); Text::Levenshtein.distance(self, str); end
+end
\ No newline at end of file

Added: trunk/lib/fantasdic/text/metaphone.rb
==============================================================================
--- (empty file)
+++ trunk/lib/fantasdic/text/metaphone.rb	Tue Jan  6 17:13:11 2009
@@ -0,0 +1,101 @@
+# 
+# An implementation of the Metaphone phonetic coding system in Ruby.
+# 
+# Metaphone encodes names into a phonetic form such that similar-sounding names
+# have the same or similar Metaphone encodings.
+# 
+# The original system was described by Lawrence Philips in Computer Language
+# Vol. 7 No. 12, December 1990, pp 39-43.
+# 
+# As there are multiple implementations of Metaphone, each with their own
+# quirks, I have based this on my interpretation of the algorithm specification.
+# Even LP's original BASIC implementation appears to contain bugs (specifically
+# with the handling of CC and MB), when compared to his explanation of the
+# algorithm.
+# 
+# I have also compared this implementation with that found in PHP's standard
+# library, which appears to mimic the behaviour of LP's original BASIC
+# implementation. For compatibility, these rules can also be used by passing
+# :buggy=>true to the methods.
+# 
+# Author: Paul Battley (pbattley gmail com)
+#
+
+module Text # :nodoc:
+module Metaphone
+
+  module Rules # :nodoc:all
+    
+    # Metaphone rules.  These are simply applied in order.
+    #
+    STANDARD = [ 
+      # Regexp, replacement
+      [ /([bcdfhjklmnpqrstvwxyz])\1+/,
+                         '\1' ],  # Remove doubled consonants except g.
+                                  # [PHP] remove c from regexp.
+      [ /^ae/,            'E' ],
+      [ /^[gkp]n/,        'N' ],
+      [ /^wr/,            'R' ],
+      [ /^x/,             'S' ],
+      [ /^wh/,            'W' ],
+      [ /mb$/,            'M' ],  # [PHP] remove $ from regexp.
+      [ /(?!^)sch/,      'SK' ],
+      [ /th/,             '0' ],
+      [ /t?ch|sh/,        'X' ],
+      [ /c(?=ia)/,        'X' ],
+      [ /[st](?=i[ao])/,  'X' ],
+      [ /s?c(?=[iey])/,   'S' ],
+      [ /[cq]/,           'K' ],
+      [ /dg(?=[iey])/,    'J' ],
+      [ /d/,              'T' ],
+      [ /g(?=h[^aeiou])/, ''  ],
+      [ /gn(ed)?/,        'N' ],
+      [ /([^g]|^)g(?=[iey])/,
+                        '\1J' ],
+      [ /g+/,             'K' ],
+      [ /ph/,             'F' ],
+      [ /([aeiou])h(?=\b|[^aeiou])/,
+                         '\1' ],
+      [ /[wy](?![aeiou])/, '' ],
+      [ /z/,              'S' ],
+      [ /v/,              'F' ],
+      [ /(?!^)[aeiou]+/,  ''  ],
+    ]
+  
+    # The rules for the 'buggy' alternate implementation used by PHP etc.
+    #
+    BUGGY = STANDARD.dup
+    BUGGY[0] = [ /([bdfhjklmnpqrstvwxyz])\1+/, '\1' ]
+    BUGGY[6] = [ /mb/, 'M' ]
+  end
+
+  # Returns the Metaphone representation of a string. If the string contains
+  # multiple words, each word in turn is converted into its Metaphone
+  # representation. Note that only the letters A-Z are supported, so any
+  # language-specific processing should be done beforehand.
+  #
+  # If the :buggy option is set, alternate 'buggy' rules are used.
+  #
+  def metaphone(str, options={})
+    return str.strip.split(/\s+/).map { |w| metaphone_word(w, options) }.join(' ')
+  end
+  
+private
+
+  def metaphone_word(w, options={})
+    # Normalise case and remove non-ASCII
+    s = w.downcase.gsub(/[^a-z]/, '')
+    # Apply the Metaphone rules
+    rules = options[:buggy] ? Rules::BUGGY : Rules::STANDARD
+    rules.each { |rx, rep| s.gsub!(rx, rep) }
+    return s.upcase
+  end
+
+  extend self
+
+end
+end
+
+class String
+  def metaphone; Text::Metaphone.metaphone(self); end
+end
\ No newline at end of file

Added: trunk/lib/fantasdic/text/porter_stemming.rb
==============================================================================
--- (empty file)
+++ trunk/lib/fantasdic/text/porter_stemming.rb	Tue Jan  6 17:13:11 2009
@@ -0,0 +1,175 @@
+#
+# This is the Porter Stemming algorithm, ported to Ruby from the
+# version coded up in Perl.  It's easy to follow against the rules
+# in the original paper in:
+#
+#   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+#   no. 3, pp 130-137,
+#
+# Taken from http://www.tartarus.org/~martin/PorterStemmer (Public Domain)
+#
+module Text # :nodoc:
+module PorterStemming
+
+  STEP_2_LIST = {  # step 2: suffix => replacement
+    'ational' => 'ate', 'tional' => 'tion', 'enci' => 'ence', 'anci' => 'ance',
+    'izer' => 'ize', 'bli' => 'ble',
+    'alli' => 'al', 'entli' => 'ent', 'eli' => 'e', 'ousli' => 'ous',
+    'ization' => 'ize', 'ation' => 'ate',
+    'ator' => 'ate', 'alism' => 'al', 'iveness' => 'ive', 'fulness' => 'ful',
+    'ousness' => 'ous', 'aliti' => 'al',
+    'iviti' => 'ive', 'biliti' => 'ble', 'logi' => 'log'
+  }
+  
+  STEP_3_LIST = {  # step 3: suffix => replacement
+    'icate' => 'ic', 'ative' => '', 'alize' => 'al', 'iciti' => 'ic',
+    'ical' => 'ic', 'ful' => '', 'ness' => ''
+  }
+
+  SUFFIX_1_REGEXP = /(
+                    ational  |
+                    tional   |
+                    enci     |
+                    anci     |
+                    izer     |
+                    bli      |
+                    alli     |
+                    entli    |
+                    eli      |
+                    ousli    |
+                    ization  |
+                    ation    |
+                    ator     |
+                    alism    |
+                    iveness  |
+                    fulness  |
+                    ousness  |
+                    aliti    |
+                    iviti    |
+                    biliti   |
+                    logi)$/x  # the keys of STEP_2_LIST, anchored at the end of the word
+
+  SUFFIX_2_REGEXP = /(
+                      al       |
+                      ance     |
+                      ence     |
+                      er       |
+                      ic       | 
+                      able     |
+                      ible     |
+                      ant      |
+                      ement    |
+                      ment     |
+                      ent      |
+                      ou       |
+                      ism      |
+                      ate      |
+                      iti      |
+                      ous      |
+                      ive      |
+                      ize)$/x  # suffixes dropped outright in step 4
+
+  C = "[^aeiou]"             # consonant
+  V = "[aeiouy]"             # vowel
+  CC = "#{C}(?>[^aeiouy]*)"  # consonant sequence
+  VV = "#{V}(?>[aeiou]*)"    # vowel sequence
+
+  MGR0 = /^(#{CC})?#{VV}#{CC}/o                # [cc]vvcc... is m>0
+  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o       # [cc]vvcc[vv] is m=1
+  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o      # [cc]vvccvvcc... is m>1
+  VOWEL_IN_STEM   = /^(#{CC})?#{V}/o           # vowel in stem
+  
+  def self.stem(word)  # returns the stem as a new String; words shorter than 3 characters come back unchanged
+
+    # work on a copy converted to a String, so the caller's object is never mutated.
+    word = word.dup.to_str
+    
+    return word if word.length < 3
+    
+    # now map initial y to Y so that the patterns never treat it as a vowel
+    word[0] = 'Y' if word[0] == ?y
+    
+    # Step 1a: plurals -- "sses" -> "ss", "ies" -> "i", drop a final "s" not preceded by "s"
+    if word =~ /(ss|i)es$/
+      word = $` + $1
+    elsif word =~ /([^s])s$/ 
+      word = $` + $1
+    end
+
+    # Step 1b: "eed" -> "ee" if the stem has m>0; else strip "ed"/"ing" if the stem has a vowel
+    if word =~ /eed$/
+      word.chop! if $` =~ MGR0 
+    elsif word =~ /(ed|ing)$/
+      stem = $`
+      if stem =~ VOWEL_IN_STEM 
+        word = stem
+        case word
+          when /(at|bl|iz)$/             then word << "e"  # at -> ate, bl -> ble, iz -> ize
+          when /([^aeiouylsz])\1$/       then word.chop!   # undouble a final double letter other than a vowel, y, l, s or z
+          when /^#{CC}#{V}[^aeiouwxy]$/o then word << "e"  # whole stem is consonants + vowel + consonant (not w, x, y)
+        end
+      end
+    end
+
+    if word =~ /y$/  # Step 1c: final "y" -> "i" when the rest of the word contains a vowel
+      stem = $`
+      word = stem + "i" if stem =~ VOWEL_IN_STEM 
+    end
+
+    # Step 2: replace a STEP_2_LIST suffix (e.g. "ational" -> "ate") when the stem has m>0
+    if word =~ SUFFIX_1_REGEXP
+      stem = $`
+      suffix = $1
+      # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
+      if stem =~ MGR0
+        word = stem + STEP_2_LIST[suffix]
+      end
+    end
+
+    # Step 3: replace a STEP_3_LIST suffix (e.g. "icate" -> "ic") when the stem has m>0
+    if word =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
+      stem = $`
+      suffix = $1
+      if stem =~ MGR0
+        word = stem + STEP_3_LIST[suffix]
+      end
+    end
+
+    # Step 4: drop a SUFFIX_2_REGEXP suffix, or "ion" after "s"/"t", when what remains has m>1
+    if word =~ SUFFIX_2_REGEXP
+      stem = $`
+      if stem =~ MGR1
+        word = stem
+      end
+    elsif word =~ /(s|t)(ion)$/
+      stem = $` + $1
+      if stem =~ MGR1
+        word = stem
+      end
+    end
+
+    # Step 5: drop a final "e" when m>1, or when m=1 and the stem is not consonants + vowel + consonant
+    if word =~ /e$/ 
+      stem = $`
+      if (stem =~ MGR1) ||
+          (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
+        word = stem
+      end
+    end
+
+    if word =~ /ll$/ && word =~ MGR1  # final "ll" -> "l" when the word has m>1
+      word.chop!
+    end
+
+    # and turn initial Y back to y
+    word[0] = 'y' if word[0] == ?Y
+
+    word
+  end
+
+end
+end
+
+class String
+  def stem; Text::PorterStemming.stem(self); end  # returns a new String; the receiver is not modified
+end
\ No newline at end of file

Added: trunk/lib/fantasdic/text/soundex.rb
==============================================================================
--- (empty file)
+++ trunk/lib/fantasdic/text/soundex.rb	Tue Jan  6 17:13:11 2009
@@ -0,0 +1,66 @@
+#
+# Ruby implementation of the Soundex algorithm,
+# as described by Knuth in volume 3 of The Art of Computer Programming.
+#
+# Author: Michael Neumann (neumann s-direktnet de)
+#
+
+module Text # :nodoc:
+module Soundex
+
+  def soundex(str_or_arr)  # String -> code, Array -> array of codes, anything else -> nil
+    case str_or_arr
+    when String
+      soundex_str(str_or_arr)
+    when Array
+      str_or_arr.collect{|ele| soundex_str(ele)}
+    else
+      nil
+    end
+  end
+  module_function :soundex
+
+  private
+
+  # Returns the 4-character Soundex code of str (first character + digits, zero-padded),
+  # or nil for an empty string or when a character outside A-Z is met while scanning.
+  # The caller's string is not modified: "str" is rebound to an upcased copy.
+  # NOTE(review): the first character is not validated, nor is anything after the code is complete.
+  def soundex_str(str)
+    return nil if str.empty?
+
+    str = str.upcase
+    last_code = get_code(str[0,1])  # code of the first character, so an immediately repeated code is skipped
+    soundex_code = str[0,1]
+
+    for index in 1...(str.size) do
+      return soundex_code if soundex_code.size == 4
+
+      code = get_code(str[index,1])
+    
+      if code == "0" then  # A, E, I, O, U, Y, W or H: adds no digit, but lets the same code repeat afterwards
+        last_code = nil
+      elsif code == nil then  # get_code found no mapping for this character
+        return nil
+      elsif code != last_code then
+        soundex_code += code
+        last_code = code        
+      end 
+    end # for
+  
+    return soundex_code + "000"[0,4-soundex_code.size]  # pad with zeros up to four characters
+  end
+  module_function :soundex_str
+          
+  def get_code(char)
+    char.tr! "AEIOUYWHBPFVCSKGJQXZDTLMNR", "00000000111122222222334556"  # tr! returns nil when nothing was replaced
+  end
+  module_function :get_code
+
+end # module Soundex
+end # module Text
+
+
+class String
+  def soundex; Text::Soundex.soundex(self); end  # nil when no code can be computed (e.g. empty string)
+end
\ No newline at end of file



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]