fantasdic r380 - in trunk: . lib lib/fantasdic lib/fantasdic/sources lib/fantasdic/text
- From: mblondel svn gnome org
- To: svn-commits-list gnome org
- Subject: fantasdic r380 - in trunk: . lib lib/fantasdic lib/fantasdic/sources lib/fantasdic/text
- Date: Tue, 6 Jan 2009 17:13:11 +0000 (UTC)
Author: mblondel
Date: Tue Jan 6 17:13:11 2009
New Revision: 380
URL: http://svn.gnome.org/viewvc/fantasdic?rev=380&view=rev
Log:
* lib/fantasdic/text/porter_stemming.rb: Support for stemming (e.g.
stemming => stem).
* lib/fantasdic/text/metaphone.rb: Algorithm to compare the
pronunciation of two strings.
* lib/fantasdic/text/double_metaphone.rb: Same.
* lib/fantasdic/text/soundex.rb: Same.
* lib/fantasdic/text/levenshtein.rb: Levenshtein distance.
All of these files are imported from http://rubyforge.org/projects/text.
Their license is either public domain or the Ruby license (GPL-compatible).
* lib/fantasdic/sources/dictd_file.rb: Support for the above methods +
Regexp.
* lib/fantasdic/sources/stardict_file.rb: Same.
* lib/fantasdic/file_source.rb: Some factoring.
* lib/fantasdic.rb: Necessary "require"s.
Added:
trunk/lib/fantasdic/text/
trunk/lib/fantasdic/text/double_metaphone.rb
trunk/lib/fantasdic/text/levenshtein.rb
trunk/lib/fantasdic/text/metaphone.rb
trunk/lib/fantasdic/text/porter_stemming.rb
trunk/lib/fantasdic/text/soundex.rb
Modified:
trunk/ChangeLog
trunk/lib/fantasdic.rb
trunk/lib/fantasdic/file_source.rb
trunk/lib/fantasdic/sources/dictd_file.rb
trunk/lib/fantasdic/sources/stardict_file.rb
Modified: trunk/lib/fantasdic.rb
==============================================================================
--- trunk/lib/fantasdic.rb (original)
+++ trunk/lib/fantasdic.rb Tue Jan 6 17:13:11 2009
@@ -134,13 +134,22 @@
require 'fantasdic/config'
require 'fantasdic/version'
-require 'fantasdic/preferences'
+
require 'fantasdic/net/sockssocket'
require 'fantasdic/net/dict'
-require 'fantasdic/utils'
-require 'fantasdic/command_line'
+
+require 'fantasdic/text/porter_stemming'
+require 'fantasdic/text/levenshtein'
+require 'fantasdic/text/soundex'
+require 'fantasdic/text/metaphone'
+require 'fantasdic/text/double_metaphone'
+
require 'fantasdic/ui'
+
+require 'fantasdic/preferences'
+require 'fantasdic/command_line'
+require 'fantasdic/utils'
require 'fantasdic/binary_search'
require 'fantasdic/dictzip'
require 'fantasdic/source_base'
-require 'fantasdic/file_source'
\ No newline at end of file
+require 'fantasdic/file_source'
Modified: trunk/lib/fantasdic/file_source.rb
==============================================================================
--- trunk/lib/fantasdic/file_source.rb (original)
+++ trunk/lib/fantasdic/file_source.rb Tue Jan 6 17:13:11 2009
@@ -18,6 +18,108 @@
module Fantasdic
module Source
+class DictionaryIndex < File
+ include FileBinarySearch
+
+ MAX_LEV_DISTANCE = 2
+
+ def match_exact(word)
+ match_binary_search(word) do |s1, s2|
+ s1 <=> s2
+ end
+ end
+
+ def match_prefix(word)
+ match_binary_search(word) do |s1, s2|
+ if s1 =~ /^#{s2}/
+ 0
+ else
+ s1 <=> s2
+ end
+ end
+ end
+
+ def match_suffix(word)
+ get_word_list.find_all do |curr_word, offset, len|
+ curr_word =~ /#{word}$/
+ end
+ end
+
+ def match_substring(word)
+ get_word_list.find_all do |curr_word, offset, len|
+ curr_word.include?(word)
+ end
+ end
+
+ def match_word(word)
+ match_substring(word).find_all do |curr_word, offset, len|
+ ret = false
+ curr_word.split(" ").each do |single_word|
+ if single_word == word
+ ret = true
+ break
+ end
+ end
+ ret
+ end
+ end
+
+ def match_stem(word)
+ match_prefix(word.stem)
+ end
+
+ def match_lev(word)
+ get_word_list.find_all do |curr_word, offset, len|
+ word.levenshtein(curr_word) < MAX_LEV_DISTANCE
+ end
+ end
+
+ def match_soundex(word)
+ soundex = word.soundex
+ get_word_list.find_all do |curr_word, offset, len|
+ soundex == curr_word.soundex
+ end
+ end
+
+ def match_metaphone(word)
+ metaphone = word.metaphone
+ get_word_list.find_all do |curr_word, offset, len|
+ metaphone == curr_word.metaphone
+ end
+ end
+
+ def match_metaphone2(word)
+ def is_equal?(pair1, pair2)
+ pair1.each do |snd1|
+ next if not snd1
+ pair2.each do |snd2|
+ next if not snd2
+ return true if snd1 == snd2
+ end
+ end
+ return false
+ end
+
+ pair1 = word.double_metaphone
+ get_word_list.find_all do |curr_word, offset, len|
+ is_equal?(pair1, curr_word.double_metaphone)
+ end
+ end
+
+ def match_regexp(regexp)
+ begin
+ r = Regexp.new(regexp)
+ rescue RegexpError
+ []
+ else
+ get_word_list.find_all do |curr_word, offset, len|
+ curr_word =~ r
+ end
+ end
+ end
+
+end
+
class FileSource < Base
class ConfigWidget < Base::ConfigWidget
Modified: trunk/lib/fantasdic/sources/dictd_file.rb
==============================================================================
--- trunk/lib/fantasdic/sources/dictd_file.rb (original)
+++ trunk/lib/fantasdic/sources/dictd_file.rb Tue Jan 6 17:13:11 2009
@@ -20,8 +20,7 @@
module Fantasdic
module Source
-class DictdIndex < File
- include FileBinarySearch
+class DictdIndex < DictionaryIndex
B64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/".
split(//)
@@ -113,22 +112,6 @@
end
end
- def match_exact(word)
- match_binary_search(word) do |s1, s2|
- s1 <=> s2
- end
- end
-
- def match_prefix(word)
- match_binary_search(word) do |s1, s2|
- if s1 =~ /^#{s2}/
- 0
- else
- s1 <=> s2
- end
- end
- end
-
def match_suffix(word)
word = Regexp.escape(word)
self.grep(/#{word}\t/).map do |line|
@@ -147,22 +130,6 @@
end
end
- def match_word(word)
- word = Regexp.escape(word)
- self.grep(/#{word}/).map do |line|
- DictdIndex.get_fields(line)
- end.find_all do |curr_word, offset, len|
- ret = false
- curr_word.split(" ").each do |single_word|
- if single_word == word
- ret = true
- break
- end
- end
- ret
- end
- end
-
def get_word_list
self.rewind
self.lines.map { |line| DictdIndex.get_fields(line) }
@@ -182,9 +149,19 @@
STRATEGIES_DESC = {
"define" => "Results match with the word exactly.",
"prefix" => "Results match with the beginning of the word.",
- "word" => "Results have one word that match with the word.",
+ "word" => "Results have one word that matches with the word.",
"substring" => "Results have a portion that contains the word.",
- "suffix" => "Results match with the end of the word."
+ "suffix" => "Results match with the end of the word.",
+ "stem" => "Results share the same root as the word.",
+ "lev" => "Results are close to the word according to the " + \
+ "levenshtein distance.",
+ "soundex" => "Results have similar pronunciation according " + \
+ "to the soundex algorithm.",
+ "metaphone" => "Results have similar pronunciation according " + \
+ "to the metaphone algorithm.",
+ "metaphone2" => "Results have similar pronunciation according " + \
+ "to the double metaphone algorithm.",
+ "regexp" => "Results match the regular expression."
}
class ConfigWidget < FileSource::ConfigWidget
@@ -252,14 +229,12 @@
end
def match(db, strat, word)
- matches = []
-
- dictd_file_open do |index_file, dict_file|
- matches = case strat
- when "prefix", "suffix", "substring", "word"
- index_file.send("match_#{strat}", word)
- else
- []
+ matches = dictd_file_open do |index_file, dict_file|
+ meth = "match_#{strat}"
+ if index_file.respond_to? meth
+ index_file.send(meth, word)
+ else
+ []
end.map do |match, offset, len|
match
end
Modified: trunk/lib/fantasdic/sources/stardict_file.rb
==============================================================================
--- trunk/lib/fantasdic/sources/stardict_file.rb (original)
+++ trunk/lib/fantasdic/sources/stardict_file.rb Tue Jan 6 17:13:11 2009
@@ -56,7 +56,7 @@
end
-class StardictIndex < File
+class StardictIndex < DictionaryIndex
OFFSET_INT_SIZE = 4
LEN_INT_SIZE = 4
@@ -106,47 +106,6 @@
found_offsets.map { |offset| self.get_fields(offset) }
end
- def match_exact(word)
- match_binary_search(word) do |s1, s2|
- s1 <=> s2
- end
- end
-
- def match_prefix(word)
- match_binary_search(word) do |s1, s2|
- if s1 =~ /^#{s2}/
- 0
- else
- s1 <=> s2
- end
- end
- end
-
- def match_suffix(word)
- get_word_list.find_all do |curr_word, offset, len|
- curr_word =~ /#{word}$/
- end
- end
-
- def match_substring(word)
- get_word_list.find_all do |curr_word, offset, len|
- curr_word.include?(word)
- end
- end
-
- def match_word(word)
- match_substring(word).find_all do |curr_word, offset, len|
- ret = false
- curr_word.split(" ").each do |single_word|
- if single_word == word
- ret = true
- break
- end
- end
- ret
- end
- end
-
# Returns the offsets of the beginning of each entry in the index
def get_index_offsets
self.rewind
@@ -199,9 +158,19 @@
STRATEGIES_DESC = {
"define" => "Results match with the word exactly.",
"prefix" => "Results match with the beginning of the word.",
- "word" => "Results have one word that match with the word.",
+ "word" => "Results have one word that matches with the word.",
"substring" => "Results have a portion that contains the word.",
- "suffix" => "Results match with the end of the word."
+ "suffix" => "Results match with the end of the word.",
+ "stem" => "Results share the same root as the word.",
+ "lev" => "Results are close to the word according to the " + \
+ "levenshtein distance.",
+ "soundex" => "Results have similar pronunciation according " + \
+ "to the soundex algorithm.",
+ "metaphone" => "Results have similar pronunciation according " + \
+ "to the metaphone algorithm.",
+ "metaphone2" => "Results have similar pronunciation according " + \
+ "to the double metaphone algorithm.",
+ "regexp" => "Results match the regular expression."
}
class ConfigWidget < FileSource::ConfigWidget
@@ -254,14 +223,12 @@
end
def match(db, strat, word)
- matches = []
-
- stardict_file_open do |index_file, dict_file, file_info|
- matches = case strat
- when "prefix", "suffix", "substring", "word"
- index_file.send("match_#{strat}", word)
- else
- []
+ matches = stardict_file_open do |index_file, dict_file, file_info|
+ meth = "match_#{strat}"
+ if index_file.respond_to? meth
+ index_file.send(meth, word)
+ else
+ []
end.map do |match, offset, len|
match
end
Added: trunk/lib/fantasdic/text/double_metaphone.rb
==============================================================================
--- (empty file)
+++ trunk/lib/fantasdic/text/double_metaphone.rb Tue Jan 6 17:13:11 2009
@@ -0,0 +1,360 @@
+#
+# Ruby implementation of the Double Metaphone algorithm by Lawrence Philips,
+# originally published in the June 2000 issue of C/C++ Users Journal.
+#
+# Based on Stephen Woodbridge's PHP version - http://swoodbridge.com/DoubleMetaPhone/
+#
+# Author: Tim Fletcher (twoggle gmail com)
+#
+
+module Text # :nodoc:
+module Metaphone
+
+ # Returns the primary and secondary double metaphone tokens
+ # (the secondary will be nil if equal to the primary).
+ def double_metaphone(str)
+ primary, secondary, current = [], [], 0
+ original, length, last = "#{str} ".upcase, str.length, str.length - 1
+ if /^GN|KN|PN|WR|PS$/ =~ original[0, 2]
+ current += 1
+ end
+ if 'X' == original[0, 1]
+ primary << :S
+ secondary << :S
+ current += 1
+ end
+ while primary.length < 4 || secondary.length < 4
+ break if current > str.length
+ a, b, c = double_metaphone_lookup(original, current, length, last)
+ primary << a if a
+ secondary << b if b
+ current += c if c
+ end
+ primary, secondary = primary.to_s[0, 4], secondary.to_s[0, 4]
+ return primary, (primary == secondary ? nil : secondary)
+ end
+
+
+ private
+
+ def slavo_germanic?(str)
+ /W|K|CZ|WITZ/ =~ str
+ end
+
+ def vowel?(str)
+ /^A|E|I|O|U|Y$/ =~ str
+ end
+
+ def double_metaphone_lookup(str, pos, length, last)
+ case str[pos, 1]
+ when /^A|E|I|O|U|Y$/
+ if 0 == pos
+ return :A, :A, 1
+ else
+ return nil, nil, 1
+ end
+ when 'B'
+ return :P, :P, ('B' == str[pos + 1, 1] ? 2 : 1)
+ when 'Ç'
+ return :S, :S, 1
+ when 'C'
+ if pos > 1 &&
+ !vowel?(str[pos - 2, 1]) &&
+ 'ACH' == str[pos - 1, 3] &&
+ str[pos + 2, 1] != 'I' && (
+ str[pos + 2, 1] != 'E' ||
+ str[pos - 2, 6] =~ /^(B|M)ACHER$/
+ ) then
+ return :K, :K, 2
+ elsif 0 == pos && 'CAESAR' == str[pos, 6]
+ return :S, :S, 2
+ elsif 'CHIA' == str[pos, 4]
+ return :K, :K, 2
+ elsif 'CH' == str[pos, 2]
+ if pos > 0 && 'CHAE' == str[pos, 4]
+ return :K, :X, 2
+ elsif 0 == pos && (
+ ['HARAC', 'HARIS'].include?(str[pos + 1, 5]) ||
+ ['HOR', 'HYM', 'HIA', 'HEM'].include?(str[pos + 1, 3])
+ ) && str[0, 5] != 'CHORE' then
+ return :K, :K, 2
+ elsif ['VAN ','VON '].include?(str[0, 4]) ||
+ 'SCH' == str[0, 3] ||
+ ['ORCHES','ARCHIT','ORCHID'].include?(str[pos - 2, 6]) ||
+ ['T','S'].include?(str[pos + 2, 1]) || (
+ ((0 == pos) || ['A','O','U','E'].include?(str[pos - 1, 1])) &&
+ ['L','R','N','M','B','H','F','V','W',' '].include?(str[pos + 2, 1])
+ ) then
+ return :K, :K, 2
+ elsif pos > 0
+ return ('MC' == str[0, 2] ? 'K' : 'X'), 'K', 2
+ else
+ return :X, :X, 2
+ end
+ elsif 'CZ' == str[pos, 2] && 'WICZ' != str[pos - 2, 4]
+ return :S, :X, 2
+ elsif 'CIA' == str[pos + 1, 3]
+ return :X, :X, 3
+ elsif 'CC' == str[pos, 2] && !(1 == pos && 'M' == str[0, 1])
+ if /^I|E|H$/ =~ str[pos + 2, 1] && 'HU' != str[pos + 2, 2]
+ if (1 == pos && 'A' == str[pos - 1, 1]) ||
+ /^UCCE(E|S)$/ =~ str[pos - 1, 5] then
+ return :KS, :KS, 3
+ else
+ return :X, :X, 3
+ end
+ else
+ return :K, :K, 2
+ end
+ elsif /^C(K|G|Q)$/ =~ str[pos, 2]
+ return :K, :K, 2
+ elsif /^C(I|E|Y)$/ =~ str[pos, 2]
+ return :S, (/^CI(O|E|A)$/ =~ str[pos, 3] ? :X : :S), 2
+ else
+ if /^ (C|Q|G)$/ =~ str[pos + 1, 2]
+ return :K, :K, 3
+ else
+ return :K, :K, (/^C|K|Q$/ =~ str[pos + 1, 1] && !(['CE','CI'].include?(str[pos + 1, 2])) ? 2 : 1)
+ end
+ end
+ when 'D'
+ if 'DG' == str[pos, 2]
+ if /^I|E|Y$/ =~ str[pos + 2, 1]
+ return :J, :J, 3
+ else
+ return :TK, :TK, 2
+ end
+ else
+ return :T, :T, (/^D(T|D)$/ =~ str[pos, 2] ? 2 : 1)
+ end
+ when 'F'
+ return :F, :F, ('F' == str[pos + 1, 1] ? 2 : 1)
+ when 'G'
+ if 'H' == str[pos + 1, 1]
+ if pos > 0 && !vowel?(str[pos - 1, 1])
+ return :K, :K, 2
+ elsif 0 == pos
+ if 'I' == str[pos + 2, 1]
+ return :J, :J, 2
+ else
+ return :K, :K, 2
+ end
+ elsif (pos > 1 && /^B|H|D$/ =~ str[pos - 2, 1]) ||
+ (pos > 2 && /^B|H|D$/ =~ str[pos - 3, 1]) ||
+ (pos > 3 && /^B|H$/ =~ str[pos - 4, 1])
+ return nil, nil, 2
+ else
+ if (pos > 2 && 'U' == str[pos - 1, 1] && /^C|G|L|R|T$/ =~ str[pos - 3, 1])
+ return :F, :F, 2
+ elsif pos > 0 && 'I' != str[pos - 1, 1]
+ return :K, :K, 2
+ else
+ return nil, nil, 2
+ end
+ end
+ elsif 'N' == str[pos + 1, 1]
+ if 1 == pos && vowel?(str[0, 1]) && !slavo_germanic?(str)
+ return :KN, :N, 2
+ else
+ if 'EY' != str[pos + 2, 2] && 'Y' != str[pos + 1, 1] && !slavo_germanic?(str)
+ return :N, :KN, 2
+ else
+ return :KN, :KN, 2
+ end
+ end
+ elsif 'LI' == str[pos + 1, 2] && !slavo_germanic?(str)
+ return :KL, :L, 2
+ elsif 0 == pos && ('Y' == str[pos + 1, 1] || /^(E(S|P|B|L|Y|I|R)|I(B|L|N|E))$/ =~ str[pos + 1, 2])
+ return :K, :J, 2
+ elsif (('ER' == str[pos + 1, 2] || 'Y' == str[pos + 1, 1]) &&
+ /^(D|R|M)ANGER$/ !~ str[0, 6] &&
+ /^E|I$/ !~ str[pos - 1, 1] &&
+ /^(R|O)GY$/ !~ str[pos - 1, 3])
+ return :K, :J, 2
+ elsif /^E|I|Y$/ =~ str[pos + 1, 1] || /^(A|O)GGI$/ =~ str[pos - 1, 4]
+ if (/^V(A|O)N $/ =~ str[0, 4] || 'SCH' == str[0, 3]) || 'ET' == str[pos + 1, 2]
+ return :K, :K, 2
+ else
+ if 'IER ' == str[pos + 1, 4]
+ return :J, :J, 2
+ else
+ return :J, :K, 2
+ end
+ end
+ elsif 'G' == str[pos + 1, 1]
+ return :K, :K, 2
+ else
+ return :K, :K, 1
+ end
+ when 'H'
+ if (0 == pos || vowel?(str[pos - 1, 1])) && vowel?(str[pos + 1, 1])
+ return :H, :H, 2
+ else
+ return nil, nil, 1
+ end
+ when 'J'
+ if 'JOSE' == str[pos, 4] || 'SAN ' == str[0, 4]
+ if (0 == pos && ' ' == str[pos + 4, 1]) || 'SAN ' == str[0, 4]
+ return :H, :H, 1
+ else
+ return :J, :H, 1
+ end
+ else
+ current = ('J' == str[pos + 1, 1] ? 2 : 1)
+
+ if 0 == pos && 'JOSE' != str[pos, 4]
+ return :J, :A, current
+ else
+ if vowel?(str[pos - 1, 1]) && !slavo_germanic?(str) && /^A|O$/ =~ str[pos + 1, 1]
+ return :J, :H, current
+ else
+ if last == pos
+ return :J, nil, current
+ else
+ if /^L|T|K|S|N|M|B|Z$/ !~ str[pos + 1, 1] && /^S|K|L$/ !~ str[pos - 1, 1]
+ return :J, :J, current
+ else
+ return nil, nil, current
+ end
+ end
+ end
+ end
+ end
+ when 'K'
+ return :K, :K, ('K' == str[pos + 1, 1] ? 2 : 1)
+ when 'L'
+ if 'L' == str[pos + 1, 1]
+ if (((length - 3) == pos && /^(ILL(O|A)|ALLE)$/ =~ str[pos - 1, 4]) ||
+ ((/^(A|O)S$/ =~ str[last - 1, 2] || /^A|O$/ =~ str[last, 1]) && 'ALLE' == str[pos - 1, 4]))
+ return :L, nil, 2
+ else
+ return :L, :L, 2
+ end
+ else
+ return :L, :L, 1
+ end
+ when 'M'
+ if ('UMB' == str[pos - 1, 3] &&
+ ((last - 1) == pos || 'ER' == str[pos + 2, 2])) || 'M' == str[pos + 1, 1]
+ return :M, :M, 2
+ else
+ return :M, :M, 1
+ end
+ when 'N'
+ return :N, :N, ('N' == str[pos + 1, 1] ? 2 : 1)
+ when 'Ñ'
+ return :N, :N, 1
+ when 'P'
+ if 'H' == str[pos + 1, 1]
+ return :F, :F, 2
+ else
+ return :P, :P, (/^P|B$/ =~ str[pos + 1, 1] ? 2 : 1)
+ end
+ when 'Q'
+ return :K, :K, ('Q' == str[pos + 1, 1] ? 2 : 1)
+ when 'R'
+ current = ('R' == str[pos + 1, 1] ? 2 : 1)
+
+ if last == pos && !slavo_germanic?(str) && 'IE' == str[pos - 2, 2] && /^M(E|A)$/ !~ str[pos - 4, 2]
+ return nil, :R, current
+ else
+ return :R, :R, current
+ end
+ when 'S'
+ if /^(I|Y)SL$/ =~ str[pos - 1, 3]
+ return nil, nil, 1
+ elsif 0 == pos && 'SUGAR' == str[pos, 5]
+ return :X, :S, 1
+ elsif 'SH' == str[pos, 2]
+ if /^H(EIM|OEK|OLM|OLZ)$/ =~ str[pos + 1, 4]
+ return :S, :S, 2
+ else
+ return :X, :X, 2
+ end
+ elsif /^SI(O|A)$/ =~ str[pos, 3] || 'SIAN' == str[pos, 4]
+ return :S, (slavo_germanic?(str) ? :S : :X), 3
+ elsif (0 == pos && /^M|N|L|W$/ =~ str[pos + 1, 1]) || 'Z' == str[pos + 1, 1]
+ return :S, :X, ('Z' == str[pos + 1, 1] ? 2 : 1)
+ elsif 'SC' == str[pos, 2]
+ if 'H' == str[pos + 2, 1]
+ if /^OO|ER|EN|UY|ED|EM$/ =~ str[pos + 3, 2]
+ return (/^E(R|N)$/ =~ str[pos + 3, 2] ? :X : :SK), :SK, 3
+ else
+ return :X, ((0 == pos && !vowel?(str[3, 1]) && ('W' != str[pos + 3, 1])) ? :S : :X), 3
+ end
+ elsif /^I|E|Y$/ =~ str[pos + 2, 1]
+ return :S, :S, 3
+ else
+ return :SK, :SK, 3
+ end
+ else
+ return (last == pos && /^(A|O)I$/ =~ str[pos - 2, 2] ? nil : 'S'), 'S', (/^S|Z$/ =~ str[pos + 1, 1] ? 2 : 1)
+ end
+ when 'T'
+ if 'TION' == str[pos, 4]
+ return :X, :X, 3
+ elsif /^T(IA|CH)$/ =~ str[pos, 3]
+ return :X, :X, 3
+ elsif 'TH' == str[pos, 2] || 'TTH' == str[pos, 3]
+ if /^(O|A)M$/ =~ str[pos + 2, 2] || /^V(A|O)N $/ =~ str[0, 4] || 'SCH' == str[0, 3]
+ return :T, :T, 2
+ else
+ return 0, :T, 2
+ end
+ else
+ return :T, :T, (/^T|D$/ =~ str[pos + 1, 1] ? 2 : 1)
+ end
+ when 'V'
+ return :F, :F, ('V' == str[pos + 1, 1] ? 2 : 1)
+ when 'W'
+ if 'WR' == str[pos, 2]
+ return :R, :R, 2
+ end
+ pri, sec = nil, nil
+
+ if 0 == pos && (vowel?(str[pos + 1, 1]) || 'WH' == str[pos, 2])
+ pri = :A
+ sec = vowel?(str[pos + 1, 1]) ? :F : :A
+ end
+
+ if (last == pos && vowel?(str[pos - 1, 1])) || 'SCH' == str[0, 3] ||
+ /^EWSKI|EWSKY|OWSKI|OWSKY$/ =~ str[pos - 1, 5]
+ return pri, "#{sec}F".intern, 1
+ elsif /^WI(C|T)Z$/ =~ str[pos, 4]
+ return "#{pri}TS".intern, "#{sec}FX".intern, 4
+ else
+ return pri, sec, 1
+ end
+ when 'X'
+ current = (/^C|X$/ =~ str[pos + 1, 1] ? 2 : 1)
+
+ if !(last == pos && (/^(I|E)AU$/ =~ str[pos - 3, 3] || /^(A|O)U$/ =~ str[pos - 2, 2]))
+ return :KS, :KS, current
+ else
+ return nil, nil, current
+ end
+ when 'Z'
+ if 'H' == str[pos + 1, 1]
+ return :J, :J, 2
+ else
+ current = ('Z' == str[pos + 1, 1] ? 2 : 1)
+
+ if /^Z(O|I|A)$/ =~ str[pos + 1, 2] || (slavo_germanic?(str) && (pos > 0 && 'T' != str[pos - 1, 1]))
+ return :S, :TS, current
+ else
+ return :S, :S, current
+ end
+ end
+ else
+ return nil, nil, 1
+ end
+ end # def double_metaphone_lookup
+
+ extend self
+
+end # module Metaphone
+end # module Text
+
+class String
+ def double_metaphone; Text::Metaphone.double_metaphone(self); end
+end
Added: trunk/lib/fantasdic/text/levenshtein.rb
==============================================================================
--- (empty file)
+++ trunk/lib/fantasdic/text/levenshtein.rb Tue Jan 6 17:13:11 2009
@@ -0,0 +1,69 @@
+#
+# Levenshtein distance algorithm implementation for Ruby, with UTF-8 support.
+#
+# The Levenshtein distance is a measure of how similar two strings s and t are,
+# calculated as the number of deletions/insertions/substitutions needed to
+# transform s into t. The greater the distance, the more the strings differ.
+#
+# The Levenshtein distance is also sometimes referred to as the
+# easier-to-pronounce-and-spell 'edit distance'.
+#
+# Author: Paul Battley (pbattley gmail com)
+#
+
+module Text # :nodoc:
+module Levenshtein
+
+ # Calculate the Levenshtein distance between two strings +str1+ and +str2+.
+ # +str1+ and +str2+ should be ASCII, UTF-8, or a one-byte-per character encoding such
+ # as ISO-8859-*.
+ #
+ # The strings will be treated as UTF-8 if $KCODE is set appropriately (i.e. 'u').
+ # Otherwise, the comparison will be performed byte-by-byte. There is no specific support
+ # for Shift-JIS or EUC strings.
+ #
+ # When using Unicode text, be aware that this algorithm does not perform normalisation.
+ # If there is a possibility of different normalised forms being used, normalisation
+ # should be performed beforehand.
+ #
+ def distance(str1, str2)
+ if $KCODE =~ /^U/i
+ unpack_rule = 'U*'
+ else
+ unpack_rule = 'C*'
+ end
+ s = str1.unpack(unpack_rule)
+ t = str2.unpack(unpack_rule)
+ n = s.length
+ m = t.length
+ return m if (0 == n)
+ return n if (0 == m)
+
+ d = (0..m).to_a
+ x = nil
+
+ (0...n).each do |i|
+ e = i+1
+ (0...m).each do |j|
+ cost = (s[i] == t[j]) ? 0 : 1
+ x = [
+ d[j+1] + 1, # insertion
+ e + 1, # deletion
+ d[j] + cost # substitution
+ ].min
+ d[j] = e
+ e = x
+ end
+ d[m] = x
+ end
+
+ return x
+ end
+
+ extend self
+end
+end
+
+class String
+ def levenshtein(str); Text::Levenshtein.distance(self, str); end
+end
\ No newline at end of file
Added: trunk/lib/fantasdic/text/metaphone.rb
==============================================================================
--- (empty file)
+++ trunk/lib/fantasdic/text/metaphone.rb Tue Jan 6 17:13:11 2009
@@ -0,0 +1,101 @@
+#
+# An implementation of the Metaphone phonetic coding system in Ruby.
+#
+# Metaphone encodes names into a phonetic form such that similar-sounding names
+# have the same or similar Metaphone encodings.
+#
+# The original system was described by Lawrence Philips in Computer Language
+# Vol. 7 No. 12, December 1990, pp 39-43.
+#
+# As there are multiple implementations of Metaphone, each with their own
+# quirks, I have based this on my interpretation of the algorithm specification.
+# Even LP's original BASIC implementation appears to contain bugs (specifically
+# with the handling of CC and MB), when compared to his explanation of the
+# algorithm.
+#
+# I have also compared this implementation with that found in PHP's standard
+# library, which appears to mimic the behaviour of LP's original BASIC
+# implementation. For compatibility, these rules can also be used by passing
+# :buggy=>true to the methods.
+#
+# Author: Paul Battley (pbattley gmail com)
+#
+
+module Text # :nodoc:
+module Metaphone
+
+ module Rules # :nodoc:all
+
+ # Metaphone rules. These are simply applied in order.
+ #
+ STANDARD = [
+ # Regexp, replacement
+ [ /([bcdfhjklmnpqrstvwxyz])\1+/,
+ '\1' ], # Remove doubled consonants except g.
+ # [PHP] remove c from regexp.
+ [ /^ae/, 'E' ],
+ [ /^[gkp]n/, 'N' ],
+ [ /^wr/, 'R' ],
+ [ /^x/, 'S' ],
+ [ /^wh/, 'W' ],
+ [ /mb$/, 'M' ], # [PHP] remove $ from regexp.
+ [ /(?!^)sch/, 'SK' ],
+ [ /th/, '0' ],
+ [ /t?ch|sh/, 'X' ],
+ [ /c(?=ia)/, 'X' ],
+ [ /[st](?=i[ao])/, 'X' ],
+ [ /s?c(?=[iey])/, 'S' ],
+ [ /[cq]/, 'K' ],
+ [ /dg(?=[iey])/, 'J' ],
+ [ /d/, 'T' ],
+ [ /g(?=h[^aeiou])/, '' ],
+ [ /gn(ed)?/, 'N' ],
+ [ /([^g]|^)g(?=[iey])/,
+ '\1J' ],
+ [ /g+/, 'K' ],
+ [ /ph/, 'F' ],
+ [ /([aeiou])h(?=\b|[^aeiou])/,
+ '\1' ],
+ [ /[wy](?![aeiou])/, '' ],
+ [ /z/, 'S' ],
+ [ /v/, 'F' ],
+ [ /(?!^)[aeiou]+/, '' ],
+ ]
+
+ # The rules for the 'buggy' alternate implementation used by PHP etc.
+ #
+ BUGGY = STANDARD.dup
+ BUGGY[0] = [ /([bdfhjklmnpqrstvwxyz])\1+/, '\1' ]
+ BUGGY[6] = [ /mb/, 'M' ]
+ end
+
+ # Returns the Metaphone representation of a string. If the string contains
+ # multiple words, each word in turn is converted into its Metaphone
+ # representation. Note that only the letters A-Z are supported, so any
+ # language-specific processing should be done beforehand.
+ #
+ # If the :buggy option is set, alternate 'buggy' rules are used.
+ #
+ def metaphone(str, options={})
+ return str.strip.split(/\s+/).map { |w| metaphone_word(w, options) }.join(' ')
+ end
+
+private
+
+ def metaphone_word(w, options={})
+ # Normalise case and remove non-ASCII
+ s = w.downcase.gsub(/[^a-z]/, '')
+ # Apply the Metaphone rules
+ rules = options[:buggy] ? Rules::BUGGY : Rules::STANDARD
+ rules.each { |rx, rep| s.gsub!(rx, rep) }
+ return s.upcase
+ end
+
+ extend self
+
+end
+end
+
+class String
+ def metaphone; Text::Metaphone.metaphone(self); end
+end
\ No newline at end of file
Added: trunk/lib/fantasdic/text/porter_stemming.rb
==============================================================================
--- (empty file)
+++ trunk/lib/fantasdic/text/porter_stemming.rb Tue Jan 6 17:13:11 2009
@@ -0,0 +1,175 @@
+#
+# This is the Porter Stemming algorithm, ported to Ruby from the
+# version coded up in Perl. It's easy to follow against the rules
+# in the original paper in:
+#
+# Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+# no. 3, pp 130-137,
+#
+# Taken from http://www.tartarus.org/~martin/PorterStemmer (Public Domain)
+#
+module Text # :nodoc:
+module PorterStemming
+
+ STEP_2_LIST = {
+ 'ational' => 'ate', 'tional' => 'tion', 'enci' => 'ence', 'anci' => 'ance',
+ 'izer' => 'ize', 'bli' => 'ble',
+ 'alli' => 'al', 'entli' => 'ent', 'eli' => 'e', 'ousli' => 'ous',
+ 'ization' => 'ize', 'ation' => 'ate',
+ 'ator' => 'ate', 'alism' => 'al', 'iveness' => 'ive', 'fulness' => 'ful',
+ 'ousness' => 'ous', 'aliti' => 'al',
+ 'iviti' => 'ive', 'biliti' => 'ble', 'logi' => 'log'
+ }
+
+ STEP_3_LIST = {
+ 'icate' => 'ic', 'ative' => '', 'alize' => 'al', 'iciti' => 'ic',
+ 'ical' => 'ic', 'ful' => '', 'ness' => ''
+ }
+
+ SUFFIX_1_REGEXP = /(
+ ational |
+ tional |
+ enci |
+ anci |
+ izer |
+ bli |
+ alli |
+ entli |
+ eli |
+ ousli |
+ ization |
+ ation |
+ ator |
+ alism |
+ iveness |
+ fulness |
+ ousness |
+ aliti |
+ iviti |
+ biliti |
+ logi)$/x
+
+ SUFFIX_2_REGEXP = /(
+ al |
+ ance |
+ ence |
+ er |
+ ic |
+ able |
+ ible |
+ ant |
+ ement |
+ ment |
+ ent |
+ ou |
+ ism |
+ ate |
+ iti |
+ ous |
+ ive |
+ ize)$/x
+
+ C = "[^aeiou]" # consonant
+ V = "[aeiouy]" # vowel
+ CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
+ VV = "#{V}(?>[aeiou]*)" # vowel sequence
+
+ MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
+ MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
+ MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
+ VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
+
+ def self.stem(word)
+
+ # make a copy of the given object and convert it to a string.
+ word = word.dup.to_str
+
+ return word if word.length < 3
+
+ # now map initial y to Y so that the patterns never treat it as vowel
+ word[0] = 'Y' if word[0] == ?y
+
+ # Step 1a
+ if word =~ /(ss|i)es$/
+ word = $` + $1
+ elsif word =~ /([^s])s$/
+ word = $` + $1
+ end
+
+ # Step 1b
+ if word =~ /eed$/
+ word.chop! if $` =~ MGR0
+ elsif word =~ /(ed|ing)$/
+ stem = $`
+ if stem =~ VOWEL_IN_STEM
+ word = stem
+ case word
+ when /(at|bl|iz)$/ then word << "e"
+ when /([^aeiouylsz])\1$/ then word.chop!
+ when /^#{CC}#{V}[^aeiouwxy]$/o then word << "e"
+ end
+ end
+ end
+
+ if word =~ /y$/
+ stem = $`
+ word = stem + "i" if stem =~ VOWEL_IN_STEM
+ end
+
+ # Step 2
+ if word =~ SUFFIX_1_REGEXP
+ stem = $`
+ suffix = $1
+ # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
+ if stem =~ MGR0
+ word = stem + STEP_2_LIST[suffix]
+ end
+ end
+
+ # Step 3
+ if word =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
+ stem = $`
+ suffix = $1
+ if stem =~ MGR0
+ word = stem + STEP_3_LIST[suffix]
+ end
+ end
+
+ # Step 4
+ if word =~ SUFFIX_2_REGEXP
+ stem = $`
+ if stem =~ MGR1
+ word = stem
+ end
+ elsif word =~ /(s|t)(ion)$/
+ stem = $` + $1
+ if stem =~ MGR1
+ word = stem
+ end
+ end
+
+ # Step 5
+ if word =~ /e$/
+ stem = $`
+ if (stem =~ MGR1) ||
+ (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
+ word = stem
+ end
+ end
+
+ if word =~ /ll$/ && word =~ MGR1
+ word.chop!
+ end
+
+ # and turn initial Y back to y
+ word[0] = 'y' if word[0] == ?Y
+
+ word
+ end
+
+end
+end
+
+class String
+ def stem; Text::PorterStemming.stem(self); end
+end
\ No newline at end of file
Added: trunk/lib/fantasdic/text/soundex.rb
==============================================================================
--- (empty file)
+++ trunk/lib/fantasdic/text/soundex.rb Tue Jan 6 17:13:11 2009
@@ -0,0 +1,66 @@
+#
+# Ruby implementation of the Soundex algorithm,
+# as described by Knuth in volume 3 of The Art of Computer Programming.
+#
+# Author: Michael Neumann (neumann s-direktnet de)
+#
+
+module Text # :nodoc:
+module Soundex
+
+ def soundex(str_or_arr)
+ case str_or_arr
+ when String
+ soundex_str(str_or_arr)
+ when Array
+ str_or_arr.collect{|ele| soundex_str(ele)}
+ else
+ nil
+ end
+ end
+ module_function :soundex
+
+ private
+
+ #
+ # returns nil if the value couldn't be calculated (empty-string, wrong-character)
+ # do not change the parameter "str"
+ #
+ def soundex_str(str)
+ return nil if str.empty?
+
+ str = str.upcase
+ last_code = get_code(str[0,1])
+ soundex_code = str[0,1]
+
+ for index in 1...(str.size) do
+ return soundex_code if soundex_code.size == 4
+
+ code = get_code(str[index,1])
+
+ if code == "0" then
+ last_code = nil
+ elsif code == nil then
+ return nil
+ elsif code != last_code then
+ soundex_code += code
+ last_code = code
+ end
+ end # for
+
+ return soundex_code + "000"[0,4-soundex_code.size]
+ end
+ module_function :soundex_str
+
+ def get_code(char)
+ char.tr! "AEIOUYWHBPFVCSKGJQXZDTLMNR", "00000000111122222222334556"
+ end
+ module_function :get_code
+
+end # module Soundex
+end # module Text
+
+
+class String
+ def soundex; Text::Soundex.soundex(self); end
+end
\ No newline at end of file
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]