fantasdic r327 - in trunk: . lib/fantasdic/sources test test/data



Author: mblondel
Date: Fri Aug 22 18:22:17 2008
New Revision: 327
URL: http://svn.gnome.org/viewvc/fantasdic?rev=327&view=rev

Log:
    * test/test_edict_file.rb:
    * test/data/edict.eucjp.gz:
    * test/data/edict.utf8.gz:
    * test/data/edict.utf8:
    * test/data/edict.eucjp: Unit test for EdictFile source + test data.

    * lib/fantasdic/sources/edict_file.rb: Better way to choose between the two
    available implementations. Fixed a bunch of bugs spotted by the unit test.


Added:
   trunk/test/data/edict.eucjp
   trunk/test/data/edict.eucjp.gz   (contents, props changed)
   trunk/test/data/edict.utf8
   trunk/test/data/edict.utf8.gz   (contents, props changed)
   trunk/test/test_edict_file.rb
Modified:
   trunk/ChangeLog
   trunk/lib/fantasdic/sources/edict_file.rb

Modified: trunk/lib/fantasdic/sources/edict_file.rb
==============================================================================
--- trunk/lib/fantasdic/sources/edict_file.rb	(original)
+++ trunk/lib/fantasdic/sources/edict_file.rb	Fri Aug 22 18:22:17 2008
@@ -20,13 +20,7 @@
 module Fantasdic
 module Source
 
-class EdictFile < Base
-    authors ["Mathieu Blondel"]
-    title  _("EDICT file")
-    description _("Look up words in an EDICT file.")
-    license Fantasdic::GPL
-    copyright "Copyright (C) 2007 Mathieu Blondel"
-    no_databases true
+class EdictFileBase < Base
 
     STRATEGIES_DESC = {
         "define" => "Results match with the word exactly.",
@@ -45,7 +39,6 @@
     HAVE_EGREP = (File.which("egrep") and File.which("iconv") and
                   File.which("gunzip") and File.which("cat"))
 
-
     class ConfigWidget < Base::ConfigWidget
         def initialize(*arg)
             super(*arg)
@@ -183,13 +176,13 @@
         wesc = escape_string(word)
 
         if word.latin?
-            regexp = "\/#{wesc}\/"
+            regexp = "/#{wesc}/"
         elsif word.kana?
-            regexp = "^#{wesc} |\[#{wesc}\]"
+            regexp = "^#{wesc} |\\[#{wesc}\\]"
         elsif word.japanese?
             regexp = "^#{wesc} "
         else
-            regexp = "^#{wesc}|\[#{wesc}\]|\/#{wesc}\/"
+            regexp = "^#{wesc}|\\[#{wesc}\\]|/#{wesc}/"
         end
         
         db = File.basename(@hash[:filename])
@@ -198,7 +191,7 @@
         match_with_regexp(regexp).map do |line|
             defi = Definition.new
             defi.word = word
-            defi.body = line
+            defi.body = line.strip
             defi.database = db
             defi.description = db_capitalize
             defi
@@ -232,7 +225,7 @@
 
     def match_word(db, word)
         arr = []
-        match_suffix(db, word).each do |line|
+        match_substring(db, word).each do |line|
             get_fields(line).each do |field|
                 field.split(" ").each do |w|
                     if w ==  word
@@ -249,13 +242,13 @@
     def match_prefix(db, word)
         wesc = escape_string(word)
         if word.latin?
-            regexp = "\/#{wesc}[^\/]*\/"
+            regexp = "/#{wesc}"
         elsif word.kana?
-            regexp = "^#{wesc}|\[#{wesc}[^\]]*\]"
+            regexp = "^#{wesc}| \\[#{wesc}"
         elsif word.japanese?
             regexp = "^#{wesc}"
         else
-            regexp = "^#{wesc}|\[#{wesc}[^\]]*\]|\/#{wesc}[^\/]*\/"
+            regexp = "^#{wesc}|\\[#{wesc}|/#{wesc}"
         end
 
         match_with_regexp(regexp)
@@ -264,13 +257,13 @@
     def match_suffix(db, word)
         wesc = escape_string(word)
         if word.latin?
-            regexp = "\/[^\/]*#{wesc}\/"
+            regexp = "#{wesc}/"
         elsif word.kana?
-            regexp = "^[^\[]*#{wesc} |\[[^\]]*#{wesc}\]"
+            regexp = "#{wesc} \\[|#{wesc}\\]"
         elsif word.japanese?
-            regexp = "^[^\[]*#{wesc} "
+            regexp = "#{wesc} \\["
         else
-            regexp = "^[^\[]*#{wesc} |\[[^\]]*#{wesc}\]|\/[^\/]*#{wesc}\/"
+            regexp = "#{wesc} \\[|#{wesc}\\]|#{wesc}/"
         end
 
         match_with_regexp(regexp)
@@ -321,67 +314,74 @@
         Regexp.escape(str).sub('"', "\\\"")
     end
 
-end # class EdictFile
+end # class EdictFileBase
 
-if EdictFile::HAVE_EGREP
-    # Using egrep. This is significantly faster!
-    class EdictFile
-        def initialize(*args)
-            super(*args)
-            edict_file_open.close # Tries to open file to ensure it exists
-        end
 
-        private
+# Using egrep. This is significantly faster!
+class EdictFileEgrep < EdictFileBase
+    def initialize(*args)
+        super(*args)
+        edict_file_open.close # Tries to open file to ensure it exists
+    end
 
-        def match_with_regexp(regexp)
-            cmd = get_command(regexp)            
-            IO.popen(cmd).readlines
-        end
+    private
 
-        def get_command(regexp)
-            cmd = []
+    def match_with_regexp(regexp)
+        cmd = get_command(regexp)
+        IO.popen(cmd).readlines
+    end
 
-            cmd << "cat #{@hash[:filename]}"
+    def get_command(regexp)
+        cmd = []
 
-            if @hash[:filename] =~ /.gz$/
-                cmd << "gunzip -c"
-            end
+        cmd << "cat #{@hash[:filename]}"
 
-            if @hash[:encoding] and @hash[:encoding] != "UTF-8"
-                cmd << "iconv -f #{@hash[:encoding]} -t UTF-8"
-            end
+        if @hash[:filename] =~ /.gz$/
+            cmd << "gunzip -c"
+        end
 
-            cmd << "egrep \"#{regexp}\""
-            
-            cmd.join(" | ")
+        if @hash[:encoding] and @hash[:encoding] != "UTF-8"
+            cmd << "iconv -f #{@hash[:encoding]} -t UTF-8"
         end
 
+        cmd << "egrep \"#{regexp}\""
+
+        cmd.join(" | ")
     end
 
-else
-    # Pure Ruby
-    class EdictFile
-        def initialize(*args)
-            super(*args)
-            if @hash and @hash[:encoding] != "UTF-8"
-                # FIXME: Find a way to look up words in EUC-JP with reasonable
-                # performance...
-                raise Source::SourceError,
-                      _("Encoding not supported.")
-            end
+end
+
+# Pure Ruby
+class EdictFileRuby < EdictFileBase
+    def initialize(*args)
+        super(*args)
+        if @hash and @hash[:encoding] != "UTF-8"
+            # FIXME: Find a way to look up words in EUC-JP with reasonable
+            # performance...
+            raise Source::SourceError,
+                    _("Encoding not supported.")
         end
+    end
 
-        private
+    private
 
-        def match_with_regexp(regexp)
-            edict_file_open do |file|
-                file.grep(Regexp.new(regexp))
-            end
+    def match_with_regexp(regexp)
+        edict_file_open do |file|
+            file.grep(Regexp.new(regexp))
         end
-
     end
+end
 
-end # if EdictFile::HAVE_EGREP
+class EdictFile < (EdictFileBase::HAVE_EGREP ? EdictFileEgrep : EdictFileRuby)
+    authors ["Mathieu Blondel"]
+    title  _("EDICT file")
+    description _("Look up words in an EDICT file.")
+    license Fantasdic::GPL
+    copyright "Copyright (C) 2007 Mathieu Blondel"
+    no_databases true    
+end
 
 end
 end
+
+Fantasdic::Source::Base.register_source(Fantasdic::Source::EdictFile)

Added: trunk/test/data/edict.eucjp
==============================================================================
--- (empty file)
+++ trunk/test/data/edict.eucjp	Fri Aug 22 18:22:17 2008
@@ -0,0 +1,20 @@
+ó [¤¦¤·¤ç] /(n) tooth decay/
+󤢤´] /(n) chin/jaw/
+󤬤ó(n) niche or alcove for an image/
+ó [¤¬¤ó¦] /(n) Buddhist altar light/
+ó¹Ã[¤«¤á¤³¤¦] /(oK) (n) tortoise shell/
+ó»Ò[¤«¤á¤³] /(oK) (n) (1) young turtle (tortoise)/(2) turtle (tortoise) shell/
+ó¼ê¤«¤á¤Æ /(oK) (n) barnacle/
+ó [¤­¤«¤ó(oK) (n) pattern/example/model/paragon/mirror/
+ó [¤­¤³¤¦] /(oK) (n) tortoise shell/
+ó [¤­¤Ã³¤¦] /(oK) (n) tortoise shell/
+ó [¤«¤á¤·] /(oK) (n) (uk) shield bug/stink bug/
+ó [¤«¤á¤é/(oK) (n) tortoise-shell divination/
+ó [¤­¤Ü¯] /(oK) (n) tortoise-shell divination/
+ó [¤­¤ì] /(oK) (n) crack/crevice/fissure/chap/
+ô¤Þ­] /(oK) (n) (uk) yew plum pine (Podocarpus macrophyllus)/
+ô [¤Ïë¤ë/(adv) from afar/over a great distance/all the way/
+ô [¤Ïë] /(iK) (adj-na,adv,n) far/far away/distant/remote/far off/
+ô¤Ë[¤Ïë¤Ë /(iK) (adv) far off/in the distance/long ago/far/by far/far and away/
+ô [¤Ïë¤ë/(adv) from afar/over a great distance/all the way/
+ô¤ê] /(adj-t,adv-to) cold/

Added: trunk/test/data/edict.eucjp.gz
==============================================================================
Binary file. No diff available.

Added: trunk/test/data/edict.utf8
==============================================================================
--- (empty file)
+++ trunk/test/data/edict.utf8	Fri Aug 22 18:22:17 2008
@@ -0,0 +1,20 @@
+éè [ãããã] /(n) tooth decay/
+é [ãã] /(n) chin/jaw/
+é [ãã] /(n) niche or alcove for an image/
+éç [ãããã] /(n) Buddhist altar light/
+éãç [ããããã] /(oK) (n) tortoise shell/
+éãå [ãããã] /(oK) (n) (1) young turtle (tortoise)/(2) turtle (tortoise) shell/
+éãæ [ãããã] /(oK) (n) barnacle/
+éé [ããã] /(oK) (n) pattern/example/model/paragon/mirror/
+éç [ããã] /(oK) (n) tortoise shell/
+éç [ãããã] /(oK) (n) tortoise shell/
+éè [ãããã] /(oK) (n) (uk) shield bug/stink bug/
+éå [ãããã] /(oK) (n) tortoise-shell divination/
+éå [ããã] /(oK) (n) tortoise-shell divination/
+éè [ããã] /(oK) (n) crack/crevice/fissure/chap/
+æ [ãã] /(oK) (n) (uk) yew plum pine (Podocarpus macrophyllus)/
+éã [ãããã] /(adv) from afar/over a great distance/all the way/
+éã [ããã] /(iK) (adj-na,adv,n) far/far away/distant/remote/far off/
+éãã [ãããã] /(iK) (adv) far off/in the distance/long ago/far/by far/far and away/
+éé [ãããã] /(adv) from afar/over a great distance/all the way/
+å [ãã] /(adj-t,adv-to) cold/

Added: trunk/test/data/edict.utf8.gz
==============================================================================
Binary file. No diff available.

Added: trunk/test/test_edict_file.rb
==============================================================================
--- (empty file)
+++ trunk/test/test_edict_file.rb	Fri Aug 22 18:22:17 2008
@@ -0,0 +1,157 @@
+# Fantasdic
+# Copyright (C) 2008 Mathieu Blondel
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+test_dir = File.expand_path(File.dirname(__FILE__))
+top_dir = File.expand_path(File.join(test_dir, ".."))
+lib_dir = File.expand_path(File.join(top_dir, "lib"))
+$test_data_dir = File.expand_path(File.join(test_dir, "data"))
+$LOAD_PATH.unshift(lib_dir)
+
+require "test/unit"
+require "fantasdic"
+require "fantasdic/sources/edict_file"
+
+$KCODE = "u"
+
+class TestEdictFileSource < Test::Unit::TestCase
+    include Fantasdic::Source
+
+    private
+
+    def test_define(source)
+        defs = source.define("*", "éç")
+        assert_equal(defs.length, 2)
+
+        assert_equal(defs[0].word, "éç")
+        assert_equal(defs[0].body, "éç [ããã] /(oK) (n) tortoise shell/")
+
+        assert_equal(defs[1].word, "éç")
+        assert_equal(defs[1].body, "éç [ãããã] /(oK) (n) tortoise shell/")
+
+        defs = source.define("*", "ããã")
+        assert_equal(defs.length, 1)
+
+        assert_equal(defs[0].word, "ããã")
+        assert_equal(defs[0].body, "éç [ããã] /(oK) (n) tortoise shell/")
+
+        defs = source.define("*", "tortoise")
+        assert_equal(defs.length, 0)
+    end
+
+    def test_match_prefix(source)
+        matches = source.match("*", "prefix", "é")
+        key = matches.keys.first
+        assert_equal(matches,
+                     {key=>["éã", "éã", "éãã", "éé"]})
+
+        matches = source.match("*", "prefix", "ãã")
+        assert_equal(matches,
+                     {key=>["éãç", "éãå", "éãæ", "éè", "éå"]})
+
+        matches = source.match("*", "prefix", "(adv)")
+        assert_equal(matches,
+                     {key=>["(adv) from afar/over a great distance/all the way",
+                         "(adv) from afar/over a great distance/all the way"]})
+
+    end
+
+    def test_match_suffix(source)
+        matches = source.match("*", "suffix", "ç")
+        key = matches.keys.first
+        assert_equal(matches,
+                     {key=>["éãç", "éç", "éç"]})
+
+        matches = source.match("*", "suffix", "ãã")
+        assert_equal(matches,
+                     {key=>["éãç", "éç", "éç"]})
+
+        matches = source.match("*", "suffix", "tion")
+        assert_equal(matches,
+                     {key=>["(oK) (n) tortoise-shell divination",
+                            "(oK) (n) tortoise-shell divination"]})
+    end
+
+    def test_match_word(source)
+        matches = source.match("*", "word", "éç")
+        key = matches.keys.first
+        assert_equal(matches,
+                     {key=>["éç", "éç"]})
+
+        matches = source.match("*", "word", "ããã")
+        assert_equal(matches,
+                     {key=>["éç"]})
+
+        matches = source.match("*", "word", "tortoise")
+        assert_equal(matches,
+                     {key=>["(oK) (n) tortoise shell",
+                           "(oK) (n) tortoise shell",
+                           "(oK) (n) tortoise shell"]})
+    end
+
+    def test_match_substring(source)
+        matches = source.match("*", "substring", "é")
+        key = matches.keys.first
+        assert_equal(matches,
+                     {key=>["éãç", "éãå", "éãæ", "éé",
+                            "éç", "éç", "éè", "éå", "éå", "éè"]})
+
+        matches = source.match("*", "substring", "ããã")
+        assert_equal(matches,
+                     {key=>["éãç", "éãå"]})
+
+        matches = source.match("*", "substring", "-shell")
+        assert_equal(matches,
+                     {key=>["(oK) (n) tortoise-shell divination",
+                            "(oK) (n) tortoise-shell divination"]})
+    end
+
+    public
+
+    utf8 = {:filename => File.join($test_data_dir, "edict.utf8"),
+            :encoding => "UTF-8"}
+    utf8gz = {:filename => File.join($test_data_dir, "edict.utf8.gz"),
+                :encoding => "UTF-8"}
+    eucjp = {:filename => File.join($test_data_dir, "edict.eucjp"),
+                :encoding => "EUC-JP"}
+    eucjpgz = {:filename => File.join($test_data_dir, "edict.eucjp.gz"),
+                :encoding => "EUC-JP"}
+
+    [EdictFileRuby, EdictFileEgrep].each do |klass|
+        [utf8, utf8gz, eucjp, eucjpgz].each do |hash|
+            encoding = hash[:encoding].gsub("-", "").downcase
+
+            # EUC-JP is not supported by EdictFileRuby implementation
+            next if klass == EdictFileRuby and encoding == "eucjp"
+
+            klass_short = klass.to_s.split("::").last.downcase
+            gz = hash[:filename] =~ /gz$/ ? "gz" : "nogz"
+
+            method = "test_#{klass_short}_#{encoding}_#{gz}_define"
+            define_method(method) do
+                send("test_define", klass.new(hash))
+            end
+
+            ["prefix", "suffix", "word", "substring"].each do |match|
+                method = "test_#{klass_short}_#{encoding}_#{gz}_#{match}"
+                define_method(method) do
+                    send("test_match_#{match}", klass.new(hash))
+                end
+            end
+        end
+    end
+
+end



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]