dasher r3615 - in trunk: . Src/DasherCore Src/DasherCore/Alphabet



Author: pwelche
Date: Thu Feb 19 17:06:23 2009
New Revision: 3615
URL: http://svn.gnome.org/viewvc/dasher?rev=3615&view=rev

Log:
Fix loading of UTF-8 training text which, for instance, caused crashes when
loading Hebrew or Japanese text. Essentially, multibyte sequences on 1023
byte boundaries weren't handled correctly. There is still redundant code
in place. I tried to decode the UTF-8 into wchar_t, but more and more code
needed to be touched, so I stuck to strings of encoded UTF-8. This may well
want to be revisited, but it will be invasive.



Modified:
   trunk/ChangeLog
   trunk/NEWS
   trunk/Src/DasherCore/Alphabet/Alphabet.cpp
   trunk/Src/DasherCore/Alphabet/Alphabet.h
   trunk/Src/DasherCore/TrainingHelper.cpp

Modified: trunk/NEWS
==============================================================================
--- trunk/NEWS	(original)
+++ trunk/NEWS	Thu Feb 19 17:06:23 2009
@@ -1,7 +1,7 @@
 ============
 Dasher 4.9.0
 ============
-  * New development branch
+  * Fix UTF-8 bug so e.g., Japanese and Hebrew work.
 
 ============
 Dasher 4.7.2

Modified: trunk/Src/DasherCore/Alphabet/Alphabet.cpp
==============================================================================
--- trunk/Src/DasherCore/Alphabet/Alphabet.cpp	(original)
+++ trunk/Src/DasherCore/Alphabet/Alphabet.cpp	Thu Feb 19 17:06:23 2009
@@ -124,7 +124,64 @@
 
 /////////////////////////////////////////////////////////////////////////////
 
-void CAlphabet::GetSymbols(std::vector<symbol > *Symbols, std::string *Input, bool IsMore) const {
+CAlphabet::utf8_length::utf8_length()
+{
+  int i;
+
+  memset(utf8_count_array, 0, sizeof(utf8_count_array));
+  for (i = 0x00; i <= 0x7f; ++i) utf8_count_array[i] = 1;
+  for (i = 0xc0; i <= 0xdf; ++i) utf8_count_array[i] = 2;
+  for (i = 0xe0; i <= 0xef; ++i) utf8_count_array[i] = 3;
+  for (i = 0xf0; i <= 0xf7; ++i) utf8_count_array[i] = 4;
+  max_length = 4;
+/* The following would be valid according to RFC 2279 which was rendered
+ * obsolete by RFC 3629
+ * for (i = 0xf8; i <= 0xfb; ++i) utf8_count_array[i] = 5;
+ * for (i = 0xfc; i <= 0xfd; ++i) utf8_count_array[i] = 6;
+ * max_length = 6;
+ *
+ * and from RFC 3629:
+ * o  The octet values C0, C1, F5 to FF never appear.
+ */
+  utf8_count_array[0xc0] = utf8_count_array[0xc1] = 0;
+  for (i = 0xf5; i <= 0xff; ++i) utf8_count_array[i] = 0;
+}
+
+CAlphabet::utf8_length CAlphabet::m_utf8_count_array;
+
+int CAlphabet::utf8_length::operator[](const int i) const
+{
+  return utf8_count_array[i];
+}
+
+void CAlphabet::GetSymbols(std::vector<symbol> &symbols, std::istream &in) const
+{
+  bool unused;
+  char skip, *utfchar = new char[m_utf8_count_array.max_length + 1];
+  symbol sym;
+  int len, ch = in.peek();
+  while (!in.eof())
+    {
+      len = m_utf8_count_array[ch];
+      if (len == 0)
+        {
+          std::cerr << "Read invalid UTF-8 character 0x" << ch << std::endl;
+          in >> skip;
+        }
+      else
+        {
+          in.read(utfchar, len);
+          utfchar[len] = '\0';
+          sym = TextMap.Get(string(utfchar), &unused);
+          symbols.push_back(sym);
+        }
+      ch = in.peek();
+    }
+  delete [] utfchar;
+}
+
+void CAlphabet::GetSymbols(std::vector<symbol> *Symbols, std::string * Input, bool IsMore) const
+{
   string Tmp;
   symbol CurSymbol = 0, TmpSymbol = 0;
   bool KeyIsPrefix = false;

Modified: trunk/Src/DasherCore/Alphabet/Alphabet.h
==============================================================================
--- trunk/Src/DasherCore/Alphabet/Alphabet.h	(original)
+++ trunk/Src/DasherCore/Alphabet/Alphabet.h	Thu Feb 19 17:06:23 2009
@@ -27,6 +27,7 @@
 #include "GroupInfo.h"
 
 #include <cstdlib>
+#include <iostream>
 #include <vector>
 
 namespace Dasher {
@@ -119,6 +120,7 @@
     // text and so a symbol will be returned for a final "a" even if "ae" is
     // defined as its own symbol. }}}
     void GetSymbols(std::vector<symbol> *Symbols, std::string * Input, bool IsMore) const;
+    void GetSymbols(std::vector<symbol> &symbols, std::istream &in) const;
 
 
     /// Look up symbols corresponding to string. Cannot cope with
@@ -187,6 +189,17 @@
     std::string m_GameModeFile;
     std::string m_DefaultPalette;
 
+    class utf8_length
+      {
+        public:
+          utf8_length();
+          int operator[](const int) const;
+          int max_length;
+        private:
+          int utf8_count_array[0x100];
+      };
+    static utf8_length m_utf8_count_array;
+
     // TODO: This is inane
     std::vector < std::string > m_Characters;   // stores the characters
     std::vector < std::string > m_Display;      // stores how the characters are visually represented in the Dasher nodes

Modified: trunk/Src/DasherCore/TrainingHelper.cpp
==============================================================================
--- trunk/Src/DasherCore/TrainingHelper.cpp	(original)
+++ trunk/Src/DasherCore/TrainingHelper.cpp	Thu Feb 19 17:06:23 2009
@@ -24,6 +24,9 @@
 #include <cstdio>
 #include <cstring>
 #include <expat.h>
+#include <fstream>
+#include <ios>
+#include <iostream>
 #include <vector>
 
 //using namespace Dasher;
@@ -63,34 +66,26 @@
 			   Dasher::CTrainer *pTrainer, 
 			   const Dasher::CAlphabet *pAlphabet) {
   
-  if(strFileName == "")
-    return;
-  
-  FILE *pInputFile;
-  if((pInputFile = fopen(strFileName.c_str(), "r")) == (FILE *) 0)
-    return;
-
-  const int iBufferSize = 1024;
-  char szInputBuffer[iBufferSize];
-  std::string strBuffer;
-  std::vector<Dasher::symbol> vSymbols;
-  int iNumberRead;
-
-  do {
-    iNumberRead = fread(szInputBuffer, 1, iBufferSize - 1, pInputFile);
-    szInputBuffer[iNumberRead] = '\0';
-    strBuffer += szInputBuffer;
-
-    bool bIsMore = (iNumberRead == (iBufferSize - 1));
-    
-    vSymbols.clear();
-    pAlphabet->GetSymbols(&vSymbols, &strBuffer, bIsMore);
+  if (strFileName.empty())
+    {
+      std::cerr << "LoadPlain called with empty filename" << std::endl;
+      return;
+    }
 
-    pTrainer->Train(vSymbols);
+  std::ifstream in(strFileName.c_str(), std::ios::binary);
+  if (in.bad())
+    {
+      std::cerr << "Unable to open file \"" << strFileName << "\" for reading"
+                << std::endl;
+      return;
+    }
 
-  } while(iNumberRead == iBufferSize - 1); 
+  std::vector<Dasher::symbol> vSymbols;
+  vSymbols.clear();
+  pAlphabet->GetSymbols(vSymbols, in);
+  pTrainer->Train(vSymbols);
 
-  fclose(pInputFile);
+  in.close();
 }
 
 void 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]