[dasher] Refactored training code, moving training code from CAlphabet into CTrainer



commit fa5067058330e0809998cf8c85c920bddb0b234c
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date:   Fri Aug 7 15:02:40 2009 +0200

    Refactored training code, moving training code from CAlphabet into
    CTrainer, which is now a subclass of CTrainingHelper and in its
    own file. Mandarin training code thus separated out into its own
    subclass, CMandarinTrainer. (3-Jul-2009)

 ChangeLog                                   |    1 +
 Src/DasherCore/Alphabet/Alphabet.cpp        |   39 ------
 Src/DasherCore/Alphabet/Alphabet.h          |   19 +---
 Src/DasherCore/AlphabetManagerFactory.cpp   |  132 +--------------------
 Src/DasherCore/AlphabetManagerFactory.h     |   26 +----
 Src/DasherCore/DasherInterfaceBase.cpp      |    3 -
 Src/DasherCore/DasherInterfaceBase.h        |    1 -
 Src/DasherCore/Makefile.am                  |    2 +
 Src/DasherCore/NodeCreationManager.cpp      |   15 +--
 Src/DasherCore/Trainer.cpp                  |  169 +++++++++++++++++++++++++++
 Src/DasherCore/Trainer.h                    |   35 ++++++
 Src/DasherCore/TrainingHelper.cpp           |   29 ++---
 Src/DasherCore/TrainingHelper.h             |   33 +++---
 Src/MacOSX/Dasher.xcodeproj/project.pbxproj |    8 ++
 14 files changed, 252 insertions(+), 260 deletions(-)
---
diff --git a/ChangeLog b/ChangeLog
index 7e20c5a..bf05023 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -2,6 +2,7 @@
 
 	* MacOSX build now includes Mandarin Dasher training texts,
 	  PinYinConversionHelper.
+	* Move training code from CAlphabet into CTrainer.
 
 2009-08-06  Patrick Welche <prlw1 cam ac uk>
 
diff --git a/Src/DasherCore/Alphabet/Alphabet.cpp b/Src/DasherCore/Alphabet/Alphabet.cpp
index 7490104..40deb9a 100644
--- a/Src/DasherCore/Alphabet/Alphabet.cpp
+++ b/Src/DasherCore/Alphabet/Alphabet.cpp
@@ -50,8 +50,6 @@ CAlphabet::CAlphabet()
   m_Display.push_back("");
   m_Colours.push_back(-1);
   m_Foreground.push_back("");
-
-  m_pTrainingHelper = new CTrainingHelper;
 }
 
 /////////////////////////////////////////////////////////////////////////////
@@ -66,8 +64,6 @@ CAlphabet::CAlphabet(const CAlphIO::AlphInfo &AlphInfo)
   m_StartConversionSymbol = -1;
   m_EndConversionSymbol = -1;
 
-  m_pTrainingHelper = NULL;
-
   m_strDefaultContext = AlphInfo.m_strDefaultContext;
 
   // Set miscellaneous options
@@ -122,10 +118,6 @@ CAlphabet::CAlphabet(const CAlphIO::AlphInfo &AlphInfo)
 #endif
 }
 
-CAlphabet::~CAlphabet() {
-  delete m_pTrainingHelper;
-}
-
 /////////////////////////////////////////////////////////////////////////////
 
 CAlphabet::utf8_length::utf8_length()
@@ -375,37 +367,6 @@ int CAlphabet::GetTextColour(symbol Symbol) {
   }
 }
 
-
-void 
-CAlphabet::Train(const std::string &strUserLoc, 
-		 const std::string &strSystemLoc, 
-		 CTrainer *pTrainer) {
-
-  std::string strTrainingFile = GetTrainingFile();
-
-  if (strTrainingFile.empty()) {
-#ifdef DEBUG
-    std::cerr << "Trying to load empty training file (location)" << std::endl;
-#endif
-  } else {
-    m_pTrainingHelper->LoadFile(strUserLoc   + strTrainingFile, pTrainer, this);
-    m_pTrainingHelper->LoadFile(strSystemLoc + strTrainingFile, pTrainer, this);
-  }
-}
-
-void 
-CAlphabet::Train(const std::string &strPath, 
-      CTrainer *pTrainer) {
-  
-  if (strPath.empty()) {
-#ifdef DEBUG
-    std::cerr << "Trying to load empty training file (path)" << std::endl;
-#endif
-  } else {
-    m_pTrainingHelper->LoadFile(strPath, pTrainer, this);
-  }
-}
-
 int 
 CAlphabet::GetColour(symbol i, int iPhase) const {
   int iColour = m_Colours[i];
diff --git a/Src/DasherCore/Alphabet/Alphabet.h b/Src/DasherCore/Alphabet/Alphabet.h
index 7a8b885..32942a6 100644
--- a/Src/DasherCore/Alphabet/Alphabet.h
+++ b/Src/DasherCore/Alphabet/Alphabet.h
@@ -31,10 +31,6 @@
 #include <vector>
 
 namespace Dasher {
-  // Forward declarations
-  class CTrainer;
-  class CTrainingHelper;
-
   ///
   /// \defgroup Alphabet Alphabet information
   /// @{
@@ -44,9 +40,6 @@ namespace Dasher {
     CAlphabet();
     CAlphabet(const CAlphIO::AlphInfo & AlphInfo);
 
-    ~CAlphabet();
-
-
     // Return size of alphabet, including control symbols
     int GetNumberSymbols() const {
       return m_Characters.size();
@@ -69,7 +62,7 @@ namespace Dasher {
       return m_DefaultEncoding;
     }
 
-    std::string & GetTrainingFile() {
+    const std::string & GetTrainingFile() const {
       return m_TrainingFile;
     }
     std::string GetGameModeFile() {
@@ -167,13 +160,6 @@ namespace Dasher {
     }
 
     SGroupInfo *m_pBaseGroup;
-
-    void Train(const std::string &strUserLoc, 
-	       const std::string &strSystemLoc, 
-	       CTrainer *pTrainer);
-
-    void Train(const std::string &strPath, 
-	       CTrainer *pTrainer);
     
   private:
 
@@ -207,9 +193,6 @@ namespace Dasher {
     std::vector < std::string > m_Foreground;   // stores the colour of the character foreground
     // ----
 
-    CTrainingHelper *m_pTrainingHelper;
-
-
     SGroupInfo *pFirstGroup;
 
     alphabet_map TextMap;
diff --git a/Src/DasherCore/AlphabetManagerFactory.cpp b/Src/DasherCore/AlphabetManagerFactory.cpp
index f96c257..83deadf 100644
--- a/Src/DasherCore/AlphabetManagerFactory.cpp
+++ b/Src/DasherCore/AlphabetManagerFactory.cpp
@@ -70,7 +70,7 @@ CAlphabetManagerFactory::CAlphabetManagerFactory(CDasherInterfaceBase *pInterfac
     chalphabet.SetAlphabetPointer(m_pCHAlphabet);
     //std::cout<<"CHALphabet size "<<chalphabet.GetSize(); [7603]
     m_pLanguageModel = new CPPMPYLanguageModel(pEventHandler, pSettingsStore, chalphabet, alphabet);
-
+	m_pTrainer = new CMandarinTrainer(m_pLanguageModel, m_pAlphabet, m_pCHAlphabet);
     std::cout<<"Setting PPMPY model"<<std::endl;
   }
   else{
@@ -97,6 +97,7 @@ CAlphabetManagerFactory::CAlphabetManagerFactory(CDasherInterfaceBase *pInterfac
       m_pLanguageModel = new CPPMLanguageModel(pEventHandler, pSettingsStore, alphabet);    
       break;
     }
+    m_pTrainer = new CTrainer(m_pLanguageModel, m_pAlphabet);
   }
 
   m_iLearnContext = m_pLanguageModel->CreateEmptyContext();
@@ -116,132 +117,3 @@ CAlphabetManagerFactory::~CAlphabetManagerFactory() {
 CDasherNode *CAlphabetManagerFactory::GetRoot(CDasherNode *pParent, int iLower, int iUpper, void *pUserData) {
   return m_pAlphabetManager->GetRoot(pParent, iLower, iUpper, pUserData);
 }
-
-CTrainer::CTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet, CAlphabet *pCHAlphabet) {
-  m_pLanguageModel = pLanguageModel;
-  m_pAlphabet = pAlphabet;
-  m_pCHAlphabet = pCHAlphabet;
-  m_Context = m_pLanguageModel->CreateEmptyContext();
-}
-
-void CTrainer::Train(const std::vector<symbol> &vSymbols) {
-
-  for(std::vector<symbol>::const_iterator it(vSymbols.begin()); it != vSymbols.end(); ++it) {
-    m_pLanguageModel->LearnSymbol(m_Context, *it);
-  }
-}
-
-//TrainMandarin is used to train Mandarin Dasher: PPMPYLanguageModel
-//Mandarin training is distinct from normal PPM training in that it uses two separate alphabets, and trains with py-character pairs. Despite so, implementation here may seem out of structure, and it could be necessary to revise later, particularly on robustness to deal with non-unicode chars
-//The training of Mandarin Dasher may evolve in to possible paths: 1.Include punctuation (more work); 2.User defined training files (not sure how); 3.Learning as one types (more work)
-//As Manager is produced, training happens in AlphabetManagerFactory
-
-void CTrainer::TrainMandarin(const std::string &strUserLoc, const std::string &strSystemLoc){
-
-  //TrainMandarin takes in the Super Pin Yin Alphabet, and uses the Mandarin Character alphabet stored in private AlphabetManagerFactory
-
-  std::string strTrainingFile = m_pAlphabet->GetTrainingFile();
-
-  std::string strUserPath = strUserLoc + strTrainingFile;
-  std::string strSystemPath = strSystemLoc + strTrainingFile;
-
-  FILE * fpUser = fopen (strUserPath.c_str(), "rb");
-  FILE * fpSystem = fopen(strSystemPath.c_str(), "rb");
-  FILE * fpTrain = fpSystem;
-  
-  if(!fpTrain) {
-
-    fpTrain = fpUser;
-    if(!fpTrain){
-      printf("Mandarin Training File: cannot open file or incorrect directory\n");
-    return;
-    }
-  }
-  unsigned numberofchar = 0;
-
-
-  size_t charsize = 1024;
-    
-  size_t trainBufferSize = 3*charsize*3;
-  char szBuffer[trainBufferSize];
-    
-  std::string strChar;
-  std::string strPY;
-  char ctemp[4];
-  CLanguageModel::Context trainContext = m_pLanguageModel->CreateEmptyContext();
-  std::string pyID = "ã??";
-  std::vector<symbol> Symchar;
-  std::vector<symbol> Sympy;
-
-  while(!feof(fpTrain)){
-    
-    strPY.clear();
-    strChar.clear();
- 
-    size_t iNumBytes = fread(szBuffer, 1, trainBufferSize, fpTrain);
-    std::string strBuffer = std::string(szBuffer, iNumBytes);
-
-    size_t lim;
-    if(iNumBytes<9*charsize)
-      lim = iNumBytes/9;
-    else
-      lim = charsize;
-    
-    size_t pos =0;//position in 3's counting on 
-    while(pos<lim*3){
-
-      while(pyID.compare(strBuffer.substr(3*pos,3))!=0)
-	pos++;
-      
-      pos++;
-      //      strBuffer.copy(ctemp,3,3*pos);
-      
-      strPY.append(strBuffer.substr(3*pos,3));
- 
-      pos++;
- 
-      //strBuffer.copy(ctemp,3,3*pos);
-      strChar.append(strBuffer.substr(3*pos,3));
-      std::string strtemp = strBuffer.substr(3*(pos),3);
-      Symchar.clear();
-      m_pCHAlphabet->GetSymbols(&Symchar, &strtemp, 0);
-
-      pos++;
-          
-    }
-    Symchar.clear();
-    Sympy.clear();
-    m_pCHAlphabet->GetSymbols(&Symchar, &strChar, 0);
-    m_pAlphabet->GetSymbols(&Sympy, &strPY, 0);      
-    
-    for(int i =0; i<Symchar.size(); i++){
-
-      if((Symchar[i]<7603)&&(Symchar[i]>-1)){//Hack here? to prevent lan model from failing
-	
-	static_cast<CPPMPYLanguageModel *>(m_pLanguageModel)->LearnPYSymbol(trainContext, Sympy[i]); 
-	m_pLanguageModel->LearnSymbol(trainContext, Symchar[i]);
-	
-      }
-
-      // if(Sym.size()>0)
-      
-      numberofchar = numberofchar + Symchar.size();     
-    }       
-    
-  }
-  //std::cout<<"The Length of Training file is  "<<numberofchar<<" bytes/py characters"<<std::endl;  
-}
-
-
-void CTrainer::Reset() {
-  m_pLanguageModel->ReleaseContext(m_Context);
-  m_Context = m_pLanguageModel->CreateEmptyContext();
-}
-
-CTrainer::~CTrainer() {
-  m_pLanguageModel->ReleaseContext(m_Context);
-}
-
-CTrainer *CAlphabetManagerFactory::GetTrainer() {
-  return new CTrainer(m_pLanguageModel, m_pAlphabet, m_pCHAlphabet);
-}
diff --git a/Src/DasherCore/AlphabetManagerFactory.h b/Src/DasherCore/AlphabetManagerFactory.h
index ddf9da7..f9002be 100644
--- a/Src/DasherCore/AlphabetManagerFactory.h
+++ b/Src/DasherCore/AlphabetManagerFactory.h
@@ -3,6 +3,7 @@
 
 #include "AlphabetManager.h"
 #include "LanguageModelling/LanguageModel.h"
+#include "Trainer.h"
 
 class CNodeCreationManager;
 
@@ -11,26 +12,6 @@ namespace Dasher {
   class CAlphIO;
   class CDasherInterfaceBase;
 
-  // TODO: Move this into a new file
-  class CTrainer {
-  public:
-    CTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet, CAlphabet *pCHAlphabet);
-    
-    void Train(const std::vector < symbol > &vSymbols);
-    void TrainMandarin(const std::string &strUserLoc,
-		       const std::string &strSystemLoc);
-	       
-    void Reset();
-    
-    ~CTrainer();
-    
-  private:
-    CLanguageModel *m_pLanguageModel;
-    CLanguageModel::Context m_Context;
-    CAlphabet * m_pAlphabet;
-    CAlphabet * m_pCHAlphabet;
-  };
-
   /// \ingroup Model
   /// @{
   class CAlphabetManagerFactory {
@@ -72,7 +53,7 @@ namespace Dasher {
       return m_iConversionID;
     };
 
-    CTrainer *GetTrainer();
+	CTrainer *GetTrainer() {return m_pTrainer;}
 
   private:
     CAlphabetManager *m_pAlphabetManager;
@@ -81,7 +62,8 @@ namespace Dasher {
     CLanguageModel::Context m_iLearnContext; // Used to add data to model as it is entered
     CAlphabet *m_pAlphabet;        // pointer to the alphabet
     CAlphabet *m_pCHAlphabet;      // pointer to the Mandarin alphabet
-
+	CTrainer *m_pTrainer;
+	  
     int m_iConversionID;
   };
   /// @}  
diff --git a/Src/DasherCore/DasherInterfaceBase.cpp b/Src/DasherCore/DasherInterfaceBase.cpp
index f5c99bc..cc189cd 100644
--- a/Src/DasherCore/DasherInterfaceBase.cpp
+++ b/Src/DasherCore/DasherInterfaceBase.cpp
@@ -113,8 +113,6 @@ CDasherInterfaceBase::CDasherInterfaceBase() {
 
   m_bLastChanged = true;
 
-  //  m_pTrainingHelper = new CTrainingHelper;
-
 #ifndef _WIN32_WCE
   // Global logging object we can use from anywhere
   g_pLogger = new CFileLogger("dasher.log",
@@ -207,7 +205,6 @@ CDasherInterfaceBase::~CDasherInterfaceBase() {
   delete m_ColourIO;
   delete m_AlphIO;
   delete m_pNCManager;
-  //  delete m_pTrainingHelper;
   // Do NOT delete Edit box or Screen. This class did not create them.
 
 #ifndef _WIN32_WCE
diff --git a/Src/DasherCore/DasherInterfaceBase.h b/Src/DasherCore/DasherInterfaceBase.h
index 2c9c9b9..1e63f2f 100644
--- a/Src/DasherCore/DasherInterfaceBase.h
+++ b/Src/DasherCore/DasherInterfaceBase.h
@@ -587,7 +587,6 @@ protected:
   CColourIO *m_ColourIO;
   CNodeCreationManager *m_pNCManager;
   CUserLogBase *m_pUserLog; 
-  //  CTrainingHelper *m_pTrainingHelper;
   /// @}
 
   std::string strTrainfileBuffer;
diff --git a/Src/DasherCore/Makefile.am b/Src/DasherCore/Makefile.am
index 6ae670f..bd16f84 100644
--- a/Src/DasherCore/Makefile.am
+++ b/Src/DasherCore/Makefile.am
@@ -110,6 +110,8 @@ libdashercore_a_SOURCES = \
 		StylusFilter.h \
 		TimeSpan.cpp \
 		TimeSpan.h \
+		Trainer.cpp \
+		Trainer.h \
 		TrainingHelper.cpp \
 		TrainingHelper.h \
 		TwoBoxStartHandler.cpp \
diff --git a/Src/DasherCore/NodeCreationManager.cpp b/Src/DasherCore/NodeCreationManager.cpp
index 2f91bb2..604902b 100644
--- a/Src/DasherCore/NodeCreationManager.cpp
+++ b/Src/DasherCore/NodeCreationManager.cpp
@@ -22,21 +22,14 @@ CNodeCreationManager::CNodeCreationManager(Dasher::CDasherInterfaceBase *pInterf
   m_pLanguageModel = m_pAlphabetManagerFactory->GetLanguageModel();
   m_pAlphabet = m_pAlphabetManagerFactory->GetAlphabet();
 
-  int iConversionID(m_pAlphabetManagerFactory->GetConversionID());
-
   // Train the language model
   CTrainer *pTrainer =  m_pAlphabetManagerFactory->GetTrainer();
 
-  //WZ: Mandarin Dasher Change
-  if((iConversionID==2)&&(pSettingsStore->GetStringParameter(SP_ALPHABET_ID)=="Chinese Super Pin Yin, grouped by Dictionary"))
-    pTrainer->TrainMandarin(GetStringParameter(SP_USER_LOC), GetStringParameter(SP_SYSTEM_LOC));
-  else{
-    //End Mandarin Dasher Change  
-    m_pAlphabet->Train(GetStringParameter(SP_USER_LOC), GetStringParameter(SP_SYSTEM_LOC), pTrainer);
-  }
+  pTrainer->Train(GetStringParameter(SP_USER_LOC), GetStringParameter(SP_SYSTEM_LOC));
+  
   delete pTrainer;
 
-
+  int iConversionID(m_pAlphabetManagerFactory->GetConversionID());
 
 #ifndef _WIN32_WCE
   m_pControlManager = new CControlManager(this);
@@ -194,7 +187,7 @@ CNodeCreationManager::ImportTrainingText(const std::string &strPath) {
     pTrainer = m_pAlphabetManagerFactory->GetTrainer();
 
   if(m_pAlphabet && pTrainer)
-    m_pAlphabet->Train(strPath, pTrainer);
+	pTrainer->Train(strPath);
 
   delete pTrainer;
 }
diff --git a/Src/DasherCore/Trainer.cpp b/Src/DasherCore/Trainer.cpp
new file mode 100644
index 0000000..a2505e2
--- /dev/null
+++ b/Src/DasherCore/Trainer.cpp
@@ -0,0 +1,169 @@
+
+#include "../Common/Common.h"
+
+#include "Trainer.h"
+#include "DasherInterfaceBase.h"
+#include "LanguageModelling/PPMLanguageModel.h"
+#include "LanguageModelling/WordLanguageModel.h"
+#include "LanguageModelling/DictLanguageModel.h"
+#include "LanguageModelling/MixtureLanguageModel.h"
+#include "LanguageModelling/CTWLanguageModel.h"
+#include "LanguageModelling/PPMPYLanguageModel.h"
+
+using namespace Dasher;
+
+// Track memory leaks on Windows to the line that new'd the memory
+#ifdef _WIN32
+#ifdef _DEBUG_MEMLEAKS
+#define DEBUG_NEW new( _NORMAL_BLOCK, THIS_FILE, __LINE__ )
+#define new DEBUG_NEW
+#undef THIS_FILE
+static char THIS_FILE[] = __FILE__;
+#endif
+#endif
+
+CTrainer::CTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet)
+  : CTrainingHelper(pAlphabet), m_pLanguageModel(pLanguageModel) {
+}
+
+void CTrainer::Train(const std::string &strUserLoc, 
+				 const std::string &strSystemLoc) {
+	
+	std::string strTrainingFile = m_pAlphabet->GetTrainingFile();
+	
+	if (strTrainingFile.empty()) {
+#ifdef DEBUG
+		std::cerr << "Trying to load empty training file (location)" << std::endl;
+#endif
+	} else {
+		LoadFile(strUserLoc   + strTrainingFile);
+		LoadFile(strSystemLoc + strTrainingFile);
+	}
+}
+
+
+void 
+CTrainer::Train(const std::string &strPath) {
+	
+	if (strPath.empty()) {
+#ifdef DEBUG
+		std::cerr << "Trying to load empty training file (path)" << std::endl;
+#endif
+	} else {
+		LoadFile(strPath);
+	}
+}
+
+void CTrainer::Train(const std::vector<symbol> &vSymbols) {
+  CLanguageModel::Context sContext = m_pLanguageModel->CreateEmptyContext();
+
+  for(std::vector<symbol>::const_iterator it(vSymbols.begin()); it != vSymbols.end(); ++it) {
+      m_pLanguageModel->LearnSymbol(sContext, *it);
+  }
+  m_pLanguageModel->ReleaseContext(sContext);
+}
+
+CMandarinTrainer::CMandarinTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet, CAlphabet *pCHAlphabet)
+: CTrainer(pLanguageModel, pAlphabet), m_pCHAlphabet(pCHAlphabet) {
+}
+
+//CMandarinTrainer::Train is used to train Mandarin Dasher's language model, CPPMPYLanguageModel
+//Mandarin training differs from normal PPM training in that it uses two separate alphabets and trains on pinyin-character pairs. Even so, the implementation here may seem poorly structured, and it may need revising later, particularly to make it robust against non-Unicode characters
+//The training of Mandarin Dasher may evolve along several possible paths: 1. Include punctuation (more work); 2. User-defined training files (not sure how); 3. Learning as one types (more work)
+//As Manager is produced, training happens in AlphabetManagerFactory
+
+void CMandarinTrainer::Train(const std::string &strUserLoc, const std::string &strSystemLoc){
+
+  //TrainMandarin takes in the Super Pin Yin Alphabet, and uses the Mandarin Character alphabet stored in private AlphabetManagerFactory
+
+  std::string strTrainingFile = m_pAlphabet->GetTrainingFile();
+
+  std::string strUserPath = strUserLoc + strTrainingFile;
+  std::string strSystemPath = strSystemLoc + strTrainingFile;
+
+  FILE * fpUser = fopen (strUserPath.c_str(), "rb");
+  FILE * fpSystem = fopen(strSystemPath.c_str(), "rb");
+  FILE * fpTrain = fpSystem;
+  
+  if(!fpTrain) {
+
+    fpTrain = fpUser;
+    if(!fpTrain){
+      printf("Mandarin Training File: cannot open file or incorrect directory\n");
+    return;
+    }
+  }
+  unsigned numberofchar = 0;
+
+
+  size_t charsize = 1024;
+    
+  size_t trainBufferSize = 3*charsize*3;
+  char szBuffer[trainBufferSize];
+    
+  std::string strChar;
+  std::string strPY;
+  //char ctemp[4];
+  CLanguageModel::Context trainContext = m_pLanguageModel->CreateEmptyContext();
+  std::string pyID = "ã??";
+  std::vector<symbol> Symchar;
+  std::vector<symbol> Sympy;
+
+  while(!feof(fpTrain)){
+    
+    strPY.clear();
+    strChar.clear();
+ 
+    size_t iNumBytes = fread(szBuffer, 1, trainBufferSize, fpTrain);
+    std::string strBuffer = std::string(szBuffer, iNumBytes);
+
+    size_t lim;
+    if(iNumBytes<9*charsize)
+      lim = iNumBytes/9;
+    else
+      lim = charsize;
+    
+    size_t pos =0;//position in 3's counting on 
+    while(pos<lim*3){
+
+      while(pyID.compare(strBuffer.substr(3*pos,3))!=0)
+	pos++;
+      
+      pos++;
+      //      strBuffer.copy(ctemp,3,3*pos);
+      
+      strPY.append(strBuffer.substr(3*pos,3));
+ 
+      pos++;
+ 
+      //strBuffer.copy(ctemp,3,3*pos);
+      strChar.append(strBuffer.substr(3*pos,3));
+      std::string strtemp = strBuffer.substr(3*(pos),3);
+      Symchar.clear();
+      m_pCHAlphabet->GetSymbols(&Symchar, &strtemp, 0);
+
+      pos++;
+          
+    }
+    Symchar.clear();
+    Sympy.clear();
+    m_pCHAlphabet->GetSymbols(&Symchar, &strChar, 0);
+    m_pAlphabet->GetSymbols(&Sympy, &strPY, 0);      
+    
+    for(int i =0; i<Symchar.size(); i++){
+
+      if((Symchar[i]<7603)&&(Symchar[i]>-1)){//Hack here? to prevent lan model from failing
+	
+	static_cast<CPPMPYLanguageModel *>(m_pLanguageModel)->LearnPYSymbol(trainContext, Sympy[i]); 
+	m_pLanguageModel->LearnSymbol(trainContext, Symchar[i]);
+	
+      }
+
+      // if(Sym.size()>0)
+      
+      numberofchar = numberofchar + Symchar.size();     
+    }       
+    
+  }
+  //std::cout<<"The Length of Training file is  "<<numberofchar<<" bytes/py characters"<<std::endl;  
+}
\ No newline at end of file
diff --git a/Src/DasherCore/Trainer.h b/Src/DasherCore/Trainer.h
new file mode 100644
index 0000000..fbefe6b
--- /dev/null
+++ b/Src/DasherCore/Trainer.h
@@ -0,0 +1,35 @@
+#ifndef __trainer_h__
+#define __trainer_h__
+
+#include "LanguageModelling/LanguageModel.h"
+#include "TrainingHelper.h"
+
+namespace Dasher {
+  class CAlphabet;
+  class CAlphIO;
+  class CDasherInterfaceBase;
+	
+  class CTrainer : public CTrainingHelper {
+  public:
+    CTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet);
+    void Train(const std::string &strUserLoc, const std::string &strSystemLoc);
+	void Train(const std::string &strPath);
+
+  protected:
+	virtual void Train(const std::vector<symbol> &vSymbols);
+	CLanguageModel *m_pLanguageModel;
+  };
+	
+  class CMandarinTrainer : public CTrainer {
+  public:
+    CMandarinTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet, CAlphabet *pCHAlphabet);
+    
+    virtual void Train(const std::string &strUserLoc, const std::string &strSystemLoc);
+    
+  private:
+    CAlphabet * m_pCHAlphabet;
+  };
+
+}
+
+#endif
diff --git a/Src/DasherCore/TrainingHelper.cpp b/Src/DasherCore/TrainingHelper.cpp
index 325de46..4dd2719 100644
--- a/Src/DasherCore/TrainingHelper.cpp
+++ b/Src/DasherCore/TrainingHelper.cpp
@@ -35,10 +35,11 @@ static void XML_StartElement(void *pUserData, const XML_Char *szName, const XML_
 static void XML_EndElement(void *pUserData, const XML_Char *szName);
 static void XML_CharacterData(void *pUserData, const XML_Char *szS, int iLen);
 
+Dasher::CTrainingHelper::CTrainingHelper(const Dasher::CAlphabet *pAlphabet) : m_pAlphabet(pAlphabet) {
+}
+
 void 
-Dasher::CTrainingHelper::LoadFile(const std::string &strFileName, 
-				  Dasher::CTrainer *pTrainer, 
-				  const Dasher::CAlphabet *pAlphabet) {
+Dasher::CTrainingHelper::LoadFile(const std::string &strFileName) {
    if(strFileName == "")
     return;
 
@@ -54,17 +55,15 @@ Dasher::CTrainingHelper::LoadFile(const std::string &strFileName,
    fclose(pInputFile);
 
    if(!strcmp(szTestBuffer, "<?xml")) {
-     LoadXML(strFileName, pTrainer, pAlphabet);
+	 LoadXML(strFileName);
    }
    else {
-     LoadPlain(strFileName, pTrainer, pAlphabet);
+	LoadPlain(strFileName);
    }
 }
 
 void
-Dasher::CTrainingHelper::LoadPlain(const std::string &strFileName, 
-			   Dasher::CTrainer *pTrainer, 
-			   const Dasher::CAlphabet *pAlphabet) {
+Dasher::CTrainingHelper::LoadPlain(const std::string &strFileName) {
   
   std::ifstream in(strFileName.c_str(), std::ios::binary);
   if (in.fail())
@@ -76,20 +75,16 @@ Dasher::CTrainingHelper::LoadPlain(const std::string &strFileName,
 
   std::vector<Dasher::symbol> vSymbols;
   vSymbols.clear();
-  pAlphabet->GetSymbols(vSymbols, in);
-  pTrainer->Train(vSymbols);
+  m_pAlphabet->GetSymbols(vSymbols, in);
+  Train(vSymbols);
 
   in.close();
 }
 
 void 
-Dasher::CTrainingHelper::LoadXML(const std::string &strFileName, 
-			 Dasher::CTrainer *pTrainer, 
-			 const Dasher::CAlphabet *pAlphabet) {
+Dasher::CTrainingHelper::LoadXML(const std::string &strFileName) {
 
   m_bInSegment = false;
-  m_pAlphabet = pAlphabet;
-  m_pTrainer = pTrainer;
 
   FILE *pInput;
   if((pInput = fopen(strFileName.c_str(), "r")) == (FILE *) 0) {
@@ -134,11 +129,9 @@ Dasher::CTrainingHelper::HandleStartElement(const XML_Char *szName,
 void 
 Dasher::CTrainingHelper::HandleEndElement(const XML_Char *szName) {
   if(!strcmp(szName, "segment")) {
-    m_pTrainer->Reset();
-
     std::vector<Dasher::symbol> vSymbols;
     m_pAlphabet->GetSymbols(&vSymbols, &m_strCurrentText, false);
-    m_pTrainer->Train(vSymbols);
+    Train(vSymbols);
     
     m_bInSegment = false;
   }
diff --git a/Src/DasherCore/TrainingHelper.h b/Src/DasherCore/TrainingHelper.h
index f020b28..7ec7594 100644
--- a/Src/DasherCore/TrainingHelper.h
+++ b/Src/DasherCore/TrainingHelper.h
@@ -22,8 +22,6 @@
 #define __TrainingHelper_h__
 
 #include "Alphabet/Alphabet.h"
-#include "AlphabetManagerFactory.h"
-#include "TrainingHelper.h"
 
 #include <string>
 
@@ -31,10 +29,8 @@ namespace Dasher {
 
   class CTrainingHelper {
   public:
-    void LoadFile(const std::string &strFileName, 
-		  Dasher::CTrainer *pTrainer, 
-		  const Dasher::CAlphabet *pAlphabet);
-    
+	CTrainingHelper(const CAlphabet *m_pAlphabet);
+	  
     void HandleStartElement(const XML_Char *szName, 
 			    const XML_Char **pAtts);
   
@@ -42,20 +38,21 @@ namespace Dasher {
     
     void HandleCData(const XML_Char *szS, 
 		     int iLen);
-    
-  private:
-    void LoadPlain(const std::string &strFileName, 
-		   Dasher::CTrainer *pTrainer, 
-		   const Dasher::CAlphabet *pAlphabet);
-    
-    void LoadXML(const std::string &strFileName, 
-		 Dasher::CTrainer *pTrainer,
-		 const Dasher::CAlphabet *pAlphabet);
-    
+
+  protected:
+	void LoadFile(const std::string &strFileName);
+
+	const Dasher::CAlphabet *m_pAlphabet;
+
+	virtual void Train(const std::vector<symbol> &symbols)=0;
+	  
+  private:  
+	void LoadPlain(const std::string &strFileName);
+	  
+	void LoadXML(const std::string &strFileName);
+	  
     bool m_bInSegment;
     std::string m_strCurrentText;
-    Dasher::CTrainer *m_pTrainer;
-    const Dasher::CAlphabet *m_pAlphabet;
   };
 };
 
diff --git a/Src/MacOSX/Dasher.xcodeproj/project.pbxproj b/Src/MacOSX/Dasher.xcodeproj/project.pbxproj
index 3a8b239..ed877c4 100755
--- a/Src/MacOSX/Dasher.xcodeproj/project.pbxproj
+++ b/Src/MacOSX/Dasher.xcodeproj/project.pbxproj
@@ -356,6 +356,8 @@
 		19F8C7FA0C858E9900276B4F /* TrainingHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 19F8C7F80C858E9900276B4F /* TrainingHelper.h */; };
 		3306E0220FFD1CE60017324C /* PPMPYLanguageModel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3306E0200FFD1CE60017324C /* PPMPYLanguageModel.cpp */; };
 		3306E0230FFD1CE60017324C /* PPMPYLanguageModel.h in Headers */ = {isa = PBXBuildFile; fileRef = 3306E0210FFD1CE60017324C /* PPMPYLanguageModel.h */; };
+		3306E1F70FFE6CAD0017324C /* Trainer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3306E1F50FFE6CAD0017324C /* Trainer.cpp */; };
+		3306E1F80FFE6CAD0017324C /* Trainer.h in Headers */ = {isa = PBXBuildFile; fileRef = 3306E1F60FFE6CAD0017324C /* Trainer.h */; };
 		335DB0FB100B332C006DB155 /* alphabet.spyDict.xml in Resources */ = {isa = PBXBuildFile; fileRef = 335DB0FA100B332C006DB155 /* alphabet.spyDict.xml */; };
 		335DB101100B3358006DB155 /* training_spyDict.txt in Resources */ = {isa = PBXBuildFile; fileRef = 335DB100100B3358006DB155 /* training_spyDict.txt */; };
 		335DB122100B3606006DB155 /* PinYinConversionHelper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1948BE760C226CFD001DFA32 /* PinYinConversionHelper.cpp */; };
@@ -772,6 +774,8 @@
 		29B97325FDCFA39411CA2CEA /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = /System/Library/Frameworks/Foundation.framework; sourceTree = "<absolute>"; };
 		3306E0200FFD1CE60017324C /* PPMPYLanguageModel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PPMPYLanguageModel.cpp; sourceTree = "<group>"; };
 		3306E0210FFD1CE60017324C /* PPMPYLanguageModel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PPMPYLanguageModel.h; sourceTree = "<group>"; };
+		3306E1F50FFE6CAD0017324C /* Trainer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Trainer.cpp; sourceTree = "<group>"; };
+		3306E1F60FFE6CAD0017324C /* Trainer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Trainer.h; sourceTree = "<group>"; };
 		335DB0FA100B332C006DB155 /* alphabet.spyDict.xml */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xml; path = alphabet.spyDict.xml; sourceTree = "<group>"; };
 		335DB100100B3358006DB155 /* training_spyDict.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = training_spyDict.txt; sourceTree = "<group>"; };
 		33ABFEC40FC379EA00EA2BA5 /* ButtonMultiPress.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ButtonMultiPress.cpp; sourceTree = "<group>"; };
@@ -913,6 +917,8 @@
 		1948BDF40C226CFC001DFA32 /* DasherCore */ = {
 			isa = PBXGroup;
 			children = (
+				3306E1F50FFE6CAD0017324C /* Trainer.cpp */,
+				3306E1F60FFE6CAD0017324C /* Trainer.h */,
 				33FC93420FEFA2FB00A9F08D /* FrameRate.cpp */,
 				33FC93370FEFA2C900A9F08D /* TwoPushDynamicFilter.cpp */,
 				33FC93380FEFA2C900A9F08D /* TwoPushDynamicFilter.h */,
@@ -1500,6 +1506,7 @@
 				33ABFEC70FC379EA00EA2BA5 /* ButtonMultiPress.h in Headers */,
 				33FC933A0FEFA2C900A9F08D /* TwoPushDynamicFilter.h in Headers */,
 				3306E0230FFD1CE60017324C /* PPMPYLanguageModel.h in Headers */,
+				3306E1F80FFE6CAD0017324C /* Trainer.h in Headers */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -1848,6 +1855,7 @@
 				33FC93430FEFA2FB00A9F08D /* FrameRate.cpp in Sources */,
 				3306E0220FFD1CE60017324C /* PPMPYLanguageModel.cpp in Sources */,
 				335DB122100B3606006DB155 /* PinYinConversionHelper.cpp in Sources */,
+				3306E1F70FFE6CAD0017324C /* Trainer.cpp in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]