[dasher] Refactored training code, moving training code from CAlphabet into CTrainer
- From: Patrick Welche <pwelche src gnome org>
- To: svn-commits-list gnome org
- Cc:
- Subject: [dasher] Refactored training code, moving training code from CAlphabet into CTrainer
- Date: Fri, 7 Aug 2009 21:01:57 +0000 (UTC)
commit fa5067058330e0809998cf8c85c920bddb0b234c
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date: Fri Aug 7 15:02:40 2009 +0200
Refactored the training code, moving it from CAlphabet into
CTrainer, which is now a subclass of CTrainingHelper and lives in its
own file. The Mandarin training code is thus separated out into its own
subclass, CMandarinTrainer. (3-Jul-2009)
ChangeLog | 1 +
Src/DasherCore/Alphabet/Alphabet.cpp | 39 ------
Src/DasherCore/Alphabet/Alphabet.h | 19 +---
Src/DasherCore/AlphabetManagerFactory.cpp | 132 +--------------------
Src/DasherCore/AlphabetManagerFactory.h | 26 +----
Src/DasherCore/DasherInterfaceBase.cpp | 3 -
Src/DasherCore/DasherInterfaceBase.h | 1 -
Src/DasherCore/Makefile.am | 2 +
Src/DasherCore/NodeCreationManager.cpp | 15 +--
Src/DasherCore/Trainer.cpp | 169 +++++++++++++++++++++++++++
Src/DasherCore/Trainer.h | 35 ++++++
Src/DasherCore/TrainingHelper.cpp | 29 ++---
Src/DasherCore/TrainingHelper.h | 33 +++---
Src/MacOSX/Dasher.xcodeproj/project.pbxproj | 8 ++
14 files changed, 252 insertions(+), 260 deletions(-)
---
diff --git a/ChangeLog b/ChangeLog
index 7e20c5a..bf05023 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -2,6 +2,7 @@
* MacOSX build now includes Mandarin Dasher training texts,
PinYinConversionHelper.
+ * Move training code from CAlphabet into CTrainer.
2009-08-06 Patrick Welche <prlw1 cam ac uk>
diff --git a/Src/DasherCore/Alphabet/Alphabet.cpp b/Src/DasherCore/Alphabet/Alphabet.cpp
index 7490104..40deb9a 100644
--- a/Src/DasherCore/Alphabet/Alphabet.cpp
+++ b/Src/DasherCore/Alphabet/Alphabet.cpp
@@ -50,8 +50,6 @@ CAlphabet::CAlphabet()
m_Display.push_back("");
m_Colours.push_back(-1);
m_Foreground.push_back("");
-
- m_pTrainingHelper = new CTrainingHelper;
}
/////////////////////////////////////////////////////////////////////////////
@@ -66,8 +64,6 @@ CAlphabet::CAlphabet(const CAlphIO::AlphInfo &AlphInfo)
m_StartConversionSymbol = -1;
m_EndConversionSymbol = -1;
- m_pTrainingHelper = NULL;
-
m_strDefaultContext = AlphInfo.m_strDefaultContext;
// Set miscellaneous options
@@ -122,10 +118,6 @@ CAlphabet::CAlphabet(const CAlphIO::AlphInfo &AlphInfo)
#endif
}
-CAlphabet::~CAlphabet() {
- delete m_pTrainingHelper;
-}
-
/////////////////////////////////////////////////////////////////////////////
CAlphabet::utf8_length::utf8_length()
@@ -375,37 +367,6 @@ int CAlphabet::GetTextColour(symbol Symbol) {
}
}
-
-void
-CAlphabet::Train(const std::string &strUserLoc,
- const std::string &strSystemLoc,
- CTrainer *pTrainer) {
-
- std::string strTrainingFile = GetTrainingFile();
-
- if (strTrainingFile.empty()) {
-#ifdef DEBUG
- std::cerr << "Trying to load empty training file (location)" << std::endl;
-#endif
- } else {
- m_pTrainingHelper->LoadFile(strUserLoc + strTrainingFile, pTrainer, this);
- m_pTrainingHelper->LoadFile(strSystemLoc + strTrainingFile, pTrainer, this);
- }
-}
-
-void
-CAlphabet::Train(const std::string &strPath,
- CTrainer *pTrainer) {
-
- if (strPath.empty()) {
-#ifdef DEBUG
- std::cerr << "Trying to load empty training file (path)" << std::endl;
-#endif
- } else {
- m_pTrainingHelper->LoadFile(strPath, pTrainer, this);
- }
-}
-
int
CAlphabet::GetColour(symbol i, int iPhase) const {
int iColour = m_Colours[i];
diff --git a/Src/DasherCore/Alphabet/Alphabet.h b/Src/DasherCore/Alphabet/Alphabet.h
index 7a8b885..32942a6 100644
--- a/Src/DasherCore/Alphabet/Alphabet.h
+++ b/Src/DasherCore/Alphabet/Alphabet.h
@@ -31,10 +31,6 @@
#include <vector>
namespace Dasher {
- // Forward declarations
- class CTrainer;
- class CTrainingHelper;
-
///
/// \defgroup Alphabet Alphabet information
/// @{
@@ -44,9 +40,6 @@ namespace Dasher {
CAlphabet();
CAlphabet(const CAlphIO::AlphInfo & AlphInfo);
- ~CAlphabet();
-
-
// Return size of alphabet, including control symbols
int GetNumberSymbols() const {
return m_Characters.size();
@@ -69,7 +62,7 @@ namespace Dasher {
return m_DefaultEncoding;
}
- std::string & GetTrainingFile() {
+ const std::string & GetTrainingFile() const {
return m_TrainingFile;
}
std::string GetGameModeFile() {
@@ -167,13 +160,6 @@ namespace Dasher {
}
SGroupInfo *m_pBaseGroup;
-
- void Train(const std::string &strUserLoc,
- const std::string &strSystemLoc,
- CTrainer *pTrainer);
-
- void Train(const std::string &strPath,
- CTrainer *pTrainer);
private:
@@ -207,9 +193,6 @@ namespace Dasher {
std::vector < std::string > m_Foreground; // stores the colour of the character foreground
// ----
- CTrainingHelper *m_pTrainingHelper;
-
-
SGroupInfo *pFirstGroup;
alphabet_map TextMap;
diff --git a/Src/DasherCore/AlphabetManagerFactory.cpp b/Src/DasherCore/AlphabetManagerFactory.cpp
index f96c257..83deadf 100644
--- a/Src/DasherCore/AlphabetManagerFactory.cpp
+++ b/Src/DasherCore/AlphabetManagerFactory.cpp
@@ -70,7 +70,7 @@ CAlphabetManagerFactory::CAlphabetManagerFactory(CDasherInterfaceBase *pInterfac
chalphabet.SetAlphabetPointer(m_pCHAlphabet);
//std::cout<<"CHALphabet size "<<chalphabet.GetSize(); [7603]
m_pLanguageModel = new CPPMPYLanguageModel(pEventHandler, pSettingsStore, chalphabet, alphabet);
-
+ m_pTrainer = new CMandarinTrainer(m_pLanguageModel, m_pAlphabet, m_pCHAlphabet);
std::cout<<"Setting PPMPY model"<<std::endl;
}
else{
@@ -97,6 +97,7 @@ CAlphabetManagerFactory::CAlphabetManagerFactory(CDasherInterfaceBase *pInterfac
m_pLanguageModel = new CPPMLanguageModel(pEventHandler, pSettingsStore, alphabet);
break;
}
+ m_pTrainer = new CTrainer(m_pLanguageModel, m_pAlphabet);
}
m_iLearnContext = m_pLanguageModel->CreateEmptyContext();
@@ -116,132 +117,3 @@ CAlphabetManagerFactory::~CAlphabetManagerFactory() {
CDasherNode *CAlphabetManagerFactory::GetRoot(CDasherNode *pParent, int iLower, int iUpper, void *pUserData) {
return m_pAlphabetManager->GetRoot(pParent, iLower, iUpper, pUserData);
}
-
-CTrainer::CTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet, CAlphabet *pCHAlphabet) {
- m_pLanguageModel = pLanguageModel;
- m_pAlphabet = pAlphabet;
- m_pCHAlphabet = pCHAlphabet;
- m_Context = m_pLanguageModel->CreateEmptyContext();
-}
-
-void CTrainer::Train(const std::vector<symbol> &vSymbols) {
-
- for(std::vector<symbol>::const_iterator it(vSymbols.begin()); it != vSymbols.end(); ++it) {
- m_pLanguageModel->LearnSymbol(m_Context, *it);
- }
-}
-
-//TrainMandarin is used to train Mandarin Dasher: PPMPYLanguageModel
-//Mandarin training is distinct from normal PPM training in that it uses two separate alphabets, and trains with py-character pairs. Despite so, implementation here may seem out of structure, and it could be necessary to revise later, particularly on robustness to deal with non-unicode chars
-//The training of Mandarin Dasher may evolve in to possible paths: 1.Include punctuation (more work); 2.User defined training files (not sure how); 3.Learning as one types (more work)
-//As Manager is produced, training happens in AlphabetManagerFactory
-
-void CTrainer::TrainMandarin(const std::string &strUserLoc, const std::string &strSystemLoc){
-
- //TrainMandarin takes in the Super Pin Yin Alphabet, and uses the Mandarin Character alphabet stored in private AlphabetManagerFactory
-
- std::string strTrainingFile = m_pAlphabet->GetTrainingFile();
-
- std::string strUserPath = strUserLoc + strTrainingFile;
- std::string strSystemPath = strSystemLoc + strTrainingFile;
-
- FILE * fpUser = fopen (strUserPath.c_str(), "rb");
- FILE * fpSystem = fopen(strSystemPath.c_str(), "rb");
- FILE * fpTrain = fpSystem;
-
- if(!fpTrain) {
-
- fpTrain = fpUser;
- if(!fpTrain){
- printf("Mandarin Training File: cannot open file or incorrect directory\n");
- return;
- }
- }
- unsigned numberofchar = 0;
-
-
- size_t charsize = 1024;
-
- size_t trainBufferSize = 3*charsize*3;
- char szBuffer[trainBufferSize];
-
- std::string strChar;
- std::string strPY;
- char ctemp[4];
- CLanguageModel::Context trainContext = m_pLanguageModel->CreateEmptyContext();
- std::string pyID = "ã??";
- std::vector<symbol> Symchar;
- std::vector<symbol> Sympy;
-
- while(!feof(fpTrain)){
-
- strPY.clear();
- strChar.clear();
-
- size_t iNumBytes = fread(szBuffer, 1, trainBufferSize, fpTrain);
- std::string strBuffer = std::string(szBuffer, iNumBytes);
-
- size_t lim;
- if(iNumBytes<9*charsize)
- lim = iNumBytes/9;
- else
- lim = charsize;
-
- size_t pos =0;//position in 3's counting on
- while(pos<lim*3){
-
- while(pyID.compare(strBuffer.substr(3*pos,3))!=0)
- pos++;
-
- pos++;
- // strBuffer.copy(ctemp,3,3*pos);
-
- strPY.append(strBuffer.substr(3*pos,3));
-
- pos++;
-
- //strBuffer.copy(ctemp,3,3*pos);
- strChar.append(strBuffer.substr(3*pos,3));
- std::string strtemp = strBuffer.substr(3*(pos),3);
- Symchar.clear();
- m_pCHAlphabet->GetSymbols(&Symchar, &strtemp, 0);
-
- pos++;
-
- }
- Symchar.clear();
- Sympy.clear();
- m_pCHAlphabet->GetSymbols(&Symchar, &strChar, 0);
- m_pAlphabet->GetSymbols(&Sympy, &strPY, 0);
-
- for(int i =0; i<Symchar.size(); i++){
-
- if((Symchar[i]<7603)&&(Symchar[i]>-1)){//Hack here? to prevent lan model from failing
-
- static_cast<CPPMPYLanguageModel *>(m_pLanguageModel)->LearnPYSymbol(trainContext, Sympy[i]);
- m_pLanguageModel->LearnSymbol(trainContext, Symchar[i]);
-
- }
-
- // if(Sym.size()>0)
-
- numberofchar = numberofchar + Symchar.size();
- }
-
- }
- //std::cout<<"The Length of Training file is "<<numberofchar<<" bytes/py characters"<<std::endl;
-}
-
-
-void CTrainer::Reset() {
- m_pLanguageModel->ReleaseContext(m_Context);
- m_Context = m_pLanguageModel->CreateEmptyContext();
-}
-
-CTrainer::~CTrainer() {
- m_pLanguageModel->ReleaseContext(m_Context);
-}
-
-CTrainer *CAlphabetManagerFactory::GetTrainer() {
- return new CTrainer(m_pLanguageModel, m_pAlphabet, m_pCHAlphabet);
-}
diff --git a/Src/DasherCore/AlphabetManagerFactory.h b/Src/DasherCore/AlphabetManagerFactory.h
index ddf9da7..f9002be 100644
--- a/Src/DasherCore/AlphabetManagerFactory.h
+++ b/Src/DasherCore/AlphabetManagerFactory.h
@@ -3,6 +3,7 @@
#include "AlphabetManager.h"
#include "LanguageModelling/LanguageModel.h"
+#include "Trainer.h"
class CNodeCreationManager;
@@ -11,26 +12,6 @@ namespace Dasher {
class CAlphIO;
class CDasherInterfaceBase;
- // TODO: Move this into a new file
- class CTrainer {
- public:
- CTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet, CAlphabet *pCHAlphabet);
-
- void Train(const std::vector < symbol > &vSymbols);
- void TrainMandarin(const std::string &strUserLoc,
- const std::string &strSystemLoc);
-
- void Reset();
-
- ~CTrainer();
-
- private:
- CLanguageModel *m_pLanguageModel;
- CLanguageModel::Context m_Context;
- CAlphabet * m_pAlphabet;
- CAlphabet * m_pCHAlphabet;
- };
-
/// \ingroup Model
/// @{
class CAlphabetManagerFactory {
@@ -72,7 +53,7 @@ namespace Dasher {
return m_iConversionID;
};
- CTrainer *GetTrainer();
+ CTrainer *GetTrainer() {return m_pTrainer;}
private:
CAlphabetManager *m_pAlphabetManager;
@@ -81,7 +62,8 @@ namespace Dasher {
CLanguageModel::Context m_iLearnContext; // Used to add data to model as it is entered
CAlphabet *m_pAlphabet; // pointer to the alphabet
CAlphabet *m_pCHAlphabet; // pointer to the Mandarin alphabet
-
+ CTrainer *m_pTrainer;
+
int m_iConversionID;
};
/// @}
diff --git a/Src/DasherCore/DasherInterfaceBase.cpp b/Src/DasherCore/DasherInterfaceBase.cpp
index f5c99bc..cc189cd 100644
--- a/Src/DasherCore/DasherInterfaceBase.cpp
+++ b/Src/DasherCore/DasherInterfaceBase.cpp
@@ -113,8 +113,6 @@ CDasherInterfaceBase::CDasherInterfaceBase() {
m_bLastChanged = true;
- // m_pTrainingHelper = new CTrainingHelper;
-
#ifndef _WIN32_WCE
// Global logging object we can use from anywhere
g_pLogger = new CFileLogger("dasher.log",
@@ -207,7 +205,6 @@ CDasherInterfaceBase::~CDasherInterfaceBase() {
delete m_ColourIO;
delete m_AlphIO;
delete m_pNCManager;
- // delete m_pTrainingHelper;
// Do NOT delete Edit box or Screen. This class did not create them.
#ifndef _WIN32_WCE
diff --git a/Src/DasherCore/DasherInterfaceBase.h b/Src/DasherCore/DasherInterfaceBase.h
index 2c9c9b9..1e63f2f 100644
--- a/Src/DasherCore/DasherInterfaceBase.h
+++ b/Src/DasherCore/DasherInterfaceBase.h
@@ -587,7 +587,6 @@ protected:
CColourIO *m_ColourIO;
CNodeCreationManager *m_pNCManager;
CUserLogBase *m_pUserLog;
- // CTrainingHelper *m_pTrainingHelper;
/// @}
std::string strTrainfileBuffer;
diff --git a/Src/DasherCore/Makefile.am b/Src/DasherCore/Makefile.am
index 6ae670f..bd16f84 100644
--- a/Src/DasherCore/Makefile.am
+++ b/Src/DasherCore/Makefile.am
@@ -110,6 +110,8 @@ libdashercore_a_SOURCES = \
StylusFilter.h \
TimeSpan.cpp \
TimeSpan.h \
+ Trainer.cpp \
+ Trainer.h \
TrainingHelper.cpp \
TrainingHelper.h \
TwoBoxStartHandler.cpp \
diff --git a/Src/DasherCore/NodeCreationManager.cpp b/Src/DasherCore/NodeCreationManager.cpp
index 2f91bb2..604902b 100644
--- a/Src/DasherCore/NodeCreationManager.cpp
+++ b/Src/DasherCore/NodeCreationManager.cpp
@@ -22,21 +22,14 @@ CNodeCreationManager::CNodeCreationManager(Dasher::CDasherInterfaceBase *pInterf
m_pLanguageModel = m_pAlphabetManagerFactory->GetLanguageModel();
m_pAlphabet = m_pAlphabetManagerFactory->GetAlphabet();
- int iConversionID(m_pAlphabetManagerFactory->GetConversionID());
-
// Train the language model
CTrainer *pTrainer = m_pAlphabetManagerFactory->GetTrainer();
- //WZ: Mandarin Dasher Change
- if((iConversionID==2)&&(pSettingsStore->GetStringParameter(SP_ALPHABET_ID)=="Chinese Super Pin Yin, grouped by Dictionary"))
- pTrainer->TrainMandarin(GetStringParameter(SP_USER_LOC), GetStringParameter(SP_SYSTEM_LOC));
- else{
- //End Mandarin Dasher Change
- m_pAlphabet->Train(GetStringParameter(SP_USER_LOC), GetStringParameter(SP_SYSTEM_LOC), pTrainer);
- }
+ pTrainer->Train(GetStringParameter(SP_USER_LOC), GetStringParameter(SP_SYSTEM_LOC));
+
delete pTrainer;
-
+ int iConversionID(m_pAlphabetManagerFactory->GetConversionID());
#ifndef _WIN32_WCE
m_pControlManager = new CControlManager(this);
@@ -194,7 +187,7 @@ CNodeCreationManager::ImportTrainingText(const std::string &strPath) {
pTrainer = m_pAlphabetManagerFactory->GetTrainer();
if(m_pAlphabet && pTrainer)
- m_pAlphabet->Train(strPath, pTrainer);
+ pTrainer->Train(strPath);
delete pTrainer;
}
diff --git a/Src/DasherCore/Trainer.cpp b/Src/DasherCore/Trainer.cpp
new file mode 100644
index 0000000..a2505e2
--- /dev/null
+++ b/Src/DasherCore/Trainer.cpp
@@ -0,0 +1,169 @@
+
+#include "../Common/Common.h"
+
+#include "Trainer.h"
+#include "DasherInterfaceBase.h"
+#include "LanguageModelling/PPMLanguageModel.h"
+#include "LanguageModelling/WordLanguageModel.h"
+#include "LanguageModelling/DictLanguageModel.h"
+#include "LanguageModelling/MixtureLanguageModel.h"
+#include "LanguageModelling/CTWLanguageModel.h"
+#include "LanguageModelling/PPMPYLanguageModel.h"
+
+using namespace Dasher;
+
+// Track memory leaks on Windows to the line that new'd the memory
+#ifdef _WIN32
+#ifdef _DEBUG_MEMLEAKS
+#define DEBUG_NEW new( _NORMAL_BLOCK, THIS_FILE, __LINE__ )
+#define new DEBUG_NEW
+#undef THIS_FILE
+static char THIS_FILE[] = __FILE__;
+#endif
+#endif
+
+CTrainer::CTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet)
+ : CTrainingHelper(pAlphabet), m_pLanguageModel(pLanguageModel) {
+}
+
+void CTrainer::Train(const std::string &strUserLoc,
+ const std::string &strSystemLoc) {
+
+ std::string strTrainingFile = m_pAlphabet->GetTrainingFile();
+
+ if (strTrainingFile.empty()) {
+#ifdef DEBUG
+ std::cerr << "Trying to load empty training file (location)" << std::endl;
+#endif
+ } else {
+ LoadFile(strUserLoc + strTrainingFile);
+ LoadFile(strSystemLoc + strTrainingFile);
+ }
+}
+
+
+void
+CTrainer::Train(const std::string &strPath) {
+
+ if (strPath.empty()) {
+#ifdef DEBUG
+ std::cerr << "Trying to load empty training file (path)" << std::endl;
+#endif
+ } else {
+ LoadFile(strPath);
+ }
+}
+
+void CTrainer::Train(const std::vector<symbol> &vSymbols) {
+ CLanguageModel::Context sContext = m_pLanguageModel->CreateEmptyContext();
+
+ for(std::vector<symbol>::const_iterator it(vSymbols.begin()); it != vSymbols.end(); ++it) {
+ m_pLanguageModel->LearnSymbol(sContext, *it);
+ }
+ m_pLanguageModel->ReleaseContext(sContext);
+}
+
+CMandarinTrainer::CMandarinTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet, CAlphabet *pCHAlphabet)
+: CTrainer(pLanguageModel, pAlphabet), m_pCHAlphabet(pCHAlphabet) {
+}
+
+//TrainMandarin is used to train Mandarin Dasher: PPMPYLanguageModel
+//Mandarin training is distinct from normal PPM training in that it uses two separate alphabets, and trains with py-character pairs. Despite so, implementation here may seem out of structure, and it could be necessary to revise later, particularly on robustness to deal with non-unicode chars
+//The training of Mandarin Dasher may evolve in to possible paths: 1.Include punctuation (more work); 2.User defined training files (not sure how); 3.Learning as one types (more work)
+//As Manager is produced, training happens in AlphabetManagerFactory
+
+void CMandarinTrainer::Train(const std::string &strUserLoc, const std::string &strSystemLoc){
+
+ //TrainMandarin takes in the Super Pin Yin Alphabet, and uses the Mandarin Character alphabet stored in private AlphabetManagerFactory
+
+ std::string strTrainingFile = m_pAlphabet->GetTrainingFile();
+
+ std::string strUserPath = strUserLoc + strTrainingFile;
+ std::string strSystemPath = strSystemLoc + strTrainingFile;
+
+ FILE * fpUser = fopen (strUserPath.c_str(), "rb");
+ FILE * fpSystem = fopen(strSystemPath.c_str(), "rb");
+ FILE * fpTrain = fpSystem;
+
+ if(!fpTrain) {
+
+ fpTrain = fpUser;
+ if(!fpTrain){
+ printf("Mandarin Training File: cannot open file or incorrect directory\n");
+ return;
+ }
+ }
+ unsigned numberofchar = 0;
+
+
+ size_t charsize = 1024;
+
+ size_t trainBufferSize = 3*charsize*3;
+ char szBuffer[trainBufferSize];
+
+ std::string strChar;
+ std::string strPY;
+ //char ctemp[4];
+ CLanguageModel::Context trainContext = m_pLanguageModel->CreateEmptyContext();
+ std::string pyID = "ã??";
+ std::vector<symbol> Symchar;
+ std::vector<symbol> Sympy;
+
+ while(!feof(fpTrain)){
+
+ strPY.clear();
+ strChar.clear();
+
+ size_t iNumBytes = fread(szBuffer, 1, trainBufferSize, fpTrain);
+ std::string strBuffer = std::string(szBuffer, iNumBytes);
+
+ size_t lim;
+ if(iNumBytes<9*charsize)
+ lim = iNumBytes/9;
+ else
+ lim = charsize;
+
+ size_t pos =0;//position in 3's counting on
+ while(pos<lim*3){
+
+ while(pyID.compare(strBuffer.substr(3*pos,3))!=0)
+ pos++;
+
+ pos++;
+ // strBuffer.copy(ctemp,3,3*pos);
+
+ strPY.append(strBuffer.substr(3*pos,3));
+
+ pos++;
+
+ //strBuffer.copy(ctemp,3,3*pos);
+ strChar.append(strBuffer.substr(3*pos,3));
+ std::string strtemp = strBuffer.substr(3*(pos),3);
+ Symchar.clear();
+ m_pCHAlphabet->GetSymbols(&Symchar, &strtemp, 0);
+
+ pos++;
+
+ }
+ Symchar.clear();
+ Sympy.clear();
+ m_pCHAlphabet->GetSymbols(&Symchar, &strChar, 0);
+ m_pAlphabet->GetSymbols(&Sympy, &strPY, 0);
+
+ for(int i =0; i<Symchar.size(); i++){
+
+ if((Symchar[i]<7603)&&(Symchar[i]>-1)){//Hack here? to prevent lan model from failing
+
+ static_cast<CPPMPYLanguageModel *>(m_pLanguageModel)->LearnPYSymbol(trainContext, Sympy[i]);
+ m_pLanguageModel->LearnSymbol(trainContext, Symchar[i]);
+
+ }
+
+ // if(Sym.size()>0)
+
+ numberofchar = numberofchar + Symchar.size();
+ }
+
+ }
+ //std::cout<<"The Length of Training file is "<<numberofchar<<" bytes/py characters"<<std::endl;
+}
\ No newline at end of file
diff --git a/Src/DasherCore/Trainer.h b/Src/DasherCore/Trainer.h
new file mode 100644
index 0000000..fbefe6b
--- /dev/null
+++ b/Src/DasherCore/Trainer.h
@@ -0,0 +1,35 @@
+#ifndef __trainer_h__
+#define __trainer_h__
+
+#include "LanguageModelling/LanguageModel.h"
+#include "TrainingHelper.h"
+
+namespace Dasher {
+ class CAlphabet;
+ class CAlphIO;
+ class CDasherInterfaceBase;
+
+ class CTrainer : public CTrainingHelper {
+ public:
+ CTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet);
+ void Train(const std::string &strUserLoc, const std::string &strSystemLoc);
+ void Train(const std::string &strPath);
+
+ protected:
+ virtual void Train(const std::vector<symbol> &vSymbols);
+ CLanguageModel *m_pLanguageModel;
+ };
+
+ class CMandarinTrainer : public CTrainer {
+ public:
+ CMandarinTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet, CAlphabet *pCHAlphabet);
+
+ virtual void Train(const std::string &strUserLoc, const std::string &strSystemLoc);
+
+ private:
+ CAlphabet * m_pCHAlphabet;
+ };
+
+}
+
+#endif
diff --git a/Src/DasherCore/TrainingHelper.cpp b/Src/DasherCore/TrainingHelper.cpp
index 325de46..4dd2719 100644
--- a/Src/DasherCore/TrainingHelper.cpp
+++ b/Src/DasherCore/TrainingHelper.cpp
@@ -35,10 +35,11 @@ static void XML_StartElement(void *pUserData, const XML_Char *szName, const XML_
static void XML_EndElement(void *pUserData, const XML_Char *szName);
static void XML_CharacterData(void *pUserData, const XML_Char *szS, int iLen);
+Dasher::CTrainingHelper::CTrainingHelper(const Dasher::CAlphabet *pAlphabet) : m_pAlphabet(pAlphabet) {
+}
+
void
-Dasher::CTrainingHelper::LoadFile(const std::string &strFileName,
- Dasher::CTrainer *pTrainer,
- const Dasher::CAlphabet *pAlphabet) {
+Dasher::CTrainingHelper::LoadFile(const std::string &strFileName) {
if(strFileName == "")
return;
@@ -54,17 +55,15 @@ Dasher::CTrainingHelper::LoadFile(const std::string &strFileName,
fclose(pInputFile);
if(!strcmp(szTestBuffer, "<?xml")) {
- LoadXML(strFileName, pTrainer, pAlphabet);
+ LoadXML(strFileName);
}
else {
- LoadPlain(strFileName, pTrainer, pAlphabet);
+ LoadPlain(strFileName);
}
}
void
-Dasher::CTrainingHelper::LoadPlain(const std::string &strFileName,
- Dasher::CTrainer *pTrainer,
- const Dasher::CAlphabet *pAlphabet) {
+Dasher::CTrainingHelper::LoadPlain(const std::string &strFileName) {
std::ifstream in(strFileName.c_str(), std::ios::binary);
if (in.fail())
@@ -76,20 +75,16 @@ Dasher::CTrainingHelper::LoadPlain(const std::string &strFileName,
std::vector<Dasher::symbol> vSymbols;
vSymbols.clear();
- pAlphabet->GetSymbols(vSymbols, in);
- pTrainer->Train(vSymbols);
+ m_pAlphabet->GetSymbols(vSymbols, in);
+ Train(vSymbols);
in.close();
}
void
-Dasher::CTrainingHelper::LoadXML(const std::string &strFileName,
- Dasher::CTrainer *pTrainer,
- const Dasher::CAlphabet *pAlphabet) {
+Dasher::CTrainingHelper::LoadXML(const std::string &strFileName) {
m_bInSegment = false;
- m_pAlphabet = pAlphabet;
- m_pTrainer = pTrainer;
FILE *pInput;
if((pInput = fopen(strFileName.c_str(), "r")) == (FILE *) 0) {
@@ -134,11 +129,9 @@ Dasher::CTrainingHelper::HandleStartElement(const XML_Char *szName,
void
Dasher::CTrainingHelper::HandleEndElement(const XML_Char *szName) {
if(!strcmp(szName, "segment")) {
- m_pTrainer->Reset();
-
std::vector<Dasher::symbol> vSymbols;
m_pAlphabet->GetSymbols(&vSymbols, &m_strCurrentText, false);
- m_pTrainer->Train(vSymbols);
+ Train(vSymbols);
m_bInSegment = false;
}
diff --git a/Src/DasherCore/TrainingHelper.h b/Src/DasherCore/TrainingHelper.h
index f020b28..7ec7594 100644
--- a/Src/DasherCore/TrainingHelper.h
+++ b/Src/DasherCore/TrainingHelper.h
@@ -22,8 +22,6 @@
#define __TrainingHelper_h__
#include "Alphabet/Alphabet.h"
-#include "AlphabetManagerFactory.h"
-#include "TrainingHelper.h"
#include <string>
@@ -31,10 +29,8 @@ namespace Dasher {
class CTrainingHelper {
public:
- void LoadFile(const std::string &strFileName,
- Dasher::CTrainer *pTrainer,
- const Dasher::CAlphabet *pAlphabet);
-
+ CTrainingHelper(const CAlphabet *m_pAlphabet);
+
void HandleStartElement(const XML_Char *szName,
const XML_Char **pAtts);
@@ -42,20 +38,21 @@ namespace Dasher {
void HandleCData(const XML_Char *szS,
int iLen);
-
- private:
- void LoadPlain(const std::string &strFileName,
- Dasher::CTrainer *pTrainer,
- const Dasher::CAlphabet *pAlphabet);
-
- void LoadXML(const std::string &strFileName,
- Dasher::CTrainer *pTrainer,
- const Dasher::CAlphabet *pAlphabet);
-
+
+ protected:
+ void LoadFile(const std::string &strFileName);
+
+ const Dasher::CAlphabet *m_pAlphabet;
+
+ virtual void Train(const std::vector<symbol> &symbols)=0;
+
+ private:
+ void LoadPlain(const std::string &strFileName);
+
+ void LoadXML(const std::string &strFileName);
+
bool m_bInSegment;
std::string m_strCurrentText;
- Dasher::CTrainer *m_pTrainer;
- const Dasher::CAlphabet *m_pAlphabet;
};
};
diff --git a/Src/MacOSX/Dasher.xcodeproj/project.pbxproj b/Src/MacOSX/Dasher.xcodeproj/project.pbxproj
index 3a8b239..ed877c4 100755
--- a/Src/MacOSX/Dasher.xcodeproj/project.pbxproj
+++ b/Src/MacOSX/Dasher.xcodeproj/project.pbxproj
@@ -356,6 +356,8 @@
19F8C7FA0C858E9900276B4F /* TrainingHelper.h in Headers */ = {isa = PBXBuildFile; fileRef = 19F8C7F80C858E9900276B4F /* TrainingHelper.h */; };
3306E0220FFD1CE60017324C /* PPMPYLanguageModel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3306E0200FFD1CE60017324C /* PPMPYLanguageModel.cpp */; };
3306E0230FFD1CE60017324C /* PPMPYLanguageModel.h in Headers */ = {isa = PBXBuildFile; fileRef = 3306E0210FFD1CE60017324C /* PPMPYLanguageModel.h */; };
+ 3306E1F70FFE6CAD0017324C /* Trainer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3306E1F50FFE6CAD0017324C /* Trainer.cpp */; };
+ 3306E1F80FFE6CAD0017324C /* Trainer.h in Headers */ = {isa = PBXBuildFile; fileRef = 3306E1F60FFE6CAD0017324C /* Trainer.h */; };
335DB0FB100B332C006DB155 /* alphabet.spyDict.xml in Resources */ = {isa = PBXBuildFile; fileRef = 335DB0FA100B332C006DB155 /* alphabet.spyDict.xml */; };
335DB101100B3358006DB155 /* training_spyDict.txt in Resources */ = {isa = PBXBuildFile; fileRef = 335DB100100B3358006DB155 /* training_spyDict.txt */; };
335DB122100B3606006DB155 /* PinYinConversionHelper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1948BE760C226CFD001DFA32 /* PinYinConversionHelper.cpp */; };
@@ -772,6 +774,8 @@
29B97325FDCFA39411CA2CEA /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = /System/Library/Frameworks/Foundation.framework; sourceTree = "<absolute>"; };
3306E0200FFD1CE60017324C /* PPMPYLanguageModel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PPMPYLanguageModel.cpp; sourceTree = "<group>"; };
3306E0210FFD1CE60017324C /* PPMPYLanguageModel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PPMPYLanguageModel.h; sourceTree = "<group>"; };
+ 3306E1F50FFE6CAD0017324C /* Trainer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Trainer.cpp; sourceTree = "<group>"; };
+ 3306E1F60FFE6CAD0017324C /* Trainer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Trainer.h; sourceTree = "<group>"; };
335DB0FA100B332C006DB155 /* alphabet.spyDict.xml */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xml; path = alphabet.spyDict.xml; sourceTree = "<group>"; };
335DB100100B3358006DB155 /* training_spyDict.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = training_spyDict.txt; sourceTree = "<group>"; };
33ABFEC40FC379EA00EA2BA5 /* ButtonMultiPress.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ButtonMultiPress.cpp; sourceTree = "<group>"; };
@@ -913,6 +917,8 @@
1948BDF40C226CFC001DFA32 /* DasherCore */ = {
isa = PBXGroup;
children = (
+ 3306E1F50FFE6CAD0017324C /* Trainer.cpp */,
+ 3306E1F60FFE6CAD0017324C /* Trainer.h */,
33FC93420FEFA2FB00A9F08D /* FrameRate.cpp */,
33FC93370FEFA2C900A9F08D /* TwoPushDynamicFilter.cpp */,
33FC93380FEFA2C900A9F08D /* TwoPushDynamicFilter.h */,
@@ -1500,6 +1506,7 @@
33ABFEC70FC379EA00EA2BA5 /* ButtonMultiPress.h in Headers */,
33FC933A0FEFA2C900A9F08D /* TwoPushDynamicFilter.h in Headers */,
3306E0230FFD1CE60017324C /* PPMPYLanguageModel.h in Headers */,
+ 3306E1F80FFE6CAD0017324C /* Trainer.h in Headers */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@@ -1848,6 +1855,7 @@
33FC93430FEFA2FB00A9F08D /* FrameRate.cpp in Sources */,
3306E0220FFD1CE60017324C /* PPMPYLanguageModel.cpp in Sources */,
335DB122100B3606006DB155 /* PinYinConversionHelper.cpp in Sources */,
+ 3306E1F70FFE6CAD0017324C /* Trainer.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]