[dasher] Fix Mandarin training & LM contexts!



commit 979579688bfa41f476d8e7729a1370c20828f6b8
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date:   Mon Feb 7 23:41:07 2011 +0000

    Fix Mandarin training & LM contexts!
    
    Make TrainingHelper::LoadFile virtual, so CMandarinTrainer::LoadFile is used (!)
    Fix & tidy CMandarinTrainer::LoadFile
    
    Generate appropriate context for child symbols
     (child symbol was used to update parent node's context, not child node!)

 Src/DasherCore/MandarinAlphMgr.cpp |    2 +-
 Src/DasherCore/Trainer.cpp         |   55 +++++++++++++----------------------
 Src/DasherCore/TrainingHelper.h    |    2 +-
 3 files changed, 23 insertions(+), 36 deletions(-)
---
diff --git a/Src/DasherCore/MandarinAlphMgr.cpp b/Src/DasherCore/MandarinAlphMgr.cpp
index a75a856..60f0550 100644
--- a/Src/DasherCore/MandarinAlphMgr.cpp
+++ b/Src/DasherCore/MandarinAlphMgr.cpp
@@ -181,7 +181,7 @@ void CMandarinAlphMgr::CConvRoot::PopulateChildren() {
     
     pNewNode->iContext = m_pMgr->m_pLanguageModel->CloneContext(this->iContext);
       
-    m_pMgr->m_pLanguageModel->EnterSymbol(iContext, it->first); // TODO: Don't use symbols?      
+    m_pMgr->m_pLanguageModel->EnterSymbol(pNewNode->iContext, it->first); // TODO: Don't use symbols?      
       
     DASHER_ASSERT(GetChildren().back()==pNewNode);
     
diff --git a/Src/DasherCore/Trainer.cpp b/Src/DasherCore/Trainer.cpp
index fed75f1..9c84991 100644
--- a/Src/DasherCore/Trainer.cpp
+++ b/Src/DasherCore/Trainer.cpp
@@ -75,47 +75,34 @@ void CMandarinTrainer::LoadFile(const std::string &strPath) {
     else
       lim = charsize;
     
-    size_t pos =0;//position in 3's counting on 
-    while(pos<lim*3){
+    for (size_t pos=0;;) { //position in 3's counting on 
 
-      while(pyID.compare(strBuffer.substr(3*pos,3))!=0)
-	pos++;
+      while(pos<lim*3)
+        if (pyID.compare(strBuffer.substr(3*pos++,3))==0) break;
+      //leave pos just after the pyID symbol
       
-      pos++;
-      //      strBuffer.copy(ctemp,3,3*pos);
-      
-      strPY.append(strBuffer.substr(3*pos,3));
- 
-      pos++;
+      if (pos+1>=lim*3) break;
+      strPY=strBuffer.substr(3*pos++,3);
  
       //strBuffer.copy(ctemp,3,3*pos);
-      strChar.append(strBuffer.substr(3*pos,3));
-      std::string strtemp = strBuffer.substr(3*(pos),3);
+      strChar=strBuffer.substr(3*pos++,3);
+
       Symchar.clear();
-      m_pCHAlphabet->GetSymbols(Symchar, strtemp);
+      Sympy.clear();
+
+      m_pCHAlphabet->GetSymbols(Symchar, strChar);
+      m_pAlphabet->GetSymbols(Sympy, strPY);      
+      DASHER_ASSERT(Symchar.size()==1);
+      DASHER_ASSERT(Sympy.size()==1);
+#ifdef DEBUG
+      if (Symchar[0]<=0)
+        std::cout << "Unknown chinese character " << strChar << std::endl;
+#endif
 
-      pos++;
-          
+      static_cast<CPPMPYLanguageModel *>(m_pLanguageModel)->LearnPYSymbol(trainContext, Sympy[0]); 
+      m_pLanguageModel->LearnSymbol(trainContext, Symchar[0]);
+      numberofchar++;
     }
-    Symchar.clear();
-    Sympy.clear();
-    m_pCHAlphabet->GetSymbols(Symchar, strChar);
-    m_pAlphabet->GetSymbols(Sympy, strPY);      
-    
-    for(unsigned int i =0; i<Symchar.size(); i++){
-
-      if((Symchar[i]<7603)&&(Symchar[i]>-1)){//Hack here? to prevent lan model from failing
-	
-	static_cast<CPPMPYLanguageModel *>(m_pLanguageModel)->LearnPYSymbol(trainContext, Sympy[i]); 
-	m_pLanguageModel->LearnSymbol(trainContext, Symchar[i]);
-	
-      }
-
-      // if(Sym.size()>0)
-      
-      numberofchar = numberofchar + Symchar.size();     
-    }       
-    
   }
   //std::cout<<"The Length of Training file is  "<<numberofchar<<" bytes/py characters"<<std::endl;  
 }
diff --git a/Src/DasherCore/TrainingHelper.h b/Src/DasherCore/TrainingHelper.h
index cac41ac..592578c 100644
--- a/Src/DasherCore/TrainingHelper.h
+++ b/Src/DasherCore/TrainingHelper.h
@@ -39,7 +39,7 @@ namespace Dasher {
     void HandleCData(const XML_Char *szS, 
 		     int iLen);
 
-    void LoadFile(const std::string &strFileName);
+    virtual void LoadFile(const std::string &strFileName);
 
   protected:
     const Dasher::CAlphabetMap *m_pAlphabet;



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]