[dasher] Mandarin user training files!



commit 494a651ebb545092bb4d9c57f1f48dd4d2da2419
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date:   Fri Mar 4 11:22:49 2011 +0000

    Mandarin user training files!
    
    Add separate trainText() method; default calls outputText(), Mandarin overrides.

 Src/DasherCore/AlphabetManager.cpp |   12 +++++++-----
 Src/DasherCore/AlphabetManager.h   |    3 +++
 Src/DasherCore/MandarinAlphMgr.cpp |   20 +++++++++++++++++++-
 Src/DasherCore/MandarinAlphMgr.h   |    3 +++
 4 files changed, 32 insertions(+), 6 deletions(-)
---
diff --git a/Src/DasherCore/AlphabetManager.cpp b/Src/DasherCore/AlphabetManager.cpp
index 2e0c04c..64f4a00 100644
--- a/Src/DasherCore/AlphabetManager.cpp
+++ b/Src/DasherCore/AlphabetManager.cpp
@@ -497,20 +497,22 @@ void CAlphabetManager::CSymbolNode::Output(Dasher::VECTOR_SYMBOL_PROB* pAdded, i
 
   // Track this symbol and its probability for logging purposes
   if (pAdded != NULL) {
-    pAdded->push_back(Dasher::SymbolProb(iSymbol, outputText(), Range() / (double)iNormalization));
+    pAdded->push_back(Dasher::SymbolProb(iSymbol, oEvent.m_sText, Range() / (double)iNormalization));
   }
   if(m_pMgr->m_pNCManager->GetBoolParameter(BP_LM_ADAPTIVE))
-    m_pMgr->strTrainfileBuffer += oEvent.m_sText;
+    m_pMgr->strTrainfileBuffer += trainText();
 }
 
 void CAlphabetManager::CSymbolNode::Undo(int *pNumDeleted) {
   DASHER_ASSERT(GetFlag(NF_SEEN));
   Dasher::CEditEvent oEvent(2, outputText(), offset());
+  //Whilst the node is still NF_SEEN, we don't want to actually delete the text
+  // (e.g. outputText() for paragraph symbols will check the edit buffer!)
+  if(m_pMgr->m_pNCManager->GetBoolParameter(BP_LM_ADAPTIVE))
+    m_pMgr->strTrainfileBuffer = m_pMgr->strTrainfileBuffer.substr( 0, m_pMgr->strTrainfileBuffer.size() - trainText().size());
+  //finally delete, the last thing we do...
   m_pMgr->m_pNCManager->InsertEvent(&oEvent);
   if (pNumDeleted) (*pNumDeleted)++;
-  if(m_pMgr->m_pNCManager->GetBoolParameter(BP_LM_ADAPTIVE))
-    m_pMgr->strTrainfileBuffer = m_pMgr->strTrainfileBuffer.substr( 0, m_pMgr->strTrainfileBuffer.size() - oEvent.m_sText.size());
-  
 }
 
 CDasherNode *CAlphabetManager::CGroupNode::RebuildParent() {
diff --git a/Src/DasherCore/AlphabetManager.h b/Src/DasherCore/AlphabetManager.h
index daf59f1..5915df0 100644
--- a/Src/DasherCore/AlphabetManager.h
+++ b/Src/DasherCore/AlphabetManager.h
@@ -142,6 +142,9 @@ namespace Dasher {
       virtual CDasherNode *RebuildSymbol(CAlphNode *pParent, unsigned int iLbnd, unsigned int iHbnd, const std::string &strGroup, int iBkgCol, symbol iSymbol);
     protected:
       virtual const std::string &outputText();
+      ///Text to write to user training file/buffer when this symbol is output.
+      /// Default just returns (a new string constructed from) outputText()
+      virtual std::string trainText() {return outputText();}
       /// Number of unicode _characters_ (not octets) for this symbol.
       /// Uniquely, a paragraph symbol can enter two distinct unicode characters
       /// (i.e. '\r' and '\n'); every other symbol enters only a single 
diff --git a/Src/DasherCore/MandarinAlphMgr.cpp b/Src/DasherCore/MandarinAlphMgr.cpp
index 574fce8..f4e464f 100644
--- a/Src/DasherCore/MandarinAlphMgr.cpp
+++ b/Src/DasherCore/MandarinAlphMgr.cpp
@@ -408,6 +408,24 @@ void CMandarinAlphMgr::CMandSym::RebuildForwardsFromAncestor(CAlphNode *pNewNode
 }
 
 const std::string &CMandarinAlphMgr::CMandSym::outputText() {
-  //use chinese, not pinyin, alphabet...
   return mgr()->m_CHtext[iSymbol];
 }
+
+string CMandarinAlphMgr::CMandSym::trainText() {
+  //NF_COMMITTED should be in process of being set...
+  DASHER_ASSERT(!GetFlag(NF_COMMITTED));
+  //in which case, we should have a parent (if not, we would have to
+  // have been built from string context, i.e. going backwards,
+  // in which case we would be committed already)
+  DASHER_ASSERT(Parent());
+  //so the parent should have set our m_pyParent field...
+  DASHER_ASSERT(m_pyParent);
+  int iPY = m_pyParent;
+  if (iPY==0) {
+    std::set<symbol> &py(mgr()->m_PinyinByChinese[iSymbol]);
+    DASHER_ASSERT(py.size()==1);
+    if (py.size()==1) iPY = *(py.begin());
+    else return ""; //output nothing! TODO could reset context for what follows - but don't think this should ever happen?
+  }
+  return mgr()->m_pAlphabet->m_strConversionTrainingDelimiter + mgr()->m_pAlphabet->GetText(iPY) + outputText();
+}
diff --git a/Src/DasherCore/MandarinAlphMgr.h b/Src/DasherCore/MandarinAlphMgr.h
index b904179..b2264d3 100644
--- a/Src/DasherCore/MandarinAlphMgr.h
+++ b/Src/DasherCore/MandarinAlphMgr.h
@@ -96,6 +96,9 @@ namespace Dasher {
       void RebuildForwardsFromAncestor(CAlphNode *pNewNode);
       bool isInGroup(const SGroupInfo *pGroup);
     private:
+      ///Override to output a triple (delimiter,PY,CH)
+      virtual std::string trainText();
+      ///Override to use chinese, not pinyin, alphabet
       virtual const std::string &outputText();
       ///The Pinyin symbol used to produce this chinese symbol, if known (0 if not!)
       symbol m_pyParent;



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]