[dasher] Mandarin (CConvRoot) rebuilding!



commit e720500904dbadec1724a7cabf00edb1d60730df
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date:   Sun Feb 20 11:52:14 2011 +0000

    Mandarin (CConvRoot) rebuilding!
    
    AssignSizes => GetConversions, fills in possible syms as well as probabilities
      TODO, does this actually need to do all this reweighting etc? asserts say not!
    Extracted CAlphBase::RebuildForwardsFromAncestor, override in CMandSym
     to compute m_pyParent
    CConvRoot::PopulateChildrenWithExisting indirects via CMandSym::RebuildCHSymbol

 Src/DasherCore/AlphabetManager.cpp |    8 ++-
 Src/DasherCore/AlphabetManager.h   |   10 +++-
 Src/DasherCore/MandarinAlphMgr.cpp |   98 ++++++++++++++++++++++++++----------
 Src/DasherCore/MandarinAlphMgr.h   |   21 +++++++-
 4 files changed, 104 insertions(+), 33 deletions(-)
---
diff --git a/Src/DasherCore/AlphabetManager.cpp b/Src/DasherCore/AlphabetManager.cpp
index f01b245..19075d6 100644
--- a/Src/DasherCore/AlphabetManager.cpp
+++ b/Src/DasherCore/AlphabetManager.cpp
@@ -525,8 +525,7 @@ CDasherNode *CAlphabetManager::CAlphBase::RebuildParent() {
 
     CAlphNode *pNewNode = m_pMgr->GetRoot(NULL, 0, 0, iNewOffset!=-1, iNewOffset+1);
     
-    //now fill in the new node - recursively - until it reaches us
-    m_pMgr->IterateChildGroups(pNewNode, NULL, this);
+    RebuildForwardsFromAncestor(pNewNode);
     
     if (GetFlag(NF_SEEN)) {
       for (CDasherNode *pNode=this; (pNode=pNode->Parent()); pNode->SetFlag(NF_SEEN, true));
@@ -535,6 +534,11 @@ CDasherNode *CAlphabetManager::CAlphBase::RebuildParent() {
   return Parent();
 }
 
+void CAlphabetManager::CAlphBase::RebuildForwardsFromAncestor(CAlphNode *pNewNode) {
+  //now fill in the new node - recursively - until it reaches us
+  m_pMgr->IterateChildGroups(pNewNode, NULL, this);
+}
+
 // TODO: Shouldn't there be an option whether or not to learn as we write?
 // For want of a better solution, game mode exemption explicit in this function
 void CAlphabetManager::CSymbolNode::SetFlag(int iFlag, bool bValue) {
diff --git a/Src/DasherCore/AlphabetManager.h b/Src/DasherCore/AlphabetManager.h
index 804b824..d0c4695 100644
--- a/Src/DasherCore/AlphabetManager.h
+++ b/Src/DasherCore/AlphabetManager.h
@@ -63,6 +63,8 @@ namespace Dasher {
     class CAlphBase : public CDasherNode {
     public:
       CAlphabetManager *mgr() {return m_pMgr;}
+      ///Rebuilds this node's parent by recreating the previous 'root' node,
+      /// then calling RebuildForwardsFromAncestor
       CDasherNode *RebuildParent();
       ///Called to build a symbol (leaf) node which is a descendant of the symbol or root node preceding this.
       /// Default implementation just calls the manager's CreateSymbolNode method to create a new node,
@@ -81,6 +83,12 @@ namespace Dasher {
       /// \param pParent parent of the symbol node to create; could be the previous root, or an intervening node (e.g. group)      
       virtual CDasherNode *RebuildGroup(CAlphNode *pParent, unsigned int iLbnd, unsigned int iHbnd, const std::string &strEnc, int iBkgCol, const SGroupInfo *pInfo);
     protected:
+      ///Called in process of rebuilding parent: fill in the hierarchy _beneath_ the
+      /// previous root node, by calling IterateChildGroups passing this node as
+      /// last parameter, until the point where this node fits in is found,
+      /// at which point RebuildSymbol/Group should graft it in.
+      /// \param pNewNode newly-created root node beneath which this node should fit
+      virtual void RebuildForwardsFromAncestor(CAlphNode *pNewNode);
       CAlphBase(CDasherNode *pParent, int iOffset, unsigned int iLbnd, unsigned int iHbnd, int iColour, const std::string &strDisplayText, CAlphabetManager *pMgr);
       CAlphabetManager *m_pMgr;
       ///Number of unicode characters entered by this node; i.e., the number
@@ -91,7 +99,7 @@ namespace Dasher {
       /// (as a symbol or subgroup), any number of levels beneath it
       virtual bool isInGroup(const SGroupInfo *pGroup)=0;
     };
-    class CGroupNode;
+
     ///Additionally stores LM contexts and probabilities calculated therefrom
     class CAlphNode : public CAlphBase {
     public:
diff --git a/Src/DasherCore/MandarinAlphMgr.cpp b/Src/DasherCore/MandarinAlphMgr.cpp
index 072cfa3..f556ad4 100644
--- a/Src/DasherCore/MandarinAlphMgr.cpp
+++ b/Src/DasherCore/MandarinAlphMgr.cpp
@@ -94,6 +94,8 @@ CMandarinAlphMgr::CMandarinAlphMgr(CDasherInterfaceBase *pInterface, CNodeCreati
       DASHER_ASSERT(vSyms.size()==1 && vSyms[0]!=0); //i.e. conversion is exactly one chinese symbol
       DASHER_ASSERT(m_pCHAlphabet->GetText(vSyms[0]) == *it);
       m_pConversionsBySymbol[i].insert(vSyms[0]);
+      //Also the reverse lookup: (valid/used chinese symbol number) -> (pinyin by which it could be produced)
+      m_PinyinByChinese[vSyms[0]].insert(i);
     }
   }
   //that leaves m_pConversionsBySymbol as desired.
@@ -187,14 +189,12 @@ int CMandarinAlphMgr::CConvRoot::ExpectedNumChildren() {
 }
 
 void CMandarinAlphMgr::CConvRoot::PopulateChildren() {
+  PopulateChildrenWithExisting(NULL);
+}
+
+void CMandarinAlphMgr::CConvRoot::PopulateChildrenWithExisting(CMandSym *existing) {
   if (m_vChInfo.empty()) {
-    const set<symbol> &convs(mgr()->m_pConversionsBySymbol[m_pySym]);
-    for(set<symbol>::const_iterator it = convs.begin(); it != convs.end(); ++it) {
-      m_vChInfo.push_back(std::pair<symbol, unsigned int>(*it,0));
-    }
-    //ACL I think it's a good idea to keep those in a consistent order - symbol order will do nicely
-    sort(m_vChInfo.begin(),m_vChInfo.end());
-    mgr()->AssignSizes(m_vChInfo, iContext);
+    mgr()->GetConversions(m_vChInfo,m_pySym, iContext);
   }
   
   int iCum(0);
@@ -205,10 +205,11 @@ void CMandarinAlphMgr::CConvRoot::PopulateChildren() {
     const unsigned int iLbnd(iCum), iHbnd(iCum + it->second);
     
     iCum = iHbnd;
-    CMandSym *pNewNode = mgr()->CreateCHSymbol(this, this->iContext, iLbnd, iHbnd, "", it->first, m_pySym);
+    CMandSym *pNewNode = (existing)
+      ? existing->RebuildCHSymbol(this, iLbnd, iHbnd, it->first)
+      : mgr()->CreateCHSymbol(this, this->iContext, iLbnd, iHbnd, "", it->first, m_pySym);
     
     DASHER_ASSERT(GetChildren().back()==pNewNode);
-    
   }
 }
 
@@ -245,31 +246,29 @@ void CMandarinAlphMgr::CConvRoot::SetFlag(int iFlag, bool bValue) {
   CDasherNode::SetFlag(iFlag,bValue);
 }
 
-void CMandarinAlphMgr::AssignSizes(std::vector<pair<symbol,unsigned int> > &vChildren, Dasher::CLanguageModel::Context context) {
+void CMandarinAlphMgr::GetConversions(std::vector<pair<symbol,unsigned int> > &vChildren, symbol pySym, Dasher::CLanguageModel::Context context) {
+
+  const set<symbol> &convs(m_pConversionsBySymbol[pySym]);
+  for(set<symbol>::const_iterator it = convs.begin(); it != convs.end(); ++it) {
+    vChildren.push_back(std::pair<symbol, unsigned int>(*it,0));
+  }
+  //ACL I think it's a good idea to keep those in a consistent order - symbol order will do nicely
+  sort(vChildren.begin(),vChildren.end());
 
   const uint64 iNorm(m_pNCManager->GetLongParameter(LP_NORMALIZATION));
   const unsigned int uniform((m_pNCManager->GetLongParameter(LP_UNIFORM)*iNorm)/1000);
-  
-  int iRemaining(iNorm);
-  
-  uint64 sumProb=0;
-  
-  //CLanguageModel::Context iCurrentContext;
-  
-  //  std::cout<<"size of symbolstore "<<SymbolStore.size()<<std::endl;  
-  
-  //  std::cout<<"norm input: "<<nonuniform_norm/(iSymbols/iNChildren/100)<<std::endl;
-  
+    
   //ACL pass in iNorm and uniform directly - GetPartProbs distributes the last param between
   // however elements there are in vChildren...
   static_cast<CPPMPYLanguageModel *>(m_pLanguageModel)->GetPartProbs(context, vChildren, iNorm, uniform);
   
   //std::cout<<"after get probs "<<std::endl;
   
+  uint64 sumProb=0;  
   for (std::vector<pair<symbol,unsigned int> >::const_iterator it = vChildren.begin(); it!=vChildren.end(); it++) {
     sumProb += it->second;
   }
-  
+  DASHER_ASSERT(sumProb==iNorm);
   //  std::cout<<"Sum Prob "<<sumProb<<std::endl;
   //  std::cout<<"norm "<<nonuniform_norm<<std::endl;
   
@@ -279,6 +278,7 @@ void CMandarinAlphMgr::AssignSizes(std::vector<pair<symbol,unsigned int> > &vChi
   
  // std::cout<<"sumProb "<<sumProb<<std::endl;
   
+  int iRemaining(iNorm);
   for (std::vector<pair<symbol,unsigned int> >::iterator it = vChildren.begin(); it!=vChildren.end(); it++) {
     DASHER_ASSERT(it->first>-1); //ACL Will's code tested for both these conditions explicitly, and if so 
     DASHER_ASSERT(sumProb>0);   //then used a probability of 0. I don't think either
@@ -299,6 +299,7 @@ void CMandarinAlphMgr::AssignSizes(std::vector<pair<symbol,unsigned int> > &vChi
     // std::cout<<"symbols size "<<SymbolStore.size()<<std::endl;
     // std::cout<<"Symbols address "<<&SymbolStore<<std::endl;
   }
+  DASHER_ASSERT(iRemaining==0);
   
   //std::cout<<"iRemaining "<<iRemaining<<std::endl;
   
@@ -328,8 +329,7 @@ CMandarinAlphMgr::CMandSym::CMandSym(CDasherNode *pParent, int iOffset, unsigned
 }
 
 CDasherNode *CMandarinAlphMgr::CMandSym::RebuildSymbol(CAlphNode *pParent, unsigned int iLbnd, unsigned int iHbnd, const std::string &strGroup, int iBkgCol, symbol iSymbol) {
-  //TODO m_pyParent should have been computed in RebuildParent()
-  DASHER_ASSERT(m_pyParent!=0);
+  DASHER_ASSERT(m_pyParent!=0); //should have been computed in RebuildForwardsFromAncestor()
   if (iSymbol==m_pyParent) {
     //create the PY node that lead to this chinese
     if (mgr()->m_pConversionsBySymbol[m_pyParent].size()==1) {
@@ -340,19 +340,63 @@ CDasherNode *CMandarinAlphMgr::CMandSym::RebuildSymbol(CAlphNode *pParent, unsig
     }
     //ok, will be a PY-to-Chinese conversion choice
     CConvRoot *pConv = mgr()->CreateConvRoot(pParent, iLbnd, iHbnd, strGroup, iSymbol);
-    //TODO equivalent of IterateChildGroups - make CConvRoot generate children, but replacing one with this
+    pConv->PopulateChildrenWithExisting(this);
     return pConv;
   }
   return CAlphBase::RebuildSymbol(pParent, iLbnd, iHbnd, strGroup, iBkgCol, iSymbol);
 }
 
 bool CMandarinAlphMgr::CMandSym::isInGroup(const SGroupInfo *pGroup) {
-  //TODO m_pyParent should have been computed in RebuildParent()
-  DASHER_ASSERT(m_pyParent!=0);
+  DASHER_ASSERT(m_pyParent!=0); //should have been computed in RebuildForwardsFromAncestor()
   //pinyin group contains the pinyin-"symbol"=CConvRoot which we want to be our parent...
   return pGroup->iStart <= m_pyParent && pGroup->iEnd > m_pyParent;
 }
 
+CMandarinAlphMgr::CMandSym *CMandarinAlphMgr::CMandSym::RebuildCHSymbol(CConvRoot *pParent, unsigned int iLbnd, unsigned int iHbnd, symbol iNewSym) {
+  if (iNewSym == this->iSymbol) {
+    //reuse existing node
+    SetParent(pParent);
+    SetRange(iLbnd, iHbnd);
+    return this;
+  }
+  return mgr()->CreateCHSymbol(pParent, pParent->iContext, iLbnd, iHbnd, "", iNewSym, pParent->m_pySym);
+}
+
+void CMandarinAlphMgr::CMandSym::RebuildForwardsFromAncestor(CAlphNode *pNewNode) {
+  if (m_pyParent==0) {
+    set<symbol> &possiblePinyin(mgr()->m_PinyinByChinese[iSymbol]);
+    if (possiblePinyin.size() > 1) {
+      //need to compare pinyin symbols; so compute probability of this (chinese) sym, for each:
+      // i.e. P(pinyin) * P(this chinese | pinyin)
+      const vector<unsigned int> &vPinyinProbs(*(pNewNode->GetProbInfo()));
+      long bestProb=0; //of this chinese, over LP_NORMALIZATION _squared_
+      for (set<symbol>::iterator p_it = possiblePinyin.begin(); p_it!=possiblePinyin.end(); p_it++) {
+        //compute probability of each chinese symbol for that pinyin (=by filtering)
+        // context is the same as the ancestor = previous chinese, as pinyin not part of context
+        vector<pair<symbol, unsigned int> > vChineseProbs;
+        mgr()->GetConversions(vChineseProbs, *p_it, pNewNode->iContext);
+        //now find us in that list
+        long thisProb; //i.e. P(this pinyin) * P(this chinese | this pinyin)
+        for (vector<pair<symbol,unsigned int> >::iterator c_it = vChineseProbs.begin(); ;) {
+          if (c_it->first == iSymbol) {
+            //found P(this chinese sym | pinyin). Compute overall...
+            thisProb = c_it->second * vPinyinProbs[*p_it];
+            break;
+          }
+          c_it++;
+          DASHER_ASSERT(c_it!=vChineseProbs.end()); //gotta find this chinese sym somewhere...
+        }
+        //see if that works out better than for the other possible pinyin...
+        if (thisProb > bestProb) {
+          bestProb = thisProb;
+          m_pyParent = *p_it;
+        }
+      }
+    } else m_pyParent = *(possiblePinyin.begin());
+  }
+  CSymbolNode::RebuildForwardsFromAncestor(pNewNode);
+}
+
 const std::string &CMandarinAlphMgr::CMandSym::outputText() {
   //use chinese, not pinyin, alphabet...
   return mgr()->m_pCHAlphabet->GetText(iSymbol);
diff --git a/Src/DasherCore/MandarinAlphMgr.h b/Src/DasherCore/MandarinAlphMgr.h
index 47d610d..31dd4bf 100644
--- a/Src/DasherCore/MandarinAlphMgr.h
+++ b/Src/DasherCore/MandarinAlphMgr.h
@@ -48,6 +48,7 @@ namespace Dasher {
     CAlphNode *GetRoot(CDasherNode *pParent, unsigned int iLower, unsigned int iUpper, bool bEnteredLast, int iOffset);    
     
   protected:
+    class CConvRoot;
     ///Subclass of CSymbolNode for (converted) chinese-alphabet symbols:
     /// these use the chinese alphabet in place of the pinyin one for text to display/enter,
     /// and get their colour using GetCHColour rather than GetColour.
@@ -57,8 +58,11 @@ namespace Dasher {
       ///Symbol constructor: display text from CHAlphabet, colour from GetCHColour
       /// \param strGroup caption of any group(s) containing this symbol for which no nodes created; prepended to display text.
       CMandSym(CDasherNode *pParent, int iOffset, unsigned int iLbnd, unsigned int iHbnd, const std::string &strGroup, CMandarinAlphMgr *pMgr, symbol iSymbol, symbol pyParent);
-    protected:
       CDasherNode *RebuildSymbol(CAlphNode *pParent, unsigned int iLbnd, unsigned int iHbnd, const std::string &strGroup, int iBkgCol, symbol iSymbol);
+      CMandSym *RebuildCHSymbol(CConvRoot *pParent, unsigned int iLbnd, unsigned int iHbnd, symbol iNewSym);
+    protected:
+      ///Override to compute which pinyin symbol to make our parent...
+      void RebuildForwardsFromAncestor(CAlphNode *pNewNode);
       bool isInGroup(const SGroupInfo *pGroup);
     private:
       virtual const std::string &outputText();
@@ -74,15 +78,16 @@ namespace Dasher {
       CConvRoot(CDasherNode *pParent, int iOffset, unsigned int iLbnd, unsigned int iHbnd, const std::string &strGroup, CMandarinAlphMgr *pMgr, symbol pySym);
       CMandarinAlphMgr *mgr() {return static_cast<CMandarinAlphMgr *>(CAlphBase::mgr());}
       void PopulateChildren();
+      void PopulateChildrenWithExisting(CMandSym *existing);
       int ExpectedNumChildren();
       CLanguageModel::Context iContext;
       void SetFlag(int iFlag, bool bValue);
+      const symbol m_pySym;
       CDasherNode *RebuildSymbol(CAlphNode *pParent, unsigned int iLbnd, unsigned int iHbnd, const std::string &strGroup, int iBkgCol, symbol iSymbol);
     protected:
       bool isInGroup(const SGroupInfo *pGroup);
     private:        
       std::vector<std::pair<symbol, unsigned int> > m_vChInfo;
-      symbol m_pySym;
     };
     ///Called to create the node for a pinyin leaf symbol;
     /// Overridden to call either CreateConvRoot or CreateCHSymbol, according to #chinese symbols under specified pinyin
@@ -105,7 +110,11 @@ namespace Dasher {
     /// \param pyParent pinyin-alphabet symbol which was used to enter this chinese symbol (if known, else 0)
     CMandSym *CreateCHSymbol(CDasherNode *pParent, CLanguageModel::Context iContext, unsigned int iLbnd, unsigned int iHbnd, const std::string &strGroup, symbol iCHsym, symbol pyParent);
 
-    void AssignSizes(std::vector<std::pair<symbol,unsigned int> > &vChildren, Dasher::CLanguageModel::Context context);
+    ///Gets the possible chinese symbols for a pinyin one, along with their probabilities in the specified context.
+    ///Probabilities are computed by CPPMPYLanguageModel::GetPartProbs, then renormalized here. (TODO unnecessary?)
+    /// \param vChildren initially empty vector which procedure fills with pairs: first element chinese symbol number,
+    /// second element probability (/LP_NORMALIZATION).    
+    void GetConversions(std::vector<std::pair<symbol,unsigned int> > &vChildren, symbol pySym, Dasher::CLanguageModel::Context context);
 
     ///Gets colour for a specified chinese symbol and offset.
     /// Wraps CHalphabet getcolour in case anything specified; if not,
@@ -118,6 +127,12 @@ namespace Dasher {
     ///Indexed by SPY (syll+tone) alphabet symbol number,
     // the set of CHAlphabet symbols it can be converted to.
     std::set<symbol> *m_pConversionsBySymbol;
+
+    ///Indexed by chinese-alphabet symbol number (sparsely: where multiple
+    /// chinese-alphabet symbols have the same text, we use only the one
+    /// returned by CAlphabetMap::GetSymbols() for that text)
+    /// the set of pinyin syllable+tones which could yield that symbol.
+    std::map<symbol,std::set<symbol> > m_PinyinByChinese;
   };
   /// @}
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]