[dasher] MandarinAlphMgr: sort conversions biggest first until threshold, then uniform



commit fe8f08ac6cee9658360f3786393f0c36de2540e0
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date:   Mon Apr 11 19:10:54 2011 +0100

    MandarinAlphMgr: sort conversions biggest first until threshold, then uniform
    
    LM predicts probabilities for all symbols; we take these in descending
     probability order and append them into the list of conversions until the
     cumulative probability assigned reaches LP_PY_PROB_SORT_THRES (as a %age);
     the remaining probability mass is then assigned uniformly between all
     remaining symbols, which are kept in the order they appeared in the alphabet
     (we're hoping this is useful - e.g. order of "decreasing stroke count"?)
    
    This makes it substantially easier to search for a symbol in a long list.

 Src/DasherCore/MandarinAlphMgr.cpp |  138 ++++++++++++++++++------------------
 Src/DasherCore/MandarinAlphMgr.h   |    5 +-
 Src/DasherCore/Parameters.h        |    3 +-
 3 files changed, 75 insertions(+), 71 deletions(-)
---
diff --git a/Src/DasherCore/MandarinAlphMgr.cpp b/Src/DasherCore/MandarinAlphMgr.cpp
index 0052da7..d0e7e3d 100644
--- a/Src/DasherCore/MandarinAlphMgr.cpp
+++ b/Src/DasherCore/MandarinAlphMgr.cpp
@@ -48,7 +48,7 @@ static char THIS_FILE[] = __FILE__;
 
 CMandarinAlphMgr::CMandarinAlphMgr(CDasherInterfaceBase *pInterface, CNodeCreationManager *pNCManager, const CAlphInfo *pAlphabet, const CAlphabetMap *pAlphMap)
   : CAlphabetManager(pInterface, pNCManager, pAlphabet, pAlphMap),
-    m_pConversionsBySymbol(new set<symbol>[GetAlphabet()->GetNumberTextSymbols()+1]) {
+    m_pConversionsBySymbol(new vector<symbol>[GetAlphabet()->GetNumberTextSymbols()+1]) {
   DASHER_ASSERT(pAlphabet->m_iConversionID==2);
       
   //the CHAlphabet contains a group for each SPY syllable+tone, with symbols being chinese characters.      
@@ -104,7 +104,7 @@ CMandarinAlphMgr::CMandarinAlphMgr(CDasherInterfaceBase *pInterface, CNodeCreati
         m_CHAlphabetMap.Add(text,target);
       }
       DASHER_ASSERT(m_CHtext[m_CHAlphabetMap.Get(text)] == text);
-      m_pConversionsBySymbol[i].insert(target);
+      m_pConversionsBySymbol[i].push_back(target);
       //Also the reverse lookup: (rehashed chinese symbol number) -> (pinyin by which it could be produced)
       m_PinyinByChinese[target].insert(i);
     }
@@ -259,82 +259,84 @@ void CMandarinAlphMgr::CConvRoot::SetFlag(int iFlag, bool bValue) {
   CDasherNode::SetFlag(iFlag,bValue);
 }
 
-void CMandarinAlphMgr::GetConversions(std::vector<pair<symbol,unsigned int> > &vChildren, symbol pySym, Dasher::CLanguageModel::Context context) {
+// For sorting pair<symbol, probability>s into descending order
+// (most probable first, hence sense of >)
+bool CompSecond(const pair<symbol,unsigned int> &a, const pair<symbol,unsigned int> &b) {
+  return a.second>b.second;
+}
 
-  const set<symbol> &convs(m_pConversionsBySymbol[pySym]);
-  for(set<symbol>::const_iterator it = convs.begin(); it != convs.end(); ++it) {
-    vChildren.push_back(std::pair<symbol, unsigned int>(*it,0));
-  }
-  //ACL I think it's a good idea to keep those in a consistent order - symbol order will do nicely
-  sort(vChildren.begin(),vChildren.end());
+void CMandarinAlphMgr::GetConversions(std::vector<pair<symbol,unsigned int> > &vChildren, symbol pySym, Dasher::CLanguageModel::Context context) {
 
-  const uint64 iNorm(m_pNCManager->GetLongParameter(LP_NORMALIZATION));
-  const unsigned int uniform((m_pNCManager->GetLongParameter(LP_UNIFORM)*iNorm)/1000);
-    
-  //ACL pass in iNorm and uniform directly - GetPartProbs distributes the last param between
-  // however elements there are in vChildren...
-  static_cast<CPPMPYLanguageModel *>(m_pLanguageModel)->GetPartProbs(context, vChildren, iNorm, uniform);
-  
-  //std::cout<<"after get probs "<<std::endl;
+  const vector<symbol> &convs(m_pConversionsBySymbol[pySym]);
+
+  //Symbols which we are including with the probabilities predicted by the LM.
+  // We do this for the most probable symbols first, up to at least BY_PY_PROB_SORT_THRES
+  // (a percentage); after this, the remaining probability mass is distributed
+  // uniformly between the remaining symbols, which are kept in alphabet order (after the
+  // more probable ones).
+  //Two degenerate cases: PROB_SORT_THRES=0 => all (legal) ch symbols predicted uniformly
+  // PROB_SORT_THRES=100 => all symbols put into probability order
+  set<symbol> haveProbs;
+  uint64 iRemaining(m_pNCManager->GetLongParameter(LP_NORMALIZATION));
   
-  uint64 sumProb=0;  
-  for (std::vector<pair<symbol,unsigned int> >::const_iterator it = vChildren.begin(); it!=vChildren.end(); it++) {
-    sumProb += it->second;
-  }
-  DASHER_ASSERT(sumProb==iNorm);
-  //  std::cout<<"Sum Prob "<<sumProb<<std::endl;
-  //  std::cout<<"norm "<<nonuniform_norm<<std::endl;
-  
-  //Match, sumProbs = nonuniform_norm  
-  //but fix one element 'Da4'
-  // Finally, iterate through the nodes and actually assign the sizes.
-  
- // std::cout<<"sumProb "<<sumProb<<std::endl;
-  
-  int iRemaining(iNorm);
-  for (std::vector<pair<symbol,unsigned int> >::iterator it = vChildren.begin(); it!=vChildren.end(); it++) {
-    DASHER_ASSERT(it->first>-1); //ACL Will's code tested for both these conditions explicitly, and if so 
-    DASHER_ASSERT(sumProb>0);   //then used a probability of 0. I don't think either
-                                //should ever happen if the alphabet files are right (there'd have to
-                                //be either no conversions of the syllable+tone, or else the LM'd have
-                                //to assign zero probability to each), so I'm removing these tests for now...
-    iRemaining -= it->second = (it->second*iNorm)/sumProb;
-    if (it->second==0) {
-#ifdef DEBUG
-      std::cout << "WARNING: Erasing zero-probability conversion with symbol " << it->first << std::endl;
-#endif
-      vChildren.erase(it--);
+  if (long percent=m_pNCManager->GetLongParameter(LP_PY_PROB_SORT_THRES)) {
+    const uint64 iNorm(iRemaining);
+    const unsigned int uniform((m_pNCManager->GetLongParameter(LP_UNIFORM)*iNorm)/1000);
+    
+    //Set up list of symbols with blank probability entries...
+    for(vector<symbol>::const_iterator it = convs.begin(); it != convs.end(); ++it) {
+      vChildren.push_back(std::pair<symbol, unsigned int>(*it,0));
     }
-    //  std::cout<<pNode->pszConversion<<std::endl;
-    // std::cout<<pNode->Symbol<<std::endl;
-    //    std::cout<<"Probs i "<<pNode<<std::endl;
-    // std::cout<<"Symbol i"<<SymbolStore[iIdx]<<std::endl;
-    // std::cout<<"symbols size "<<SymbolStore.size()<<std::endl;
-    // std::cout<<"Symbols address "<<&SymbolStore<<std::endl;
-  }
-  DASHER_ASSERT(iRemaining==0);
-  
-  //std::cout<<"iRemaining "<<iRemaining<<std::endl;
-  
-  
-  // Last of all, allocate anything left over due to rounding error
+    
+    //Then call LM to fill in the probs, passing iNorm and uniform directly -
+    // GetPartProbs distributes the last param between however elements there are in vChildren...
+    static_cast<CPPMPYLanguageModel *>(m_pLanguageModel)->GetPartProbs(context, vChildren, iNorm, uniform);
   
-  int iLeft(vChildren.size());
+    //std::cout<<"after get probs "<<std::endl;
   
-  for (std::vector<pair<symbol,unsigned int> >::iterator it = vChildren.begin(); it!=vChildren.end(); it++) {
-    int iDiff(iRemaining / iLeft);
+    uint64 sumProb=0;  
+    for (std::vector<pair<symbol,unsigned int> >::const_iterator it = vChildren.begin(); it!=vChildren.end(); it++) {
+      sumProb += it->second;
+    }
+    DASHER_ASSERT(sumProb==iNorm);
     
-    it->second += iDiff;
+    //Sort all symbols into probability order (highest first)
+    stable_sort(vChildren.begin(), vChildren.end(), CompSecond);
+    if (percent>=100) return; //and that's what's required.
     
-    iRemaining -= iDiff;
-    --iLeft;
+    //ok, some symbols as predicted by LM, others not...
+    vector<pair<symbol,unsigned int> > probOrder;
+    swap(vChildren, probOrder);
+    const unsigned int stop(iNorm - (iNorm*percent)/100);//intermediate values are uint64
+    for (vector<pair<symbol, unsigned int> >::iterator it=probOrder.begin(); iRemaining>stop; it++) {
+      //assert: the remaining probability mass, divided by the remaining symbols,
+      // (i.e. the probability mass each symbol would receive if we uniformed the rest)
+      // must be less than the probability predicted by the LM (as we're processing
+      // symbols in decreasing order)
+      DASHER_ASSERT(iRemaining <= it->second*(convs.size()-vChildren.size()));
+      vChildren.push_back(*it);
+      haveProbs.insert(it->first);
+      iRemaining-=it->second;
+    }
+  }
+  //Now distribute iRemaining uniformly between all remaining symbols,
+  // keeping them in alphabet order
+  if (iRemaining) {
+    unsigned int iEach(iRemaining / (convs.size() - haveProbs.size()));
+    for (vector<symbol>::const_iterator it=convs.begin(); it!=convs.end(); it++) {
+      if (haveProbs.count(*it)==0)
+        vChildren.push_back(pair<symbol,unsigned int>(*it,iEach));
+    }
     
-    //    std::cout<<"Node size for "<<pNode->pszConversion<<std::endl;
-    //std::cout<<"is "<<pNode->NodeSize<<std::endl;
+    //account for rounding error by topping up
+    DASHER_ASSERT(vChildren.size() == convs.size());
+    iRemaining -= iEach * (convs.size() - haveProbs.size());
+    unsigned int iLeft = vChildren.size();
+    for (vector<pair<symbol, unsigned int> >::iterator it=vChildren.end(); iRemaining && it-- != vChildren.begin();) {
+      const unsigned int p(iRemaining / iLeft);
+      it->second+=p; iRemaining-=p; iLeft--;
+    }
   }
-  
-  DASHER_ASSERT(iRemaining == 0);
-  
 }
 
 CMandarinAlphMgr::CMandSym::CMandSym(CDasherNode *pParent, int iOffset, unsigned int iLbnd, unsigned int iHbnd, const std::string &strGroup, CMandarinAlphMgr *pMgr, symbol iSymbol, symbol pyParent)
diff --git a/Src/DasherCore/MandarinAlphMgr.h b/Src/DasherCore/MandarinAlphMgr.h
index b2264d3..cdd2047 100644
--- a/Src/DasherCore/MandarinAlphMgr.h
+++ b/Src/DasherCore/MandarinAlphMgr.h
@@ -167,8 +167,9 @@ namespace Dasher {
     CAlphabetMap m_CHAlphabetMap;
     
     ///Indexed by SPY (syll+tone) alphabet symbol number,
-    // the set of CHAlphabet symbols it can be converted to.
-    std::set<symbol> *m_pConversionsBySymbol;
+    // the CHAlphabet symbols it can be converted to, in order
+    // of appearance in the CHAlphabet group.
+    std::vector<symbol> *m_pConversionsBySymbol;
 
     ///Indexed by chinese-alphabet symbol number (sparsely: where multiple
     /// chinese-alphabet symbols have the same text, we use only the one
diff --git a/Src/DasherCore/Parameters.h b/Src/DasherCore/Parameters.h
index 5484b36..39511a7 100644
--- a/Src/DasherCore/Parameters.h
+++ b/Src/DasherCore/Parameters.h
@@ -53,7 +53,7 @@ enum {
 enum { 
   LP_ORIENTATION = END_OF_BPS, LP_REAL_ORIENTATION, LP_MAX_BITRATE, LP_FRAMERATE,
   LP_VIEW_ID, LP_LANGUAGE_MODEL_ID, LP_DASHER_FONTSIZE, LP_SHAPE_TYPE,
-  LP_UNIFORM, LP_YSCALE, LP_MOUSEPOSDIST, LP_STOP_IDLETIME,
+  LP_UNIFORM, LP_YSCALE, LP_MOUSEPOSDIST, LP_STOP_IDLETIME, LP_PY_PROB_SORT_THRES,
   LP_LM_MAX_ORDER, LP_LM_EXCLUSION,
   LP_LM_UPDATE_EXCLUSION, LP_LM_ALPHA, LP_LM_BETA,
   LP_LM_MIXTURE, LP_NORMALIZATION, LP_LINE_WIDTH, LP_GEOMETRY,
@@ -200,6 +200,7 @@ static lp_table longparamtable[] = {
   {LP_YSCALE, "YScaling", PERS, 0, "YScaling"},
   {LP_MOUSEPOSDIST, "MousePositionBoxDistance", PERS, 50, "MousePositionBoxDistance"},
   {LP_STOP_IDLETIME, "StopIdleTime", PERS, 1000, "StopIdleTime" },
+  {LP_PY_PROB_SORT_THRES, "PYProbabilitySortThreshold", PERS, 85, "Sort converted syms in descending probability order up to this %age"},
   {LP_LM_MAX_ORDER, "LMMaxOrder", PERS, 5, "LMMaxOrder"},
   {LP_LM_EXCLUSION, "LMExclusion", PERS, 0, "LMExclusion"},
   {LP_LM_UPDATE_EXCLUSION, "LMUpdateExclusion", PERS, 1, "LMUpdateExclusion"},



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]