[dasher] Mandarin (CConvRoot) rebuilding!
- From: Patrick Welche <pwelche src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [dasher] Mandarin (CConvRoot) rebuilding!
- Date: Tue, 15 Mar 2011 17:12:32 +0000 (UTC)
commit e720500904dbadec1724a7cabf00edb1d60730df
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date: Sun Feb 20 11:52:14 2011 +0000
Mandarin (CConvRoot) rebuilding!
AssignSizes => GetConversions, fills in possible syms as well as probabilities
TODO, does this actually need to do all this reweighting etc? asserts say not!
Extracted CAlphBase::RebuildForwardsFromAncestor, overridden in CMandSym
to compute m_pyParent
CConvRoot::PopulateChildrenWithExisting indirects via CMandSym::RebuildCHSymbol
Src/DasherCore/AlphabetManager.cpp | 8 ++-
Src/DasherCore/AlphabetManager.h | 10 +++-
Src/DasherCore/MandarinAlphMgr.cpp | 98 ++++++++++++++++++++++++++----------
Src/DasherCore/MandarinAlphMgr.h | 21 +++++++-
4 files changed, 104 insertions(+), 33 deletions(-)
---
diff --git a/Src/DasherCore/AlphabetManager.cpp b/Src/DasherCore/AlphabetManager.cpp
index f01b245..19075d6 100644
--- a/Src/DasherCore/AlphabetManager.cpp
+++ b/Src/DasherCore/AlphabetManager.cpp
@@ -525,8 +525,7 @@ CDasherNode *CAlphabetManager::CAlphBase::RebuildParent() {
CAlphNode *pNewNode = m_pMgr->GetRoot(NULL, 0, 0, iNewOffset!=-1, iNewOffset+1);
- //now fill in the new node - recursively - until it reaches us
- m_pMgr->IterateChildGroups(pNewNode, NULL, this);
+ RebuildForwardsFromAncestor(pNewNode);
if (GetFlag(NF_SEEN)) {
for (CDasherNode *pNode=this; (pNode=pNode->Parent()); pNode->SetFlag(NF_SEEN, true));
@@ -535,6 +534,11 @@ CDasherNode *CAlphabetManager::CAlphBase::RebuildParent() {
return Parent();
}
+///Default way of filling in the hierarchy beneath a freshly re-created root:
+/// hand the work to the manager's IterateChildGroups, passing this node as the
+/// final argument so that it can be grafted in where it fits.
+/// \param pNewNode newly-created root node beneath which this node should fit
+void CAlphabetManager::CAlphBase::RebuildForwardsFromAncestor(CAlphNode *pNewNode) {
+  CAlphBase *const pExisting = this;
+  m_pMgr->IterateChildGroups(pNewNode, NULL, pExisting);
+}
+
// TODO: Shouldn't there be an option whether or not to learn as we write?
// For want of a better solution, game mode exemption explicit in this function
void CAlphabetManager::CSymbolNode::SetFlag(int iFlag, bool bValue) {
diff --git a/Src/DasherCore/AlphabetManager.h b/Src/DasherCore/AlphabetManager.h
index 804b824..d0c4695 100644
--- a/Src/DasherCore/AlphabetManager.h
+++ b/Src/DasherCore/AlphabetManager.h
@@ -63,6 +63,8 @@ namespace Dasher {
class CAlphBase : public CDasherNode {
public:
CAlphabetManager *mgr() {return m_pMgr;}
+ ///Rebuilds this node's parent by recreating the previous 'root' node,
+ /// then calling RebuildForwardsFromAncestor
CDasherNode *RebuildParent();
///Called to build a symbol (leaf) node which is a descendant of the symbol or root node preceding this.
/// Default implementation just calls the manager's CreateSymbolNode method to create a new node,
@@ -81,6 +83,12 @@ namespace Dasher {
/// \param pParent parent of the symbol node to create; could be the previous root, or an intervening node (e.g. group)
virtual CDasherNode *RebuildGroup(CAlphNode *pParent, unsigned int iLbnd, unsigned int iHbnd, const std::string &strEnc, int iBkgCol, const SGroupInfo *pInfo);
protected:
+ ///Called in process of rebuilding parent: fill in the hierarchy _beneath_ the
+ /// previous root node, by calling IterateChildGroups passing this node as
+ /// last parameter, until the point where this node fits in is found,
+ /// at which point RebuildSymbol/Group should graft it in.
+ /// \param pNewNode newly-created root node beneath which this node should fit
+ virtual void RebuildForwardsFromAncestor(CAlphNode *pNewNode);
CAlphBase(CDasherNode *pParent, int iOffset, unsigned int iLbnd, unsigned int iHbnd, int iColour, const std::string &strDisplayText, CAlphabetManager *pMgr);
CAlphabetManager *m_pMgr;
///Number of unicode characters entered by this node; i.e., the number
@@ -91,7 +99,7 @@ namespace Dasher {
/// (as a symbol or subgroup), any number of levels beneath it
virtual bool isInGroup(const SGroupInfo *pGroup)=0;
};
- class CGroupNode;
+
///Additionally stores LM contexts and probabilities calculated therefrom
class CAlphNode : public CAlphBase {
public:
diff --git a/Src/DasherCore/MandarinAlphMgr.cpp b/Src/DasherCore/MandarinAlphMgr.cpp
index 072cfa3..f556ad4 100644
--- a/Src/DasherCore/MandarinAlphMgr.cpp
+++ b/Src/DasherCore/MandarinAlphMgr.cpp
@@ -94,6 +94,8 @@ CMandarinAlphMgr::CMandarinAlphMgr(CDasherInterfaceBase *pInterface, CNodeCreati
DASHER_ASSERT(vSyms.size()==1 && vSyms[0]!=0); //i.e. conversion is exactly one chinese symbol
DASHER_ASSERT(m_pCHAlphabet->GetText(vSyms[0]) == *it);
m_pConversionsBySymbol[i].insert(vSyms[0]);
+ //Also the reverse lookup: (valid/used chinese symbol number) -> (pinyin by which it could be produced)
+ m_PinyinByChinese[vSyms[0]].insert(i);
}
}
//that leaves m_pConversionsBySymbol as desired.
@@ -187,14 +189,12 @@ int CMandarinAlphMgr::CConvRoot::ExpectedNumChildren() {
}
void CMandarinAlphMgr::CConvRoot::PopulateChildren() {
+ PopulateChildrenWithExisting(NULL);
+}
+
+void CMandarinAlphMgr::CConvRoot::PopulateChildrenWithExisting(CMandSym *existing) {
if (m_vChInfo.empty()) {
- const set<symbol> &convs(mgr()->m_pConversionsBySymbol[m_pySym]);
- for(set<symbol>::const_iterator it = convs.begin(); it != convs.end(); ++it) {
- m_vChInfo.push_back(std::pair<symbol, unsigned int>(*it,0));
- }
- //ACL I think it's a good idea to keep those in a consistent order - symbol order will do nicely
- sort(m_vChInfo.begin(),m_vChInfo.end());
- mgr()->AssignSizes(m_vChInfo, iContext);
+ mgr()->GetConversions(m_vChInfo,m_pySym, iContext);
}
int iCum(0);
@@ -205,10 +205,11 @@ void CMandarinAlphMgr::CConvRoot::PopulateChildren() {
const unsigned int iLbnd(iCum), iHbnd(iCum + it->second);
iCum = iHbnd;
- CMandSym *pNewNode = mgr()->CreateCHSymbol(this, this->iContext, iLbnd, iHbnd, "", it->first, m_pySym);
+ CMandSym *pNewNode = (existing)
+ ? existing->RebuildCHSymbol(this, iLbnd, iHbnd, it->first)
+ : mgr()->CreateCHSymbol(this, this->iContext, iLbnd, iHbnd, "", it->first, m_pySym);
DASHER_ASSERT(GetChildren().back()==pNewNode);
-
}
}
@@ -245,31 +246,29 @@ void CMandarinAlphMgr::CConvRoot::SetFlag(int iFlag, bool bValue) {
CDasherNode::SetFlag(iFlag,bValue);
}
-void CMandarinAlphMgr::AssignSizes(std::vector<pair<symbol,unsigned int> > &vChildren, Dasher::CLanguageModel::Context context) {
+void CMandarinAlphMgr::GetConversions(std::vector<pair<symbol,unsigned int> > &vChildren, symbol pySym, Dasher::CLanguageModel::Context context) {
+
+ const set<symbol> &convs(m_pConversionsBySymbol[pySym]);
+ for(set<symbol>::const_iterator it = convs.begin(); it != convs.end(); ++it) {
+ vChildren.push_back(std::pair<symbol, unsigned int>(*it,0));
+ }
+ //ACL I think it's a good idea to keep those in a consistent order - symbol order will do nicely
+ sort(vChildren.begin(),vChildren.end());
const uint64 iNorm(m_pNCManager->GetLongParameter(LP_NORMALIZATION));
const unsigned int uniform((m_pNCManager->GetLongParameter(LP_UNIFORM)*iNorm)/1000);
-
- int iRemaining(iNorm);
-
- uint64 sumProb=0;
-
- //CLanguageModel::Context iCurrentContext;
-
- // std::cout<<"size of symbolstore "<<SymbolStore.size()<<std::endl;
-
- // std::cout<<"norm input: "<<nonuniform_norm/(iSymbols/iNChildren/100)<<std::endl;
-
+
//ACL pass in iNorm and uniform directly - GetPartProbs distributes the last param between
// however elements there are in vChildren...
static_cast<CPPMPYLanguageModel *>(m_pLanguageModel)->GetPartProbs(context, vChildren, iNorm, uniform);
//std::cout<<"after get probs "<<std::endl;
+ uint64 sumProb=0;
for (std::vector<pair<symbol,unsigned int> >::const_iterator it = vChildren.begin(); it!=vChildren.end(); it++) {
sumProb += it->second;
}
-
+ DASHER_ASSERT(sumProb==iNorm);
// std::cout<<"Sum Prob "<<sumProb<<std::endl;
// std::cout<<"norm "<<nonuniform_norm<<std::endl;
@@ -279,6 +278,7 @@ void CMandarinAlphMgr::AssignSizes(std::vector<pair<symbol,unsigned int> > &vChi
// std::cout<<"sumProb "<<sumProb<<std::endl;
+ int iRemaining(iNorm);
for (std::vector<pair<symbol,unsigned int> >::iterator it = vChildren.begin(); it!=vChildren.end(); it++) {
DASHER_ASSERT(it->first>-1); //ACL Will's code tested for both these conditions explicitly, and if so
DASHER_ASSERT(sumProb>0); //then used a probability of 0. I don't think either
@@ -299,6 +299,7 @@ void CMandarinAlphMgr::AssignSizes(std::vector<pair<symbol,unsigned int> > &vChi
// std::cout<<"symbols size "<<SymbolStore.size()<<std::endl;
// std::cout<<"Symbols address "<<&SymbolStore<<std::endl;
}
+ DASHER_ASSERT(iRemaining==0);
//std::cout<<"iRemaining "<<iRemaining<<std::endl;
@@ -328,8 +329,7 @@ CMandarinAlphMgr::CMandSym::CMandSym(CDasherNode *pParent, int iOffset, unsigned
}
CDasherNode *CMandarinAlphMgr::CMandSym::RebuildSymbol(CAlphNode *pParent, unsigned int iLbnd, unsigned int iHbnd, const std::string &strGroup, int iBkgCol, symbol iSymbol) {
- //TODO m_pyParent should have been computed in RebuildParent()
- DASHER_ASSERT(m_pyParent!=0);
+ DASHER_ASSERT(m_pyParent!=0); //should have been computed in RebuildForwardsFromAncestor()
if (iSymbol==m_pyParent) {
//create the PY node that lead to this chinese
if (mgr()->m_pConversionsBySymbol[m_pyParent].size()==1) {
@@ -340,19 +340,63 @@ CDasherNode *CMandarinAlphMgr::CMandSym::RebuildSymbol(CAlphNode *pParent, unsig
}
//ok, will be a PY-to-Chinese conversion choice
CConvRoot *pConv = mgr()->CreateConvRoot(pParent, iLbnd, iHbnd, strGroup, iSymbol);
- //TODO equivalent of IterateChildGroups - make CConvRoot generate children, but replacing one with this
+ pConv->PopulateChildrenWithExisting(this);
return pConv;
}
return CAlphBase::RebuildSymbol(pParent, iLbnd, iHbnd, strGroup, iBkgCol, iSymbol);
}
bool CMandarinAlphMgr::CMandSym::isInGroup(const SGroupInfo *pGroup) {
- //TODO m_pyParent should have been computed in RebuildParent()
- DASHER_ASSERT(m_pyParent!=0);
+ DASHER_ASSERT(m_pyParent!=0); //should have been computed in RebuildForwardsFromAncestor()
//pinyin group contains the pinyin-"symbol"=CConvRoot which we want to be our parent...
return pGroup->iStart <= m_pyParent && pGroup->iEnd > m_pyParent;
}
+///Supplies the child of a conversion root for chinese symbol iNewSym, while
+/// that root's children are being regenerated around this (existing) node.
+/// \param pParent conversion root whose children are being populated
+/// \param iLbnd,iHbnd range to give the child
+/// \param iNewSym chinese-alphabet symbol the child must represent
+/// \return this node (reparented and resized) if iNewSym is our own symbol;
+/// otherwise a node newly created by the manager's CreateCHSymbol.
+CMandarinAlphMgr::CMandSym *CMandarinAlphMgr::CMandSym::RebuildCHSymbol(CConvRoot *pParent, unsigned int iLbnd, unsigned int iHbnd, symbol iNewSym) {
+  if (iNewSym != this->iSymbol) {
+    //some other chinese symbol: create it afresh, using the parent's context & pinyin
+    return mgr()->CreateCHSymbol(pParent, pParent->iContext, iLbnd, iHbnd, "", iNewSym, pParent->m_pySym);
+  }
+  //that's us: slot the existing node in, rather than making a new one
+  SetParent(pParent);
+  SetRange(iLbnd, iHbnd);
+  return this;
+}
+
+///Override: before rebuilding forwards, work out (if not already known) which
+/// pinyin symbol should be treated as having produced this chinese symbol.
+/// If only one pinyin can yield it, that one is used; if several can, we pick
+/// the one maximising P(pinyin) * P(this chinese | pinyin), evaluated in the
+/// context of the new ancestor. Then defers to CSymbolNode's implementation.
+/// \param pNewNode newly-created root node beneath which this node should fit
+void CMandarinAlphMgr::CMandSym::RebuildForwardsFromAncestor(CAlphNode *pNewNode) {
+  if (m_pyParent==0) {
+    const set<symbol> &possiblePinyin(mgr()->m_PinyinByChinese[iSymbol]);
+    //operator[] would have silently created an empty set if we weren't in the map...
+    DASHER_ASSERT(!possiblePinyin.empty());
+    if (possiblePinyin.size() > 1) {
+      //need to compare pinyin symbols; so compute probability of this (chinese) sym, for each:
+      // i.e. P(pinyin) * P(this chinese | pinyin)
+      const vector<unsigned int> &vPinyinProbs(*(pNewNode->GetProbInfo()));
+      //Products are over LP_NORMALIZATION _squared_, so do the arithmetic in 64 bits:
+      // unsigned int could wrap, and long is only 32 bits on some platforms.
+      uint64 bestProb=0;
+      //Start from the first candidate, so that m_pyParent ends up non-zero
+      // even if every product below works out as zero.
+      m_pyParent = *(possiblePinyin.begin());
+      for (set<symbol>::const_iterator p_it = possiblePinyin.begin(); p_it!=possiblePinyin.end(); ++p_it) {
+        //compute probability of each chinese symbol for that pinyin (=by filtering)
+        // context is the same as the ancestor = previous chinese, as pinyin not part of context
+        vector<pair<symbol, unsigned int> > vChineseProbs;
+        mgr()->GetConversions(vChineseProbs, *p_it, pNewNode->iContext);
+        //now find us in that list - without ever stepping past its end
+        vector<pair<symbol,unsigned int> >::const_iterator c_it = vChineseProbs.begin();
+        while (c_it!=vChineseProbs.end() && !(c_it->first == iSymbol)) ++c_it;
+        DASHER_ASSERT(c_it!=vChineseProbs.end()); //gotta find this chinese sym somewhere...
+        uint64 thisProb=0; //i.e. P(this pinyin) * P(this chinese | this pinyin); 0 if not found
+        if (c_it!=vChineseProbs.end()) {
+          thisProb = static_cast<uint64>(c_it->second) * vPinyinProbs[*p_it];
+        }
+        //see if that works out better than for the other possible pinyin...
+        if (thisProb > bestProb) {
+          bestProb = thisProb;
+          m_pyParent = *p_it;
+        }
+      }
+    } else if (!possiblePinyin.empty()) {
+      m_pyParent = *(possiblePinyin.begin());
+    }
+  }
+  CSymbolNode::RebuildForwardsFromAncestor(pNewNode);
+}
+
const std::string &CMandarinAlphMgr::CMandSym::outputText() {
//use chinese, not pinyin, alphabet...
return mgr()->m_pCHAlphabet->GetText(iSymbol);
diff --git a/Src/DasherCore/MandarinAlphMgr.h b/Src/DasherCore/MandarinAlphMgr.h
index 47d610d..31dd4bf 100644
--- a/Src/DasherCore/MandarinAlphMgr.h
+++ b/Src/DasherCore/MandarinAlphMgr.h
@@ -48,6 +48,7 @@ namespace Dasher {
CAlphNode *GetRoot(CDasherNode *pParent, unsigned int iLower, unsigned int iUpper, bool bEnteredLast, int iOffset);
protected:
+ class CConvRoot;
///Subclass of CSymbolNode for (converted) chinese-alphabet symbols:
/// these use the chinese alphabet in place of the pinyin one for text to display/enter,
/// and get their colour using GetCHColour rather than GetColour.
@@ -57,8 +58,11 @@ namespace Dasher {
///Symbol constructor: display text from CHAlphabet, colour from GetCHColour
/// \param strGroup caption of any group(s) containing this symbol for which no nodes created; prepended to display text.
CMandSym(CDasherNode *pParent, int iOffset, unsigned int iLbnd, unsigned int iHbnd, const std::string &strGroup, CMandarinAlphMgr *pMgr, symbol iSymbol, symbol pyParent);
- protected:
CDasherNode *RebuildSymbol(CAlphNode *pParent, unsigned int iLbnd, unsigned int iHbnd, const std::string &strGroup, int iBkgCol, symbol iSymbol);
+ CMandSym *RebuildCHSymbol(CConvRoot *pParent, unsigned int iLbnd, unsigned int iHbnd, symbol iNewSym);
+ protected:
+ ///Override to compute which pinyin symbol to make our parent...
+ void RebuildForwardsFromAncestor(CAlphNode *pNewNode);
bool isInGroup(const SGroupInfo *pGroup);
private:
virtual const std::string &outputText();
@@ -74,15 +78,16 @@ namespace Dasher {
CConvRoot(CDasherNode *pParent, int iOffset, unsigned int iLbnd, unsigned int iHbnd, const std::string &strGroup, CMandarinAlphMgr *pMgr, symbol pySym);
CMandarinAlphMgr *mgr() {return static_cast<CMandarinAlphMgr *>(CAlphBase::mgr());}
void PopulateChildren();
+ void PopulateChildrenWithExisting(CMandSym *existing);
int ExpectedNumChildren();
CLanguageModel::Context iContext;
void SetFlag(int iFlag, bool bValue);
+ const symbol m_pySym;
CDasherNode *RebuildSymbol(CAlphNode *pParent, unsigned int iLbnd, unsigned int iHbnd, const std::string &strGroup, int iBkgCol, symbol iSymbol);
protected:
bool isInGroup(const SGroupInfo *pGroup);
private:
std::vector<std::pair<symbol, unsigned int> > m_vChInfo;
- symbol m_pySym;
};
///Called to create the node for a pinyin leaf symbol;
/// Overridden to call either CreateConvRoot or CreateCHSymbol, according to #chinese symbols under specified pinyin
@@ -105,7 +110,11 @@ namespace Dasher {
/// \param pyParent pinyin-alphabet symbol which was used to enter this chinese symbol (if known, else 0)
CMandSym *CreateCHSymbol(CDasherNode *pParent, CLanguageModel::Context iContext, unsigned int iLbnd, unsigned int iHbnd, const std::string &strGroup, symbol iCHsym, symbol pyParent);
- void AssignSizes(std::vector<std::pair<symbol,unsigned int> > &vChildren, Dasher::CLanguageModel::Context context);
+ ///Gets the possible chinese symbols for a pinyin one, along with their probabilities in the specified context.
+ ///Probabilities are computed by CPPMPYLanguageModel::GetPartProbs, then renormalized here. (TODO unnecessary?)
+ /// \param vChildren initially empty vector which procedure fills with pairs: first element chinese symbol number,
+ /// second element probability (/LP_NORMALIZATION).
+ void GetConversions(std::vector<std::pair<symbol,unsigned int> > &vChildren, symbol pySym, Dasher::CLanguageModel::Context context);
///Gets colour for a specified chinese symbol and offset.
/// Wraps CHalphabet getcolour in case anything specified; if not,
@@ -118,6 +127,12 @@ namespace Dasher {
///Indexed by SPY (syll+tone) alphabet symbol number,
// the set of CHAlphabet symbols it can be converted to.
std::set<symbol> *m_pConversionsBySymbol;
+
+ ///Indexed by chinese-alphabet symbol number (sparsely: where multiple
+ /// chinese-alphabet symbols have the same text, we use only the one
+ /// returned by CAlphabetMap::GetSymbols() for that text)
+ /// the set of pinyin syllable+tones which could yield that symbol.
+ std::map<symbol,std::set<symbol> > m_PinyinByChinese;
};
/// @}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]