[dasher: 24/38] New alternative to Mandarin/PPMPY: RoutingAlphMgr & RoutingPPMLanguageModel
- From: Patrick Welche <pwelche src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [dasher: 24/38] New alternative to Mandarin/PPMPY: RoutingAlphMgr & RoutingPPMLanguageModel
- Date: Tue, 3 Jan 2012 15:34:40 +0000 (UTC)
commit b67ddca45df9876e0abd105f8832a27e60e038f4
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date: Sun Dec 18 22:11:40 2011 +0000
New alternative to Mandarin/PPMPY: RoutingAlphMgr & RoutingPPMLanguageModel
Attempts to learn likely upcoming symbols, independently from how you like to
enter said symbols => should deal better with unannotated training texts, and
learn correlations better. (Still no game mode!)
Used for conversionid=3 or 4: assumes the route by which you write a symbol,
is context-independent in the former, context-dependent in the latter.
Src/DasherCore/Alphabet/AlphInfo.h | 2 +-
Src/DasherCore/LanguageModelling/Makefile.am | 2 +
.../LanguageModelling/RoutingPPMLanguageModel.cpp | 205 ++++++++++++++++++++
.../LanguageModelling/RoutingPPMLanguageModel.h | 93 +++++++++
Src/DasherCore/Makefile.am | 2 +
Src/DasherCore/NodeCreationManager.cpp | 9 +
Src/DasherCore/RoutingAlphMgr.cpp | 193 ++++++++++++++++++
Src/DasherCore/RoutingAlphMgr.h | 117 +++++++++++
Src/MacOSX/Dasher.xcodeproj/project.pbxproj | 16 ++
9 files changed, 638 insertions(+), 1 deletions(-)
---
diff --git a/Src/DasherCore/Alphabet/AlphInfo.h b/Src/DasherCore/Alphabet/AlphInfo.h
index 9b5d9fe..12d3885 100644
--- a/Src/DasherCore/Alphabet/AlphInfo.h
+++ b/Src/DasherCore/Alphabet/AlphInfo.h
@@ -121,7 +121,7 @@ public:
///Single-unicode characters used in the training file to delimit the name of a group
/// containing the next symbol, in order to disambiguate which group (=route, pronunciation)
/// was used to produce the symbol in this case (see MandarinTrainer).
- /// Only used if m_iConversionID==2. Default to "<" and ">"
+ /// Only used if m_iConversionID==2, 3 or 4. Default to "<" and ">"
std::string m_strConversionTrainStart,m_strConversionTrainStop;
~CAlphInfo();
diff --git a/Src/DasherCore/LanguageModelling/Makefile.am b/Src/DasherCore/LanguageModelling/Makefile.am
index 4ac9b80..b43e9d5 100644
--- a/Src/DasherCore/LanguageModelling/Makefile.am
+++ b/Src/DasherCore/LanguageModelling/Makefile.am
@@ -13,5 +13,7 @@ libdasherlm_a_SOURCES = \
PPMLanguageModel.h \
PPMPYLanguageModel.cpp \
PPMPYLanguageModel.h \
+ RoutingPPMLanguageModel.cpp \
+ RoutingPPMLanguageModel.h \
WordLanguageModel.cpp \
WordLanguageModel.h
diff --git a/Src/DasherCore/LanguageModelling/RoutingPPMLanguageModel.cpp b/Src/DasherCore/LanguageModelling/RoutingPPMLanguageModel.cpp
new file mode 100644
index 0000000..0737869
--- /dev/null
+++ b/Src/DasherCore/LanguageModelling/RoutingPPMLanguageModel.cpp
@@ -0,0 +1,205 @@
+//
+// RoutingPPMLanguageModel.cpp
+// Dasher
+//
+// Created by Alan Lawrence on 13/12/11.
+// Copyright 2011 Cambridge University. All rights reserved.
+//
+
+#include "RoutingPPMLanguageModel.h"
+
+using namespace Dasher;
+using namespace std;
+
+// Track memory leaks on Windows to the line that new'd the memory
+#ifdef _WIN32
+#ifdef _DEBUG
+#define DEBUG_NEW new( _NORMAL_BLOCK, THIS_FILE, __LINE__ )
+#define new DEBUG_NEW
+#undef THIS_FILE
+static char THIS_FILE[] = __FILE__;
+#endif
+#endif
+
+/////////////////////////////////////////////////////////////////////
+
+/// Sets up a routing PPM model over the supplied tables. Only the pointers
+/// are stored; the vectors themselves are not copied.
+/// \param pCreator passed through to the CAbstractPPM constructor
+/// \param pBaseSyms for each route number, the base symbol it produces
+/// \param pRoutes for each base symbol, the set of route numbers producing it
+/// \param bRoutesContextSensitive whether route preferences are learnt per
+///        context (true) or only at the top level of the tree (false)
+CRoutingPPMLanguageModel::CRoutingPPMLanguageModel(CSettingsUser *pCreator,
+                                                   const vector<symbol> *pBaseSyms,
+                                                   const vector<set<symbol> > *pRoutes,
+                                                   bool bRoutesContextSensitive)
+  : CAbstractPPM(pCreator,
+                 pRoutes->size()-1, //presumably excludes the entry for unknown symbol 0 - TODO confirm
+                 new CRoutingPPMnode(-1), //root node, carries no symbol
+                 //NOTE(review): evaluated before the base class is constructed;
+                 // confirm GetLongParameter does not rely on per-object state.
+                 GetLongParameter(LP_LM_MAX_ORDER)),
+    NodesAllocated(0),
+    m_NodeAlloc(8192),
+    m_pBaseSyms(pBaseSyms),
+    m_pRoutes(pRoutes),
+    m_bRoutesContextSensitive(bRoutesContextSensitive)
+{
+  //there cannot be more base symbols than routes
+  DASHER_ASSERT(pRoutes->size() <= pBaseSyms->size());
+}
+
+/// Predicts a probability for every (base symbol, route) pair.
+///
+/// Works in three stages:
+///  1. iUniform is spread evenly over all routes; the rest of norm is shared
+///     between the base symbols by walking the context's vine chain, and any
+///     residue is spread evenly over the base symbols;
+///  2. each base symbol's share is split between its routes in proportion to
+///     the route counts stored in the tree (only the root's children are
+///     consulted unless routes are context-sensitive);
+///  3. whatever part of a base symbol's share is still unassigned is divided
+///     evenly between all routes to that base symbol.
+///
+/// \param context  context to predict from (cast to CPPMContext)
+/// \param probs    output; resized to m_pBaseSyms->size(), element 0 is set to 0
+/// \param norm     total amount to distribute
+/// \param iUniform portion of norm distributed uniformly over all routes
+void CRoutingPPMLanguageModel::GetProbs(Context context, std::vector<unsigned int> &probs, int norm, int iUniform) const {
+  const CPPMContext *ppmcontext = (const CPPMContext *)(context);
+
+  const int iNumSymbols(m_pBaseSyms->size()); //i.e., the #routes - so loop from i=1 to <iNumSymbols
+  probs.resize(iNumSymbols);
+
+  unsigned int iToSpend = norm;
+  unsigned int iUniformLeft = iUniform;
+
+  // TODO: Sort out zero symbol case
+  probs[0] = 0;
+
+  //uniform part: dividing by the number of routes still to fill means the
+  // rounding remainder is absorbed by the later routes
+  for(int i = 1; i < iNumSymbols; i++) {
+    probs[i] = iUniformLeft / (iNumSymbols - i);
+    iUniformLeft -= probs[i];
+    iToSpend -= probs[i];
+  }
+
+  DASHER_ASSERT(iUniformLeft == 0);
+
+  int alpha = GetLongParameter( LP_LM_ALPHA );
+  int beta = GetLongParameter( LP_LM_BETA );
+
+  //first, fill out the probabilities of the base symbols, as per ordinary PPM
+  // (TODO, could move CPPMLanguageModel::GetProbs into CAbstractPPM, would do
+  // this for us?)
+  vector<unsigned int> baseProbs(GetSize()); //i.e. # base symbols
+  for (CPPMnode *pTemp = ppmcontext->head; pTemp; pTemp = pTemp->vine) {
+    int iTotal = 0;
+    for (ChildIterator it=pTemp->children(); it!=pTemp->end(); it++)
+      iTotal += (*it)->count;
+
+    if(iTotal) {
+      unsigned int size_of_slice = iToSpend;
+
+      for (ChildIterator it=pTemp->children(); it!=pTemp->end(); it++) {
+        unsigned int p = static_cast < myint > (size_of_slice) * (100 * (*it)->count - beta) / (100 * iTotal + alpha);
+
+        baseProbs[(*it)->sym] += p;
+        iToSpend -= p;
+      }
+    }
+  }
+
+  //anything left over, distribute evenly...
+  for (int i=1,iLeft=GetSize()-1; i<GetSize(); i++,iLeft--) {
+    unsigned int p = iToSpend / iLeft;
+    baseProbs[i] += p;
+    iToSpend -= p;
+  }
+  DASHER_ASSERT(iToSpend == 0);
+
+  //second, use those figures as the _total_ to divide up between the routes
+  // _for_each_base_symbol_.
+  for (CPPMnode *pTemp = ppmcontext->head; pTemp; pTemp=pTemp->vine) {
+    //context-insensitive routes are only recorded on the root's children
+    if (pTemp!=m_pRoot && !m_bRoutesContextSensitive) continue;
+
+    for (ChildIterator it = pTemp->children(); it!=pTemp->end(); it++) {
+      const CRoutingPPMnode *pNode(static_cast<CRoutingPPMnode*>(*it));
+      int iTotal=0; //total for base symbol corresponding to child (at this level of PPM tree)
+      for (map<symbol,unsigned short int>::const_iterator it2=pNode->m_routes.begin(); it2!=pNode->m_routes.end(); it2++)
+        iTotal += it2->second;
+      if (iTotal) {
+        //divvy up some of baseProbs according to the distribution
+        // of pNode->m_routes
+        unsigned int size_of_slice = baseProbs[pNode->sym];
+        for (map<symbol,unsigned short int>::const_iterator it2=pNode->m_routes.begin(); it2!=pNode->m_routes.end(); it2++) {
+          //Widen to myint before multiplying, exactly as for the base symbols
+          // above: slice * (100*count) does not fit in 32 bits once counts
+          // reach the hundreds, and a wrapped product would give a bogus p
+          // that makes the subtraction below underflow.
+          unsigned int p = static_cast < myint > (size_of_slice) * (100 * it2->second - beta) / (100*iTotal + alpha);
+          probs[it2->first] += p;
+          baseProbs[pNode->sym] -= p;
+        }
+      }
+    }
+  }
+
+  //for each base, distribute any remaining probability mass
+  // uniformly to all the routes to that base.
+  for (int i=1; i<GetSize(); i++) {
+    if (!baseProbs[i]) continue; //=already distributed
+
+    //ok, so there's some probability mass assigned to the base symbol,
+    // which we haven't assigned to any route
+    const set<symbol> &routes((*m_pRoutes)[i]);
+    //divide it up evenly
+    int iLeft = routes.size();
+    for (set<symbol>::iterator it = routes.begin(); it!=routes.end(); it++) {
+      unsigned int p = baseProbs[i] / iLeft;
+      probs[*it] += p;
+      baseProbs[i] -= p;
+      --iLeft;
+    }
+    DASHER_ASSERT(baseProbs[i]==0);
+  }
+}
+
+/////////////////////////////////////////////////////////////////////
+
+/// Returns the route most likely to have been used to enter the base symbol
+/// at the end of the given context.
+///
+/// Walks from the context's head node down the vine chain (stopping before
+/// the root), scoring each route from the counts stored on those nodes; when
+/// routes are not context-sensitive only the node whose vine is the root is
+/// consulted. If no counts are found, the first element of the base symbol's
+/// route set is returned.
+///
+/// \param ctx context (cast to CPPMContext) whose head must be non-null and
+///        not the root - this is only checked by DASHER_ASSERT.
+/// \return a route number for the base symbol context->head->sym
+symbol CRoutingPPMLanguageModel::GetBestRoute(Context ctx) {
+  const CPPMContext *context = (const CPPMContext *)ctx;
+  DASHER_ASSERT(context->head && context->head != m_pRoot);
+
+  map<symbol,unsigned int> probs; //of the routes leading to this base sym
+  int iToSpend = 1<<16; //arbitrary, could be anything
+  const int alpha = GetLongParameter(LP_LM_ALPHA), beta = GetLongParameter(LP_LM_BETA);
+
+  for (CPPMnode *pTemp = context->head; pTemp!=m_pRoot; pTemp=pTemp->vine) {
+    if (pTemp->vine!=m_pRoot && !m_bRoutesContextSensitive) continue;
+
+    const CRoutingPPMnode *node(static_cast<CRoutingPPMnode*>(pTemp));
+    unsigned long iTotal=0;
+    for (map<symbol,unsigned short int>::const_iterator it=node->m_routes.begin(); it!=node->m_routes.end(); it++)
+      iTotal += it->second;
+    if (!iTotal) continue;
+
+    //Do the arithmetic in myint, as GetProbs does for the base symbols:
+    // with a slice of 1<<16, slice*(100*count) overflows a 32-bit int as
+    // soon as a route count exceeds a few hundred.
+    // (assumes myint is wider than 32 bits - TODO confirm)
+    const myint size_of_slice(iToSpend);
+    const myint iDenominator = static_cast<myint>(iTotal)*100 + alpha;
+    for (map<symbol,unsigned short int>::const_iterator it=node->m_routes.begin(); it!=node->m_routes.end(); it++) {
+      const unsigned int p = static_cast<unsigned int>(size_of_slice * (100*it->second - beta) / iDenominator);
+      iToSpend-=p;
+      probs[it->first]+=p;
+    }
+  }
+  //Could divvy up rest uniformly...but there's no point, this won't affect
+  // which is most likely! (Except by rounding error, i.e. if iToSpend can't
+  // be divided evenly between the routes. But let's not worry about that,
+  // the worst that can happen is the user ends up in a different, very-nearly-equally-sized,
+  // box)
+
+  pair<symbol,unsigned int> best;//initially (0,0)
+  for (map<symbol, unsigned int>::iterator it=probs.begin(); it!=probs.end(); it++) {
+    DASHER_ASSERT((*m_pRoutes)[context->head->sym].count(it->first));
+    if (it->second>best.second) best=*it;
+  }
+
+  if (best.second) return best.first;
+  //no data. pick one at random
+  const set<symbol> &options((*m_pRoutes)[context->head->sym]);
+  //in fact, (very pseudo)-random:
+  return *(options.begin());
+}
+
+/// Learns a base symbol without recording any route for it, by delegating
+/// straight to CAbstractPPM::LearnSymbol.
+/// \param c context to learn in
+/// \param baseSym base symbol number (not a route number)
+void CRoutingPPMLanguageModel::LearnBaseSymbol(Context c, int baseSym) {
+ CAbstractPPM::LearnSymbol(c, baseSym);
+}
+
+/// Learns a (base symbol, route) pair: first learns the base symbol that the
+/// route maps to, then bumps the count for this route on the nodes of the
+/// resulting context.
+/// \param ctx context to learn in
+/// \param sym route number, used as an index into m_pBaseSyms
+void CRoutingPPMLanguageModel::LearnSymbol(Context ctx, int sym) {
+  const int iBase = m_pBaseSyms->at(sym);
+  LearnBaseSymbol(ctx, iBase);
+  //ctx now updated, points to node for learnt base sym
+
+  const set<symbol> &routes((*m_pRoutes)[iBase]);
+  DASHER_ASSERT(routes.size());
+  //an unambiguous base symbol needs no route counts: skip the bookkeeping
+  if (routes.size()==1) return;
+
+  CPPMnode *pNode = ((CPPMContext*)ctx)->head;
+  while (pNode != m_pRoot) {
+    //without context sensitivity, only the node directly above the root
+    // (the one whose vine is the root) stores route counts
+    const bool bLearnHere = m_bRoutesContextSensitive || pNode->vine==m_pRoot;
+    if (bLearnHere) {
+      //post-increment yields the previous count, i.e. 0 if the route was new here
+      //NOTE(review): the counter is an unsigned short, so it would wrap to 0
+      // after 65535 increments - confirm whether that can happen in practice.
+      const bool bSeenBefore = (static_cast<CRoutingPPMnode*>(pNode)->m_routes[sym]++ != 0);
+      if (bSeenBefore && bUpdateExclusion) break;
+    }
+    pNode = pNode->vine;
+  }
+}
+
+/// Obtains a node from the pooled allocator, labels it with the given symbol
+/// and counts it in NodesAllocated.
+/// \param sym symbol to store in the new node
+/// \return the node handed out by m_NodeAlloc
+CRoutingPPMLanguageModel::CRoutingPPMnode *CRoutingPPMLanguageModel::makeNode(int sym) {
+  CRoutingPPMnode *const pNode = m_NodeAlloc.Alloc();
+  ++NodesAllocated;
+  pNode->sym = sym;
+  return pNode;
+}
+
+/// Saving to file is not implemented for this model.
+/// \param strFilename ignored
+/// \return always false
+bool CRoutingPPMLanguageModel::WriteToFile(std::string strFilename) {
+ return false;
+}
+
+/// Loading from file is not implemented for this model.
+/// \param strFilename ignored
+/// \return always false
+bool CRoutingPPMLanguageModel::ReadFromFile(std::string strFilename) {
+ return false;
+}
diff --git a/Src/DasherCore/LanguageModelling/RoutingPPMLanguageModel.h b/Src/DasherCore/LanguageModelling/RoutingPPMLanguageModel.h
new file mode 100644
index 0000000..ffe1c62
--- /dev/null
+++ b/Src/DasherCore/LanguageModelling/RoutingPPMLanguageModel.h
@@ -0,0 +1,93 @@
+//
+// RoutingPPMLanguageModel.h
+// Dasher
+//
+// Created by Alan Lawrence on 13/12/11.
+// Copyright 2011 Cambridge University. All rights reserved.
+//
+
+#ifndef __RoutingPPMLanguageModel_h__
+#define __RoutingPPMLanguageModel_h__
+
+#include "PPMLanguageModel.h"
+
+#include <set>
+
+namespace Dasher {
+
+ ///
+ /// \ingroup LM
+ /// @{
+
+ ///
+ /// Routing Language Model: tries to independently learn a sequence of 'base'
+ /// symbols, as per PPM, and also which of multiple routes are used to enter them;
+ /// predicts probabilities for (base*route) by dividing probability for base sym
+ /// (as per first part of model) up between its possible routes according to
+ /// second part of model.
+ ///
+ /// All contexts are base syms only, so extends PPM over base sym; but overrides
+ /// GetProbs to return larger array of probs over (base*route), also LearnSymbol
+ /// (to learn base*route; if only base is available, call LearnBaseSymbol).
+ /// EnterSymbol (which doesn't do learning) takes base symbols (for context) only.
+ class CRoutingPPMLanguageModel : public CAbstractPPM {
+ public:
+ /// \param pBaseSyms vector identifying the base symbol for each (base+route).
+ /// Thus, size indicates the number of (base+route)s.
+ /// \param pRoutes vector identifying all possible route#s for each base sym
+ /// Thus, size indicates the number of base syms.
+ /// \param bRoutesContextSensitive if false, the distribution over (routes by
+ /// which a given symbol is entered) is considered independent of context;
+ /// if true, likely routes are learnt according to the preceding context.
+ /// Only the two pointers are stored, not copies of the vectors.
+ CRoutingPPMLanguageModel(CSettingsUser *pCreator, const std::vector<symbol> *pBaseSyms, const std::vector<std::set<symbol> > *pRoutes, bool bRoutesContextSensitive);
+
+ /// Learns a base symbol (but not which route we are likely to enter it by).
+ /// Includes moving on the context to include that base sym.
+ void LearnBaseSymbol(Context context, int Symbol);
+
+ /// Learns a base+route, including moving the context on to include the base.
+ /// \param Symbol a route number, i.e. an index into the pBaseSyms vector.
+ void LearnSymbol(Context context, int Symbol);
+
+ ///Note we can only ever enter base symbols.
+
+ ///Returns the most likely route by which a symbol might have been entered
+ /// \param ctx context whose most-recent character identifies the base symbol
+ /// in which we are interested.
+ symbol GetBestRoute(Context ctx);
+
+ ///Predicts probabilities for all (base*route)s.
+ /// \param Probs vector to fill with predictions; will be filled m_pBaseSyms->size()
+ /// elements (including initial 0)
+ /// \param norm total amount to share out between the elements
+ /// \param iUniform part of norm that is shared out evenly between all routes
+ virtual void GetProbs(Context context, std::vector < unsigned int >&Probs, int norm, int iUniform) const;
+
+ ///disable file i/o
+ /// (both implementations do nothing and return false)
+ virtual bool WriteToFile(std::string strFilename);
+ virtual bool ReadFromFile(std::string strFilename);
+
+ protected:
+ ///Subclass to additionally store counts of route by which this context (i.e.
+ /// the last base symbol within) was entered, when we know that.
+ class CRoutingPPMnode : public CPPMnode {
+ public:
+ ///map from route (to the last base sym only) to count by which that route
+ /// was definitely used.
+ std::map<symbol,unsigned short int> m_routes;
+ inline CRoutingPPMnode(int sym) : CPPMnode(sym) {}
+ inline CRoutingPPMnode() : CPPMnode() {}
+ };
+ ///Always returns a CRoutingPPMnode. TODO, work through class and use standard
+ /// map-less PPMnodes for unambiguous base syms (which have only one route) ?
+ CRoutingPPMnode *makeNode(int sym);
+
+ private:
+ ///Number of nodes handed out by makeNode so far.
+ int NodesAllocated;
+ ///Pooled allocator from which makeNode obtains nodes.
+ CSimplePooledAlloc < CRoutingPPMnode > m_NodeAlloc;
+ ///For each route number, the base symbol it produces (not owned here).
+ const std::vector<symbol> *m_pBaseSyms;
+ ///For each base symbol, the route numbers producing it (not owned here).
+ const std::vector<std::set<symbol> > *m_pRoutes;
+ ///If false, route counts are only kept/consulted at the top level of the
+ /// tree; if true, at every level of the context.
+ const bool m_bRoutesContextSensitive;
+ };
+
+ /// @}
+} // end namespace Dasher
+
+#endif
diff --git a/Src/DasherCore/Makefile.am b/Src/DasherCore/Makefile.am
index d47533e..dfa344e 100644
--- a/Src/DasherCore/Makefile.am
+++ b/Src/DasherCore/Makefile.am
@@ -105,6 +105,8 @@ libdashercore_a_SOURCES = \
OneButtonFilter.h \
OneDimensionalFilter.cpp \
OneDimensionalFilter.h \
+ RoutingAlphMgr.cpp \
+ RoutingAlphMgr.h \
SCENode.cpp \
SCENode.h \
ScreenGameModule.cpp \
diff --git a/Src/DasherCore/NodeCreationManager.cpp b/Src/DasherCore/NodeCreationManager.cpp
index f1c716a..26c333d 100644
--- a/Src/DasherCore/NodeCreationManager.cpp
+++ b/Src/DasherCore/NodeCreationManager.cpp
@@ -2,6 +2,7 @@
#include "DasherInterfaceBase.h"
#include "NodeCreationManager.h"
#include "MandarinAlphMgr.h"
+#include "RoutingAlphMgr.h"
#include "ConvertingAlphMgr.h"
#include "ControlManager.h"
#include "Observable.h"
@@ -78,6 +79,14 @@ CNodeCreationManager::CNodeCreationManager(CSettingsUser *pCreateFrom,
//(ACL) Modify AlphabetManager for Mandarin Dasher
m_pAlphabetManager = new CMandarinAlphMgr(this, pInterface, this, pAlphInfo);
break;
+ case 3: //these differ only in that conversion id 3 assumes the route by which
+ case 4: //the user writes a symbol, is not dependent on context (e.g. just user preference),
+ //whereas 4 assumes it does depend on context (e.g. phonetic chinese)
+ m_pAlphabetManager = new CRoutingAlphMgr(this, pInterface, this, pAlphInfo);
+ break;
+ //TODO: we could even just switch from standard alphmgr, to case 3, automatically
+ // if the alphabet has repeated symbols; and thus do away with much of the "conversionid"
+ // tag (just a flag for context-sensitivity, and maybe the start/stop delimiters?)
}
//all other configuration changes, etc., that might be necessary for a particular conversion mode,
// are implemented by AlphabetManager subclasses overriding the following two methods:
diff --git a/Src/DasherCore/RoutingAlphMgr.cpp b/Src/DasherCore/RoutingAlphMgr.cpp
new file mode 100644
index 0000000..204ae7a
--- /dev/null
+++ b/Src/DasherCore/RoutingAlphMgr.cpp
@@ -0,0 +1,193 @@
+//
+// RoutingAlphMgr.cpp
+// Dasher
+//
+// Created by Alan Lawrence on 13/12/11.
+// Copyright 2011 Cambridge University. All rights reserved.
+//
+
+#include "RoutingAlphMgr.h"
+#include "DasherInterfaceBase.h"
+using namespace std;
+using namespace Dasher;
+
+// Track memory leaks on Windows to the line that new'd the memory
+#ifdef _WIN32
+#ifdef _DEBUG_MEMLEAKS
+#define DEBUG_NEW new( _NORMAL_BLOCK, THIS_FILE, __LINE__ )
+#define new DEBUG_NEW
+#undef THIS_FILE
+static char THIS_FILE[] = __FILE__;
+#endif
+#endif
+
+/// Forwards all arguments to CAlphabetManager; does no further setup itself.
+/// The alphabet is expected to have conversion id 3 or 4 (checked only by
+/// DASHER_ASSERT).
+CRoutingAlphMgr::CRoutingAlphMgr(CSettingsUser *pCreator, CDasherInterfaceBase *pInterface, CNodeCreationManager *pNCManager, const CAlphInfo *pAlphabet)
+: CAlphabetManager(pCreator, pInterface, pNCManager, pAlphabet) {
+
+ DASHER_ASSERT(pAlphabet->m_iConversionID==3 || pAlphabet->m_iConversionID==4);
+}
+
+/// Builds the base-symbol tables from the alphabet:
+///  - m_map gets one entry per distinct symbol text (the base symbols);
+///  - m_vBaseSyms[route] is the base symbol for each alphabet symbol;
+///  - m_vRoutes[base] is the set of alphabet symbols sharing that text;
+///  - m_vGroupsByRoute[route] is the innermost group containing the symbol
+///    (left null for symbols in no group).
+void CRoutingAlphMgr::InitMap() {
+  const int iNumRoutes = m_pAlphabet->iEnd;
+
+  //index 0 is the unknown symbol in both tables
+  m_vBaseSyms.reserve(iNumRoutes);
+  m_vBaseSyms.push_back(0); //base for unknown route = unknown!
+  m_vRoutes.push_back(set<symbol>()); //unknown base symbol has no routes
+
+  for (int iRoute=1; iRoute<iNumRoutes; iRoute++) {
+    symbol base = m_map.Get(m_pAlphabet->GetText(iRoute));
+    if (base==0) {
+      //text not seen before: allocate the next base symbol number for it
+      base = m_vRoutes.size();
+      m_vRoutes.push_back(set<symbol>());
+      m_map.Add(m_pAlphabet->GetText(iRoute), base);
+    }
+    m_vBaseSyms.push_back(base);
+    m_vRoutes[base].insert(iRoute);
+  }
+
+  m_vGroupsByRoute.resize(m_vBaseSyms.size());
+  DASHER_ASSERT(!m_pAlphabet->pNext);
+
+  //Depth-first walk of the group tree with an explicit stack. A group is
+  // always handled before its children, so the children overwrite the
+  // entries of their parent, leaving the closest containing group.
+  vector<const SGroupInfo *> vPending;
+  if (m_pAlphabet->pChild) vPending.push_back(m_pAlphabet->pChild);
+  while (!vPending.empty()) {
+    const SGroupInfo *const pGroup = vPending.back();
+    vPending.pop_back();
+    for (int iRoute=pGroup->iStart; iRoute<pGroup->iEnd; iRoute++)
+      m_vGroupsByRoute[iRoute] = pGroup;
+    //sibling goes on first so that the child is popped (handled) first
+    if (pGroup->pNext) vPending.push_back(pGroup->pNext);
+    if (pGroup->pChild) vPending.push_back(pGroup->pChild);
+  }
+}
+
+/// Creates the CRoutingPPMLanguageModel over this manager's m_vBaseSyms and
+/// m_vRoutes tables (passed by pointer); routes are treated as
+/// context-sensitive exactly when the alphabet's conversion id is 4.
+void CRoutingAlphMgr::CreateLanguageModel() {
+ m_pLanguageModel = new CRoutingPPMLanguageModel(this, &m_vBaseSyms, &m_vRoutes, m_pAlphabet->m_iConversionID==4);
+}
+
+/// Text to write to the training file for this node. If the node's base
+/// symbol can be reached by more than one route, and a containing group was
+/// recorded for this route, the usual symbol text is prefixed with the group
+/// name wrapped in the alphabet's conversion-train start/stop delimiters.
+/// Otherwise the usual symbol text is returned unchanged.
+string CRoutingAlphMgr::CRoutedSym::trainText() {
+  CRoutingAlphMgr *const pMgr = mgr();
+  const set<symbol> &routes(pMgr->m_vRoutes[pMgr->m_vBaseSyms[iSymbol]]);
+  DASHER_ASSERT(routes.count(iSymbol));
+
+  string strPlain = CSymbolNode::trainText();
+
+  //unambiguous symbol: no annotation needed
+  if (routes.size()==1) return strPlain;
+
+  //no group to name the route by
+  const SGroupInfo *const pGroup = pMgr->m_vGroupsByRoute[iSymbol];
+  if (!pGroup) return strPlain;
+
+  return pMgr->m_pAlphabet->m_strConversionTrainStart + pGroup->strName + pMgr->m_pAlphabet->m_strConversionTrainStop + strPlain;
+}
+
+/// Forwards all arguments to the CSymbolNode constructor.
+/// \param iSymbol symbol number from the alphabet, i.e. a route number
+CRoutingAlphMgr::CRoutedSym::CRoutedSym(int iOffset, CDasherScreen::Label *pLabel, CRoutingAlphMgr *pMgr, symbol iSymbol)
+: CSymbolNode(iOffset, pLabel, pMgr, iSymbol) {
+}
+
+
+/// Creates a root node for the symbol at the end of the given context.
+/// The incoming sym is not used: it is immediately replaced by the route the
+/// language model considers most likely for the context's last base symbol.
+/// NOTE(review): GetBestRoute asserts that the context is non-empty, so the
+/// completely-empty context mentioned in the TODO below is not handled here.
+/// \return a newly allocated CRoutedSym for the chosen route
+CAlphabetManager::CAlphNode *CRoutingAlphMgr::CreateSymbolRoot(int iOffset, CLanguageModel::Context ctx, symbol sym) {
+ //sym is from the map, so a base symbol. It's at the end of the context,
+ // TODO unless this is the completely-empty context,
+ // so ask the LM for which way it's most likely to have been entered
+ sym = static_cast<CRoutingPPMLanguageModel*>(m_pLanguageModel)->GetBestRoute(ctx);
+ return new CRoutedSym(iOffset, m_vLabels[sym], this, sym);
+}
+
+/// Chooses the colour for a symbol node.
+/// If the alphabet specifies a colour for the route, that colour is used,
+/// with 130 added when the offset is even and the colour is below 130.
+/// Otherwise a default is taken from a small table indexed by the parity of
+/// the offset and by route modulo 3.
+/// \param route symbol number in the alphabet
+/// \param iOffset offset of the node; only its lowest bit is used
+int CRoutingAlphMgr::GetColour(symbol route, int iOffset) const {
+  const int iAlphColour = m_pAlphabet->GetColour(route); //colours were rehashed with CH symbol text
+
+  if (iAlphColour != -1) {
+    const bool bEvenOffset = (iOffset&1)==0;
+    if (bEvenOffset && iAlphColour<130) return iAlphColour+130;
+    return iAlphColour;
+  }
+
+  //none specified in alphabet: fall back on built-in defaults
+  static int colourStore[2][3] = {
+    {66,  //light blue
+     64,  //very light green
+     62}, //light yellow
+    {78,  //light purple
+     81,  //brownish
+     60}, //red
+  };
+  return colourStore[iOffset&1][route % 3];
+}
+
+
+/// Creates the child node for a given route under pParent.
+/// The new node's context is a clone of the parent's context into which the
+/// route's BASE symbol has been entered.
+/// \param pParent node whose offset and context the new node follows on from
+/// \param iSymbol symbol number from the alphabet, i.e. a route number
+/// \return a newly allocated CRoutedSym
+CDasherNode *CRoutingAlphMgr::CreateSymbolNode(CAlphNode *pParent, symbol iSymbol) {
+
+ int iNewOffset = pParent->offset()+1;
+ //a "\r\n" symbol advances the offset by two rather than one
+ if (m_pAlphabet->GetText(iSymbol)=="\r\n") iNewOffset++;
+ CSymbolNode *pAlphNode = new CRoutedSym(iNewOffset, m_vLabels[iSymbol], this, iSymbol);
+
+ pAlphNode->iContext = m_pLanguageModel->CloneContext(pParent->iContext);
+
+ //namely, we want to enter only the BASE symbol into the LM, not the route
+ // (which would be out of range):
+ m_pLanguageModel->EnterSymbol(pAlphNode->iContext, m_vBaseSyms[iSymbol]);
+ // (Unfortunately, we can't make EnterSymbol take route numbers, because
+ // it has base symbols passed to it from the alphabet map)
+ return pAlphNode;
+
+}
+
+/// Creates a trainer for the given manager's language model, alphabet and
+/// map, and works out which symbol starts a route annotation.
+/// m_iStartSym is set to the symbol for the alphabet's training-start
+/// delimiter if that delimiter maps to exactly one symbol; otherwise it is
+/// left at 0 and a warning is issued through pMsgs.
+/// \param pMsgs where warnings are reported
+/// \param pMgr manager whose model, alphabet and map are used for training
+CRoutingAlphMgr::CRoutingTrainer::CRoutingTrainer(CMessageDisplay *pMsgs, CRoutingAlphMgr *pMgr)
+: CTrainer(pMsgs, pMgr->m_pLanguageModel, pMgr->m_pAlphabet, &pMgr->m_map), m_pMgr(pMgr) {
+
+ m_iStartSym=0;
+ vector<symbol> trainStartSyms;
+ //translate the start delimiter into symbol numbers
+ m_pAlphabet->GetSymbols(trainStartSyms, m_pInfo->m_strConversionTrainStart);
+ if (trainStartSyms.size()==1)
+ m_iStartSym = trainStartSyms[0];
+ else
+ m_pMsgs->FormatMessageWithString(_("Warning: faulty alphabet definition: training-start delimiter %s must be a single unicode character. May be unable to process training file."),
+ m_pInfo->m_strConversionTrainStart.c_str());
+}
+
+/// Works out which route of a base symbol a training-file annotation names.
+/// \param bHaveRoute whether the training file gave an annotation at all
+/// \param strRoute the annotation text (empty if none was given)
+/// \param baseSym base symbol that followed the annotation
+/// \return the single route to baseSym whose containing group is named
+///   strRoute; or 0 if there is no such route or more than one. A warning is
+///   issued in the 0 case only if an annotation was actually given.
+symbol CRoutingAlphMgr::CRoutingTrainer::getRoute(bool bHaveRoute, const string &strRoute, symbol baseSym) {
+  const set<symbol> &candidates(m_pMgr->m_vRoutes.at(baseSym));
+
+  //collect the routes whose closest containing group carries the given name
+  set<symbol> named;
+  for (set<symbol>::iterator it=candidates.begin(); it!=candidates.end(); it++) {
+    const SGroupInfo *const pGroup = m_pMgr->m_vGroupsByRoute[*it];
+    if (pGroup && pGroup->strName == strRoute)
+      named.insert(*it);
+  }
+
+  //if no name was given, but a single group with no name exists, use it!
+  if (named.size()==1) return *(named.begin());
+
+  //otherwise, we will not learn a route - but this is fine, we can learn
+  // that later more-or-less independently
+
+  // don't flag a problem if no route specified
+  if (!bHaveRoute) return 0;
+
+  if (named.empty()) {
+    m_pMsgs->FormatMessageWith2Strings(_("Warning: training file contains character '%s' as member of group '%s', but no group of that name contains the character. Ignoring group specifier."),
+                                       m_pInfo->GetDisplayText(baseSym).c_str(),
+                                       strRoute.c_str());
+  } else {
+    m_pMsgs->FormatMessageWith2Strings(_("Warning: training file contains character '%s' as member of group '%s', but alphabet contains several such groups. Dasher will not be able to learn how you want to write this character."),
+                                       m_pInfo->GetDisplayText(baseSym).c_str(),
+                                       strRoute.c_str());
+  }
+  return 0;
+}
+
+/// Trains the language model from a stream of symbols.
+/// Each symbol may be preceded by a route annotation (start delimiter, text,
+/// stop delimiter). For every non-zero symbol read, either the route found
+/// by getRoute is learnt (LearnSymbol), or, failing that, just the base
+/// symbol (LearnBaseSymbol). Symbols consumed by readEscape are not learnt.
+/// \param syms stream to read symbols from, until it yields -1
+void CRoutingAlphMgr::CRoutingTrainer::Train(CAlphabetMap::SymbolStream &syms) {
+ CLanguageModel::Context trainContext = m_pLanguageModel->CreateEmptyContext();
+
+ //strRoute/bHaveRoute describe the annotation (if any) seen since the last learnt symbol
+ string strRoute; bool bHaveRoute(false);
+ for (symbol sym; (sym=syms.next(m_pAlphabet))!=-1;) {
+ if (sym == m_iStartSym) {
+ //if the start delimiter is not in the alphabet (m_iStartSym==0), compare the raw text just read instead
+ if (sym!=0 || syms.peekBack()==m_pInfo->m_strConversionTrainStart) {
+ if (bHaveRoute)
+ m_pMsgs->FormatMessageWithString(_("Warning: in training file, annotation '<%s>' is followed by another annotation and will be ignored"),
+ strRoute.c_str());
+ strRoute.clear(); bHaveRoute=true;
+ //accumulate text up to the stop delimiter (which is consumed but not appended), or until nothing more can be peeked
+ for (string s; (s=syms.peekAhead()).length(); strRoute+=s) {
+ syms.next(m_pAlphabet);
+ if (s==m_pInfo->m_strConversionTrainStop) break;
+ }
+ continue; //read next, hopefully a CH (!)
+ } //else, unknown symbol, but does not match pinyin delimiter; fallthrough
+ }
+ if (readEscape(trainContext, sym, syms)) continue; //TODO warn if py lost?
+ //OK, sym is a (CH) symbol to learn.
+ if (sym) {
+ //getRoute returns 0 when the annotation does not identify exactly one route
+ if (symbol route = getRoute(bHaveRoute, strRoute, sym))
+ m_pLanguageModel->LearnSymbol(trainContext, route);
+ else
+ static_cast<CRoutingPPMLanguageModel*>(m_pLanguageModel)->LearnBaseSymbol(trainContext, sym);
+ } //else, silently drop - as standard CTrainer
+ //an annotation applies only to the symbol immediately following it
+ bHaveRoute=false; strRoute.clear();
+ }
+ m_pLanguageModel->ReleaseContext(trainContext);
+}
+
+
+/// Creates a CRoutingTrainer for this manager.
+/// \return a newly allocated trainer (presumably owned by the caller - TODO confirm)
+CTrainer *CRoutingAlphMgr::GetTrainer() {
+ //m_pInterface is passed as the CMessageDisplay through which the trainer reports warnings;
+ // the trainer takes the language model, alphabet and map from this manager.
+ // (The previous comment here described separate pinyin/chinese alphabets, which do not appear in this call.)
+ return new CRoutingTrainer(m_pInterface, this);
+}
diff --git a/Src/DasherCore/RoutingAlphMgr.h b/Src/DasherCore/RoutingAlphMgr.h
new file mode 100644
index 0000000..b36f65b
--- /dev/null
+++ b/Src/DasherCore/RoutingAlphMgr.h
@@ -0,0 +1,117 @@
+//
+// RoutingAlphMgr.h
+// Dasher
+//
+// Created by Alan Lawrence on 13/12/11.
+// Copyright 2011 Cambridge University. All rights reserved.
+//
+
+#ifndef __RoutingAlphMgr_h__
+#define __RoutingAlphMgr_h__
+
+#include "../Common/Common.h"
+
+#include "AlphabetManager.h"
+#include "LanguageModelling/RoutingPPMLanguageModel.h"
+
+namespace Dasher {
+
+ class CDasherInterfaceBase;
+
+ /// \ingroup Model
+ /// @{
+
+ ///An AlphabetManager that works with alphabets containing duplicate symbols;
+ /// hence, an alternative to MandarinAlphMgr. Uses a RoutingPPMLanguageModel,
+ /// to separately learn both the output symbols and the ways the user wishes
+ /// to write them.
+ /// The alphabet + group structure is presented to the user exactly as per
+ /// alphabet definition, including repeated symbols, sized as per the LM; groups
+ /// play no part in modelling, just being sized to fit around their contents,
+ /// as per standard Dasher. However, just as in MandarinAlphMgr, the names of
+ /// the groups are used in training files to disambiguate which route was used
+ /// to enter a symbol - see nested class CRoutingTrainer. (The aim is that both
+ /// this and CMandarinAlphMgr can be used with the same training files.)
+ ///
+ /// Note we use the term 'base' or 'base symbol' to indicate a particular character
+ /// appearing in the output; this may appear multiple times in the alphabet, in
+ /// which case each occurrence is called a 'route'.
+ ///
+ /// This class is used for alphabets with conversionid 3 or 4; the former differs
+ /// in treating the route by which the user likes to enter a particular base symbol,
+ /// as not dependent on context.
+  class CRoutingAlphMgr : public CAlphabetManager {
+  public:
+    /// Create a RoutingAlphMgr! Changes are in InitMap() and CreateLanguageModel()...
+    CRoutingAlphMgr(CSettingsUser *pCreator, CDasherInterfaceBase *pInterface, CNodeCreationManager *pNCManager, const CAlphInfo *pAlphabet);
+
+    ///Override to return a CRoutingTrainer
+    CTrainer *GetTrainer();
+
+    ///Disable game mode. The target sentence might appear in several places...!!
+    CWordGeneratorBase *GetGameWords() {return NULL;}
+
+  protected:
+    ///Fills map w/ rehashed base symbols, filling m_vBaseSyms, m_vRoutes,
+    /// and m_vGroupsByRoute to record which symbols were identified together.
+    void InitMap();
+    ///Override to create a RoutingPPMLanguageModel
+    void CreateLanguageModel();
+
+    ///Creates a symbol, i.e. including route.
+    /// Both ctx and sym were reconstructed from m_map (filled by InitMap), so
+    /// are in terms of hashed base symbols; thus, this method identifies the best
+    /// route by which that base may have been entered, and creates a symbol node
+    /// for that.
+    CAlphNode *CreateSymbolRoot(int iOffset, CLanguageModel::Context ctx, symbol sym);
+
+    /// Override to create a CRoutedSym and enter only base sym into the LM
+    ///\param iSymbol symbol number from the alphabet defn, i.e. identifies both
+    /// base symbol and route
+    virtual CDasherNode *CreateSymbolNode(CAlphNode *pParent, symbol iSymbol);
+
+    ///Subclass to override trainText
+    class CRoutedSym : public CSymbolNode {
+    public:
+      ///Returns the text to write to a training file for this node; spelt
+      /// std::string so that this header does not depend on a using-directive
+      /// being in effect wherever it is included.
+      std::string trainText();
+      CRoutedSym(int iOffset, CDasherScreen::Label *pLabel, CRoutingAlphMgr *pMgr, symbol iSymbol);
+    protected:
+      ///The owning manager, downcast to its real type.
+      CRoutingAlphMgr *mgr() const {return static_cast<CRoutingAlphMgr*>(m_pMgr);}
+    };
+    ///Override to provide different defaults! Otherwise as GetColour,
+    /// this uses the character data in the alphabet anyway.
+    int GetColour(symbol CHsym, int iOffset) const;
+  private:
+    ///for each (not necessarily unique) symbol in the alphabet, the id of the unique base symbol with that text
+    std::vector<symbol> m_vBaseSyms;
+    ///for each base symbol, the symbol#'s of all syms-with-routes with that text
+    std::vector<std::set<symbol> > m_vRoutes;
+    ///closest containing group for each route
+    std::vector<const SGroupInfo*> m_vGroupsByRoute;
+
+    /// Trains a RoutingPPMLanguageModel. Just as for MandarinAlphMgr/PPMPY, the
+    /// training file is expected to consist of a sequence of CH syms (+ context
+    /// switch commands), where CH syms may be preceded by annotations <py>
+    /// (angle brackets are the default delimiters, alternatives may be provided
+    /// in the start & stop attributes of the alphabet conversionid tag). The PY
+    /// should identify exactly one group containing the following CH symbol (or
+    /// will be ignored, but the LM handles ambiguous base symbols where no route
+    /// is specified, somewhat better than PPMPY).
+    class CRoutingTrainer : public CTrainer {
+    public:
+      CRoutingTrainer(CMessageDisplay *pMsgs, CRoutingAlphMgr *pMgr);
+    protected:
+      //override...
+      virtual void Train(CAlphabetMap::SymbolStream &syms);
+    private:
+      ///Manager whose route and group tables are consulted during training
+      CRoutingAlphMgr * const m_pMgr;
+      ///Symbol # of the start-of-annotation, or 0 if out-of-alphabet
+      int m_iStartSym;
+      ///Finds the single route to baseSym whose containing group is named
+      /// strRoute; returns 0 if there is none or more than one.
+      symbol getRoute(bool bHaveRoute, const std::string &strRoute, symbol baseSym);
+    };
+  };
+ /// @}
+
+}
+
+#endif
diff --git a/Src/MacOSX/Dasher.xcodeproj/project.pbxproj b/Src/MacOSX/Dasher.xcodeproj/project.pbxproj
index e143703..9c368c9 100755
--- a/Src/MacOSX/Dasher.xcodeproj/project.pbxproj
+++ b/Src/MacOSX/Dasher.xcodeproj/project.pbxproj
@@ -394,6 +394,10 @@
E7B0BE301491E2D6003EFD33 /* alphabet.spyTonesNew.xml in Resources */ = {isa = PBXBuildFile; fileRef = E7B0BE2C1491E2D6003EFD33 /* alphabet.spyTonesNew.xml */; };
E7B0BE321491E305003EFD33 /* training_spyNew.txt in Resources */ = {isa = PBXBuildFile; fileRef = E7B0BE311491E305003EFD33 /* training_spyNew.txt */; };
E7C68E691430824D00440B5B /* Messages.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E7C68E681430824D00440B5B /* Messages.cpp */; };
+ E7DED58F1497599B005DE19D /* RoutingPPMLanguageModel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E7DED58E1497599B005DE19D /* RoutingPPMLanguageModel.cpp */; };
+ E7DED592149759AF005DE19D /* RoutingPPMLanguageModel.h in Headers */ = {isa = PBXBuildFile; fileRef = E7DED591149759AE005DE19D /* RoutingPPMLanguageModel.h */; };
+ E7DED59414976BC0005DE19D /* RoutingAlphMgr.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E7DED59314976BC0005DE19D /* RoutingAlphMgr.cpp */; };
+ E7DED59614976BD3005DE19D /* RoutingAlphMgr.h in Headers */ = {isa = PBXBuildFile; fileRef = E7DED59514976BD3005DE19D /* RoutingAlphMgr.h */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
@@ -802,6 +806,10 @@
E7B0BE2C1491E2D6003EFD33 /* alphabet.spyTonesNew.xml */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xml; path = alphabet.spyTonesNew.xml; sourceTree = "<group>"; };
E7B0BE311491E305003EFD33 /* training_spyNew.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = training_spyNew.txt; sourceTree = "<group>"; };
E7C68E681430824D00440B5B /* Messages.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Messages.cpp; sourceTree = "<group>"; };
+ E7DED58E1497599B005DE19D /* RoutingPPMLanguageModel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RoutingPPMLanguageModel.cpp; sourceTree = "<group>"; };
+ E7DED591149759AE005DE19D /* RoutingPPMLanguageModel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RoutingPPMLanguageModel.h; sourceTree = "<group>"; };
+ E7DED59314976BC0005DE19D /* RoutingAlphMgr.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RoutingAlphMgr.cpp; sourceTree = "<group>"; };
+ E7DED59514976BD3005DE19D /* RoutingAlphMgr.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RoutingAlphMgr.h; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@@ -1030,6 +1038,8 @@
1948BE9B0C226CFD001DFA32 /* XMLUtil.h */,
3300835E120CB7F900C41FAA /* ConvertingAlphMgr.h */,
3300835F120CB7F900C41FAA /* ConvertingAlphMgr.cpp */,
+ E7DED59314976BC0005DE19D /* RoutingAlphMgr.cpp */,
+ E7DED59514976BD3005DE19D /* RoutingAlphMgr.h */,
);
name = DasherCore;
path = ../DasherCore;
@@ -1073,6 +1083,8 @@
1948BE630C226CFD001DFA32 /* PPMLanguageModel.h */,
1948BE650C226CFD001DFA32 /* WordLanguageModel.cpp */,
1948BE660C226CFD001DFA32 /* WordLanguageModel.h */,
+ E7DED58E1497599B005DE19D /* RoutingPPMLanguageModel.cpp */,
+ E7DED591149759AE005DE19D /* RoutingPPMLanguageModel.h */,
);
path = LanguageModelling;
sourceTree = "<group>";
@@ -1474,6 +1486,8 @@
33DDB9E113B8AF360001C52D /* DynamicButtons.h in Headers */,
333B409512088AFA00235721 /* DemoFilter.h in Headers */,
E7641878142A48C70031FC91 /* Globber.h in Headers */,
+ E7DED592149759AF005DE19D /* RoutingPPMLanguageModel.h in Headers */,
+ E7DED59614976BD3005DE19D /* RoutingAlphMgr.h in Headers */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@@ -1832,6 +1846,8 @@
333B409412088AFA00235721 /* DemoFilter.cpp in Sources */,
E7641875142A48AD0031FC91 /* Globber.cpp in Sources */,
E7C68E691430824D00440B5B /* Messages.cpp in Sources */,
+ E7DED58F1497599B005DE19D /* RoutingPPMLanguageModel.cpp in Sources */,
+ E7DED59414976BC0005DE19D /* RoutingAlphMgr.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]