[dasher: 24/38] New alternative to Mandarin/PPMPY: RoutingAlphMgr & RoutingPPMLanguageModel



commit b67ddca45df9876e0abd105f8832a27e60e038f4
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date:   Sun Dec 18 22:11:40 2011 +0000

    New alternative to Mandarin/PPMPY: RoutingAlphMgr & RoutingPPMLanguageModel
    
    Attempts to learn likely upcoming symbols, independently from how you like to
     enter said symbols => should deal better with unannotated training texts, and
     learn correlations better. (Still no game mode!)
    
    Used for conversionid=3 or 4: assumes the route by which you write a symbol,
     is context-independent in the former, context-dependent in the latter.

 Src/DasherCore/Alphabet/AlphInfo.h                 |    2 +-
 Src/DasherCore/LanguageModelling/Makefile.am       |    2 +
 .../LanguageModelling/RoutingPPMLanguageModel.cpp  |  205 ++++++++++++++++++++
 .../LanguageModelling/RoutingPPMLanguageModel.h    |   93 +++++++++
 Src/DasherCore/Makefile.am                         |    2 +
 Src/DasherCore/NodeCreationManager.cpp             |    9 +
 Src/DasherCore/RoutingAlphMgr.cpp                  |  193 ++++++++++++++++++
 Src/DasherCore/RoutingAlphMgr.h                    |  117 +++++++++++
 Src/MacOSX/Dasher.xcodeproj/project.pbxproj        |   16 ++
 9 files changed, 638 insertions(+), 1 deletions(-)
---
diff --git a/Src/DasherCore/Alphabet/AlphInfo.h b/Src/DasherCore/Alphabet/AlphInfo.h
index 9b5d9fe..12d3885 100644
--- a/Src/DasherCore/Alphabet/AlphInfo.h
+++ b/Src/DasherCore/Alphabet/AlphInfo.h
@@ -121,7 +121,7 @@ public:
   ///Single-unicode characters used in the training file to delimit the name of a group
   /// containing the next symbol, in order to disambiguate which group (=route, pronunciation)
   /// was used to produce the symbol in this case (see MandarinTrainer).
-  /// Only used if m_iConversionID==2. Default to "<" and ">"
+  /// Only used if m_iConversionID==2, 3 or 4. Default to "<" and ">"
   std::string m_strConversionTrainStart,m_strConversionTrainStop;
 
   ~CAlphInfo();
diff --git a/Src/DasherCore/LanguageModelling/Makefile.am b/Src/DasherCore/LanguageModelling/Makefile.am
index 4ac9b80..b43e9d5 100644
--- a/Src/DasherCore/LanguageModelling/Makefile.am
+++ b/Src/DasherCore/LanguageModelling/Makefile.am
@@ -13,5 +13,7 @@ libdasherlm_a_SOURCES = \
 		PPMLanguageModel.h \
 		PPMPYLanguageModel.cpp \
 		PPMPYLanguageModel.h \
+		RoutingPPMLanguageModel.cpp \
+		RoutingPPMLanguageModel.h \
 		WordLanguageModel.cpp \
 		WordLanguageModel.h
diff --git a/Src/DasherCore/LanguageModelling/RoutingPPMLanguageModel.cpp b/Src/DasherCore/LanguageModelling/RoutingPPMLanguageModel.cpp
new file mode 100644
index 0000000..0737869
--- /dev/null
+++ b/Src/DasherCore/LanguageModelling/RoutingPPMLanguageModel.cpp
@@ -0,0 +1,205 @@
+//
+//  RoutingPPMLanguageModel.cpp
+//  Dasher
+//
+//  Created by Alan Lawrence on 13/12/11.
+//  Copyright 2011 Cambridge University. All rights reserved.
+//
+
+#include "RoutingPPMLanguageModel.h"
+
+using namespace Dasher;
+using namespace std;
+
+// Track memory leaks on Windows to the line that new'd the memory
+#ifdef _WIN32
+#ifdef _DEBUG
+#define DEBUG_NEW new( _NORMAL_BLOCK, THIS_FILE, __LINE__ )
+#define new DEBUG_NEW
+#undef THIS_FILE
+static char THIS_FILE[] = __FILE__;
+#endif
+#endif
+
+/////////////////////////////////////////////////////////////////////
+
+CRoutingPPMLanguageModel::CRoutingPPMLanguageModel(CSettingsUser *pCreator, const vector<symbol> *pBaseSyms, const vector<set<symbol> > *pRoutes, bool bRoutesContextSensitive)
+:CAbstractPPM(pCreator, pRoutes->size()-1, new CRoutingPPMnode(-1), GetLongParameter(LP_LM_MAX_ORDER)), NodesAllocated(0), m_NodeAlloc(8192), m_pBaseSyms(pBaseSyms), m_pRoutes(pRoutes), m_bRoutesContextSensitive(bRoutesContextSensitive) {
+  DASHER_ASSERT(pBaseSyms->size() >= pRoutes->size());
+}
+
+void CRoutingPPMLanguageModel::GetProbs(Context context, std::vector<unsigned int> &probs, int norm, int iUniform) const {
+  const CPPMContext *ppmcontext = (const CPPMContext *)(context);
+  //Two stages: (1) PPM probabilities over base symbols; (2) each base symbol's mass is divided between its routes.
+  const int iNumSymbols(m_pBaseSyms->size()); //i.e., the #routes - so loop from i=1 to <iNumSymbols
+  probs.resize(iNumSymbols);
+  
+  unsigned int iToSpend = norm;
+  unsigned int iUniformLeft = iUniform;
+  
+  // TODO: Sort out zero symbol case
+  probs[0] = 0;
+  
+  for(int i = 1; i < iNumSymbols; i++) {
+    probs[i] = iUniformLeft / (iNumSymbols - i);
+    iUniformLeft -= probs[i];
+    iToSpend -= probs[i];
+  }
+  
+  DASHER_ASSERT(iUniformLeft == 0);
+  
+  int alpha = GetLongParameter( LP_LM_ALPHA );
+  int beta = GetLongParameter( LP_LM_BETA );
+
+  //first, fill out the probabilities of the base symbols, as per ordinary PPM
+  // (TODO, could move CPPMLanguageModel::GetProbs into CAbstractPPM, would do
+  // this for us?)
+  vector<unsigned int> baseProbs(GetSize()); //i.e. # base symbols
+  for (CPPMnode *pTemp = ppmcontext->head; pTemp; pTemp = pTemp->vine) {
+    int iTotal = 0;
+    for (ChildIterator it=pTemp->children(); it!=pTemp->end(); it++)
+      iTotal += (*it)->count;
+    
+    if(iTotal) {
+      unsigned int size_of_slice = iToSpend;
+      
+      for (ChildIterator it=pTemp->children(); it!=pTemp->end(); it++) {
+        unsigned int p = static_cast < myint > (size_of_slice) * (100 * (*it)->count - beta) / (100 * iTotal + alpha);
+          
+        baseProbs[(*it)->sym] += p;
+        iToSpend -= p;
+      
+        //                              Usprintf(debug,TEXT("sym %u counts %d p %u tospend %u \n"),sym,s->count,p,tospend);      
+        //                              DebugOutput( debug);
+      }
+    } 
+  }
+  
+  //anything left over, distribute evenly...
+  for (int i=1,iLeft=GetSize()-1; i<GetSize(); i++,iLeft--) {
+    unsigned int p = iToSpend / iLeft;
+    baseProbs[i] += p;
+    iToSpend -= p;
+  }
+  DASHER_ASSERT(iToSpend == 0);
+  
+  //second, use those figures as the _total_ to divide up between the routes
+  // _for_each_base_symbol_.
+  for (CPPMnode *pTemp = ppmcontext->head; pTemp; pTemp=pTemp->vine) {
+    if (pTemp!=m_pRoot && !m_bRoutesContextSensitive) continue;
+
+    for (ChildIterator it = pTemp->children(); it!=pTemp->end(); it++) {
+      const CRoutingPPMnode *pNode(static_cast<CRoutingPPMnode*>(*it));
+      int iTotal=0; //total for base symbol corresponding to child (at this level of PPM tree)
+      for (map<symbol,unsigned short int>::const_iterator it2=pNode->m_routes.begin(); it2!=pNode->m_routes.end(); it2++)
+        iTotal += it2->second;
+      if (iTotal) {
+        //divvy up some of baseProbs according to the distribution
+        // of pNode->m_routes (widen to myint, as for base symbols above: slice*100*count can exceed 32 bits)
+        unsigned int size_of_slice = baseProbs[pNode->sym];
+        for (map<symbol,unsigned short int>::const_iterator it2=pNode->m_routes.begin(); it2!=pNode->m_routes.end(); it2++) {
+          unsigned int p = static_cast < myint > (size_of_slice) * (100 * it2->second - beta) / (100*iTotal + alpha);
+          probs[it2->first] += p;
+          baseProbs[pNode->sym] -= p;
+        }
+      }
+    }
+  }
+  
+  //for each base, distribute any remaining probability mass
+  // uniformly to all the routes to that base.
+  for (int i=1; i<GetSize(); i++) {
+    if (!baseProbs[i]) continue; //=already distributed
+    
+    //ok, so there's some probability mass assigned to the base symbol,
+    // which we haven't assigned to any route
+    const set<symbol> &routes((*m_pRoutes)[i]);
+    //divide it up evenly
+    int iLeft = routes.size();
+    for (set<symbol>::iterator it = routes.begin(); it!=routes.end(); it++) {
+      unsigned int p = baseProbs[i] / iLeft;
+      probs[*it] += p;
+      baseProbs[i] -= p;
+      --iLeft;
+    }
+    DASHER_ASSERT(baseProbs[i]==0);
+  }
+}
+
+/////////////////////////////////////////////////////////////////////
+
+symbol CRoutingPPMLanguageModel::GetBestRoute(Context ctx) {
+  const CPPMContext *context = (const CPPMContext *)ctx;
+  DASHER_ASSERT(context->head && context->head != m_pRoot);
+  //Accumulate a score for each route to the context's last base symbol; return the highest-scoring route.
+  map<symbol,unsigned int> probs; //of the routes leading to this base sym
+  int iToSpend = 1<<16; //arbitrary, could be anything
+  int alpha = GetLongParameter(LP_LM_ALPHA), beta=GetLongParameter(LP_LM_BETA);
+  
+  for (CPPMnode *pTemp = context->head; pTemp!=m_pRoot; pTemp=pTemp->vine) {
+    if (pTemp->vine!=m_pRoot && !m_bRoutesContextSensitive) continue;
+
+    const CRoutingPPMnode *node(static_cast<CRoutingPPMnode*>(pTemp));
+    unsigned long iTotal=0;
+    for (map<symbol,unsigned short int>::const_iterator it=node->m_routes.begin(); it!=node->m_routes.end(); it++)
+      iTotal += it->second;
+    if (!iTotal) continue;
+    const int size_of_slice(iToSpend); //widened to myint below: slice*100*count overflows int once count exceeds ~327
+    for (map<symbol,unsigned short int>::const_iterator it=node->m_routes.begin(); it!=node->m_routes.end(); it++) {
+      unsigned int p = static_cast<myint>(size_of_slice) * (100*it->second - beta) / static_cast<myint>(100*iTotal+ alpha);
+      iToSpend-=p;
+      probs[it->first]+=p;
+    }
+  }
+  //Could divvy up rest uniformly...but there's no point, this won't affect
+  // which is most likely! (Except by rounding error, i.e. if iToSpend can't
+  // be divided evenly between the routes. But let's not worry about that,
+  // the worst that can happen is the user ends up in a different, very-nearly-equally-sized,
+  // box)
+  
+  pair<symbol,unsigned int> best;//initially (0,0)
+  for (map<symbol, unsigned int>::iterator it=probs.begin(); it!=probs.end(); it++) {
+    DASHER_ASSERT((*m_pRoutes)[context->head->sym].count(it->first));
+    if (it->second>best.second) best=*it;
+  }
+  
+  if (best.second) return best.first;
+  //no data. pick one at random
+  const set<symbol> &options((*m_pRoutes)[context->head->sym]);
+  //in fact, (very pseudo)-random:
+  return *(options.begin());
+}
+
+void CRoutingPPMLanguageModel::LearnBaseSymbol(Context c, int baseSym) {
+  CAbstractPPM::LearnSymbol(c, baseSym);
+}
+
+void CRoutingPPMLanguageModel::LearnSymbol(Context ctx, int sym) {
+  int base = m_pBaseSyms->at(sym); //sym identifies a (base+route); look up its base symbol
+  LearnBaseSymbol(ctx, base);
+  //ctx now updated, points to node for learnt base sym
+  DASHER_ASSERT((*m_pRoutes)[base].size());
+  if ((*m_pRoutes)[base].size()==1) return; //no need to store, saves computation if we don't
+  for (CPPMnode *node=((CPPMContext*)ctx)->head; node!=m_pRoot; node=node->vine) {
+    if (node->vine!=m_pRoot && !m_bRoutesContextSensitive) continue; //context-insensitive: count routes only on the node whose vine is the root
+    else if (static_cast<CRoutingPPMnode*>(node)->m_routes[sym]++) //returns old value, i.e. 0 if not present
+      if (bUpdateExclusion) break; //route was already recorded at this node: with update exclusion, stop before shorter contexts
+  }
+}
+
+CRoutingPPMLanguageModel::CRoutingPPMnode *CRoutingPPMLanguageModel::makeNode(int sym) {
+  CRoutingPPMnode *res = m_NodeAlloc.Alloc();
+  res->sym=sym;
+  ++NodesAllocated;
+  return res;
+}
+
+//File I/O is disabled for the routing language model: this always returns false
+bool CRoutingPPMLanguageModel::WriteToFile(std::string strFilename) {
+  return false;
+}
+
+//File I/O is disabled for the routing language model: this always returns false
+bool CRoutingPPMLanguageModel::ReadFromFile(std::string strFilename) {
+  return false;
+}
diff --git a/Src/DasherCore/LanguageModelling/RoutingPPMLanguageModel.h b/Src/DasherCore/LanguageModelling/RoutingPPMLanguageModel.h
new file mode 100644
index 0000000..ffe1c62
--- /dev/null
+++ b/Src/DasherCore/LanguageModelling/RoutingPPMLanguageModel.h
@@ -0,0 +1,93 @@
+//
+//  RoutingPPMLanguageModel.h
+//  Dasher
+//
+//  Created by Alan Lawrence on 13/12/11.
+//  Copyright 2011 Cambridge University. All rights reserved.
+//
+
+#ifndef __RoutingPPMLanguageModel_h__
+#define __RoutingPPMLanguageModel_h__
+
+#include "PPMLanguageModel.h"
+
+#include <set>
+
+namespace Dasher {
+  
+  ///
+  /// \ingroup LM
+  /// @{
+  
+  ///
+  /// Routing Language Model: tries to independently learn a sequence of 'base'
+  /// symbols, as per PPM, and also which of multiple routes are used to enter them;
+  /// predicts probabilities for (base*route) by dividing probability for base sym
+  /// (as per first part of model) up between its possible routes according to
+  /// second part of model.
+  ///
+  /// All contexts are base syms only, so extends PPM over base sym; but overrides
+  /// GetProbs to return larger array of probs over (base*route), also LearnSymbol
+  /// (to learn base*route; if only base is available, call LearnBaseSymbol).
+  /// EnterSymbol (which doesn't do learning) takes base symbols (for context) only.
+  class CRoutingPPMLanguageModel : public CAbstractPPM {
+  public:
+    /// \param pBaseSyms vector identifying the base symbol for each (base+route).
+    ///        Thus, size indicates the number of (base+route)s.
+    /// \param pRoutes vector identifying all possible route#s for each base sym
+    ///        Thus, size indicates the number of base syms.
+    /// \param bRoutesContextSensitive if false, the distribution over (routes by
+    ///        which a given symbol is entered) is considered independent of context;
+    ///        if true, likely routes are learnt according to the preceding context.
+    CRoutingPPMLanguageModel(CSettingsUser *pCreator, const std::vector<symbol> *pBaseSyms, const std::vector<std::set<symbol> > *pRoutes, bool bRoutesContextSensitive);
+    
+    /// Learns a base symbol (but not which route we are likely to enter it by).
+    /// Includes moving on the context to include that base sym.
+    void LearnBaseSymbol(Context context, int Symbol);
+    
+    /// Learns a base+route, including moving the context on to include the base.
+    void LearnSymbol(Context context, int Symbol);
+
+    ///Note we can only ever enter base symbols.
+
+    ///Returns the most likely route by which a symbol might have been entered
+    /// \param ctx context whose most-recent character identifies the base symbol
+    ///        in which we are interested.
+    symbol GetBestRoute(Context ctx);
+        
+    ///Predicts probabilities for all (base*route)s.
+    /// \param Probs vector to fill with predictions; will be filled m_pBaseSyms->size()
+    ///  elements (including initial 0)
+    virtual void GetProbs(Context context, std::vector < unsigned int >&Probs, int norm, int iUniform) const;
+
+    ///disable file i/o
+    virtual bool WriteToFile(std::string strFilename);
+    virtual bool ReadFromFile(std::string strFilename);
+    
+  protected:
+    ///Subclass to additionally store counts of route by which this context (i.e.
+    /// the last base symbol within) was entered, when we know that.
+    class CRoutingPPMnode : public CPPMnode {
+    public:
+      ///map from route (to the last base sym only) to count by which that route
+      /// was definitely used.
+      std::map<symbol,unsigned short int> m_routes;
+      inline CRoutingPPMnode(int sym) : CPPMnode(sym) {}
+      inline CRoutingPPMnode() : CPPMnode() {}
+    };
+    ///Always returns a CRoutingPPMnode. TODO, work through class and use standard
+    /// map-less PPMnodes for unambiguous base syms (which have only one route) ?
+    CRoutingPPMnode *makeNode(int sym);
+    
+  private:
+    int NodesAllocated;
+    CSimplePooledAlloc < CRoutingPPMnode > m_NodeAlloc;
+    const std::vector<symbol> *m_pBaseSyms;
+    const std::vector<std::set<symbol> > *m_pRoutes;
+    const bool m_bRoutesContextSensitive;
+  };
+  
+  /// @}  
+}                               // end namespace Dasher
+
+#endif
diff --git a/Src/DasherCore/Makefile.am b/Src/DasherCore/Makefile.am
index d47533e..dfa344e 100644
--- a/Src/DasherCore/Makefile.am
+++ b/Src/DasherCore/Makefile.am
@@ -105,6 +105,8 @@ libdashercore_a_SOURCES = \
 		OneButtonFilter.h \
 		OneDimensionalFilter.cpp \
 		OneDimensionalFilter.h \
+		RoutingAlphMgr.cpp \
+		RoutingAlphMgr.h \
 		SCENode.cpp \
 		SCENode.h \
 		ScreenGameModule.cpp \
diff --git a/Src/DasherCore/NodeCreationManager.cpp b/Src/DasherCore/NodeCreationManager.cpp
index f1c716a..26c333d 100644
--- a/Src/DasherCore/NodeCreationManager.cpp
+++ b/Src/DasherCore/NodeCreationManager.cpp
@@ -2,6 +2,7 @@
 #include "DasherInterfaceBase.h"
 #include "NodeCreationManager.h"
 #include "MandarinAlphMgr.h"
+#include "RoutingAlphMgr.h"
 #include "ConvertingAlphMgr.h"
 #include "ControlManager.h"
 #include "Observable.h"
@@ -78,6 +79,14 @@ CNodeCreationManager::CNodeCreationManager(CSettingsUser *pCreateFrom,
       //(ACL) Modify AlphabetManager for Mandarin Dasher
       m_pAlphabetManager = new CMandarinAlphMgr(this, pInterface, this, pAlphInfo);
       break;
+    case 3: //these differ only in that conversion id 3 assumes the route by which
+    case 4: //the user writes a symbol, is not dependent on context (e.g. just user preference),
+            //whereas 4 assumes it does depend on context (e.g. phonetic chinese)
+      m_pAlphabetManager = new CRoutingAlphMgr(this, pInterface, this, pAlphInfo);
+      break;
+      //TODO: we could even just switch from standard alphmgr, to case 3, automatically
+      // if the alphabet has repeated symbols; and thus do away with much of the "conversionid"
+      // tag (just a flag for context-sensitivity, and maybe the start/stop delimiters?)
   }
   //all other configuration changes, etc., that might be necessary for a particular conversion mode,
   // are implemented by AlphabetManager subclasses overriding the following two methods:
diff --git a/Src/DasherCore/RoutingAlphMgr.cpp b/Src/DasherCore/RoutingAlphMgr.cpp
new file mode 100644
index 0000000..204ae7a
--- /dev/null
+++ b/Src/DasherCore/RoutingAlphMgr.cpp
@@ -0,0 +1,193 @@
+//
+//  RoutingAlphMgr.cpp
+//  Dasher
+//
+//  Created by Alan Lawrence on 13/12/11.
+//  Copyright 2011 Cambridge University. All rights reserved.
+//
+
+#include "RoutingAlphMgr.h"
+#include "DasherInterfaceBase.h"
+using namespace std;
+using namespace Dasher;
+
+// Track memory leaks on Windows to the line that new'd the memory
+#ifdef _WIN32
+#ifdef _DEBUG_MEMLEAKS
+#define DEBUG_NEW new( _NORMAL_BLOCK, THIS_FILE, __LINE__ )
+#define new DEBUG_NEW
+#undef THIS_FILE
+static char THIS_FILE[] = __FILE__;
+#endif
+#endif
+
+CRoutingAlphMgr::CRoutingAlphMgr(CSettingsUser *pCreator, CDasherInterfaceBase *pInterface, CNodeCreationManager *pNCManager, const CAlphInfo *pAlphabet)
+: CAlphabetManager(pCreator, pInterface, pNCManager, pAlphabet) {
+  
+  DASHER_ASSERT(pAlphabet->m_iConversionID==3 || pAlphabet->m_iConversionID==4);
+}
+
+void CRoutingAlphMgr::InitMap() {
+  m_vBaseSyms.reserve(m_pAlphabet->iEnd); m_vBaseSyms.push_back(0); //base for unknown route = unknown!
+  m_vRoutes.push_back(set<symbol>()); //unknown base symbol has no routes
+  for (int i=1; i<m_pAlphabet->iEnd; i++) {
+    symbol s = m_map.Get(m_pAlphabet->GetText(i)); //base symbol already assigned to this text, if any
+    if (s==0) { //text not seen before: allocate the next base symbol number for it
+      s=m_vRoutes.size();
+      m_vRoutes.push_back(set<symbol>());
+      m_map.Add(m_pAlphabet->GetText(i),s);
+    }
+    m_vBaseSyms.push_back(s);
+    m_vRoutes[s].insert(i);
+  }
+  m_vGroupsByRoute.resize(m_vBaseSyms.size()); //one entry per route; remains null for any route not covered by a group
+  vector<const SGroupInfo *> vGroups; //stack of groups still to visit
+  DASHER_ASSERT(!m_pAlphabet->pNext);
+  vGroups.push_back(m_pAlphabet->pChild);
+  while (!vGroups.empty()) {
+    const SGroupInfo *g(vGroups.back()); vGroups.pop_back();
+    if (!g) continue;
+    for (int i=g->iStart; i<g->iEnd; i++) m_vGroupsByRoute[i]=g; //a group is visited before its children, so a child group covering the same route overwrites this entry later
+    vGroups.push_back(g->pNext);
+    vGroups.push_back(g->pChild);
+  }
+}
+
+void CRoutingAlphMgr::CreateLanguageModel() {
+  m_pLanguageModel = new CRoutingPPMLanguageModel(this, &m_vBaseSyms, &m_vRoutes, m_pAlphabet->m_iConversionID==4);
+}
+
+string CRoutingAlphMgr::CRoutedSym::trainText() {
+  const set<symbol> &routes(mgr()->m_vRoutes[mgr()->m_vBaseSyms[iSymbol]]);
+  DASHER_ASSERT(routes.count(iSymbol));
+  string t=CSymbolNode::trainText();
+  if (routes.size()!=1)
+    if (const SGroupInfo *g = mgr()->m_vGroupsByRoute[iSymbol])
+      return mgr()->m_pAlphabet->m_strConversionTrainStart + g->strName + mgr()->m_pAlphabet->m_strConversionTrainStop + t;
+  return t;
+}
+
+CRoutingAlphMgr::CRoutedSym::CRoutedSym(int iOffset, CDasherScreen::Label *pLabel, CRoutingAlphMgr *pMgr, symbol iSymbol)
+: CSymbolNode(iOffset, pLabel, pMgr, iSymbol) {
+};
+
+
+CAlphabetManager::CAlphNode *CRoutingAlphMgr::CreateSymbolRoot(int iOffset, CLanguageModel::Context ctx, symbol sym) {
+  //sym is from the map, so a base symbol. It's at the end of the context,
+  // TODO unless this is the completely-empty context,
+  // so ask the LM for which way it's most likely to have been entered
+  sym = static_cast<CRoutingPPMLanguageModel*>(m_pLanguageModel)->GetBestRoute(ctx);
+  return new CRoutedSym(iOffset, m_vLabels[sym], this, sym);
+}
+
+int CRoutingAlphMgr::GetColour(symbol route, int iOffset) const {
+  int iColour = m_pAlphabet->GetColour(route); //colours were rehashed with CH symbol text
+  if (iColour==-1) {
+    //none specified in alphabet
+    static int colourStore[2][3] = {
+      {66,//light blue
+        64,//very light green
+        62},//light yellow
+      {78,//light purple
+        81,//brownish
+        60},//red
+    };    
+    return colourStore[iOffset&1][route % 3];
+  }
+  if ((iOffset&1)==0 && iColour<130) iColour+=130;
+  return iColour;
+}
+
+
+CDasherNode *CRoutingAlphMgr::CreateSymbolNode(CAlphNode *pParent, symbol iSymbol) {
+
+  int iNewOffset = pParent->offset()+1;
+  if (m_pAlphabet->GetText(iSymbol)=="\r\n") iNewOffset++;
+  CSymbolNode *pAlphNode = new CRoutedSym(iNewOffset, m_vLabels[iSymbol], this, iSymbol);
+  
+  pAlphNode->iContext = m_pLanguageModel->CloneContext(pParent->iContext);
+  
+  //namely, we want to enter only the BASE symbol into the LM, not the route
+  // (which would be out of range):
+  m_pLanguageModel->EnterSymbol(pAlphNode->iContext, m_vBaseSyms[iSymbol]);
+  // (Unfortunately, we can't make EnterSymbol take route numbers, because
+  // it has base symbols passed to it from the alphabet map)
+  return pAlphNode;
+
+}
+
+CRoutingAlphMgr::CRoutingTrainer::CRoutingTrainer(CMessageDisplay *pMsgs, CRoutingAlphMgr *pMgr)
+: CTrainer(pMsgs, pMgr->m_pLanguageModel, pMgr->m_pAlphabet, &pMgr->m_map), m_pMgr(pMgr) {
+  
+  m_iStartSym=0;  
+  vector<symbol> trainStartSyms;
+  m_pAlphabet->GetSymbols(trainStartSyms, m_pInfo->m_strConversionTrainStart);
+  if (trainStartSyms.size()==1)
+    m_iStartSym = trainStartSyms[0];
+  else
+    m_pMsgs->FormatMessageWithString(_("Warning: faulty alphabet definition: training-start delimiter %s must be a single unicode character. May be unable to process training file."),
+                                     m_pInfo->m_strConversionTrainStart.c_str());
+}
+
+symbol CRoutingAlphMgr::CRoutingTrainer::getRoute(bool bHaveRoute, const string &strRoute, symbol baseSym) {  
+  const set<symbol> &candidates(m_pMgr->m_vRoutes.at(baseSym));
+  set<symbol> named;
+  for (set<symbol>::iterator it=candidates.begin(); it!=candidates.end(); it++)
+    if (const SGroupInfo *g=m_pMgr->m_vGroupsByRoute[*it])
+      if (g->strName == strRoute)
+        named.insert(*it);
+  //if no name was given, but a single group with no name exists, use it!
+  if (named.size()==1) return *(named.begin());
+  //otherwise, we will not learn a route - but this is fine, we can learn
+  // that later more-or-less independently
+  
+  if (bHaveRoute) {
+    m_pMsgs->FormatMessageWith2Strings((named.size()==0)
+                                       ? _("Warning: training file contains character '%s' as member of group '%s', but no group of that name contains the character. Ignoring group specifier.")
+                                       : _("Warning: training file contains character '%s' as member of group '%s', but alphabet contains several such groups. Dasher will not be able to learn how you want to write this character."),
+                                         m_pInfo->GetDisplayText(baseSym).c_str(),
+                                         strRoute.c_str());
+  }
+  // don't flag a problem if no route specified
+  
+  return 0;
+}
+
+void CRoutingAlphMgr::CRoutingTrainer::Train(CAlphabetMap::SymbolStream &syms) {
+  CLanguageModel::Context trainContext = m_pLanguageModel->CreateEmptyContext();
+  
+  string strRoute; bool bHaveRoute(false);
+  for (symbol sym; (sym=syms.next(m_pAlphabet))!=-1;) {
+    if (sym == m_iStartSym) {
+      if (sym!=0 || syms.peekBack()==m_pInfo->m_strConversionTrainStart) {
+        if (bHaveRoute)
+          m_pMsgs->FormatMessageWithString(_("Warning: in training file, annotation '<%s>' is followed by another annotation and will be ignored"),
+                                           strRoute.c_str());
+        strRoute.clear(); bHaveRoute=true;
+        for (string s; (s=syms.peekAhead()).length(); strRoute+=s) {
+          syms.next(m_pAlphabet);
+          if (s==m_pInfo->m_strConversionTrainStop) break;
+        }
+        continue; //read next, hopefully a CH (!)
+      } //else, unknown symbol, but does not match pinyin delimiter; fallthrough
+    }
+    if (readEscape(trainContext, sym, syms)) continue; //TODO warn if py lost?
+                                                       //OK, sym is a (CH) symbol to learn.
+    if (sym) {
+      if (symbol route = getRoute(bHaveRoute, strRoute, sym))
+        m_pLanguageModel->LearnSymbol(trainContext, route);
+      else
+        static_cast<CRoutingPPMLanguageModel*>(m_pLanguageModel)->LearnBaseSymbol(trainContext, sym);
+    } //else, silently drop - as standard CTrainer
+    bHaveRoute=false; strRoute.clear();
+  }
+  m_pLanguageModel->ReleaseContext(trainContext);
+}
+
+
+CTrainer *CRoutingAlphMgr::GetTrainer() {
+  //The trainer reads everything it needs from this manager: CRoutingTrainer's constructor passes
+  // our language model, alphabet (m_pAlphabet) and rehashed base-symbol map (m_map) on to CTrainer.
+  // NOTE(review): the previous comment here described a separate pinyin alphabet; none is used in this class.
+  return new CRoutingTrainer(m_pInterface, this);
+}
diff --git a/Src/DasherCore/RoutingAlphMgr.h b/Src/DasherCore/RoutingAlphMgr.h
new file mode 100644
index 0000000..b36f65b
--- /dev/null
+++ b/Src/DasherCore/RoutingAlphMgr.h
@@ -0,0 +1,117 @@
+//
+//  RoutingAlphMgr.h
+//  Dasher
+//
+//  Created by Alan Lawrence on 13/12/11.
+//  Copyright 2011 Cambridge University. All rights reserved.
+//
+
+#ifndef __RoutingAlphMgr_h__
+#define __RoutingAlphMgr_h__
+
+#include "../Common/Common.h"
+
+#include "AlphabetManager.h"
+#include "LanguageModelling/RoutingPPMLanguageModel.h"
+
+namespace Dasher {
+  
+  class CDasherInterfaceBase;
+  
+  /// \ingroup Model
+  /// @{
+  
+  ///An AlphabetManager that works with alphabets containing duplicate symbols;
+  /// hence, an alternative to MandarinAlphMgr. Uses a RoutingPPMLanguageModel,
+  /// to separately learn both the output symbols and the ways the user wishes
+  /// to write them.
+  /// The alphabet + group structure is presented to the user exactly as per
+  /// alphabet definition, including repeated symbols, sized as per the LM; groups
+  /// play no part in modelling, just being sized to fit around their contents,
+  /// as per standard Dasher. However, just as in MandarinAlphMgr, the names of
+  /// the groups are used in training files to disambiguate which route was used
+  /// to enter a symbol - see nested class CRoutingTrainer. (The aim is that both
+  /// this and CMandarinAlphMgr can be used with the same training files.)
+  ///
+  /// Note we use the term 'base' or 'base symbol' to indicate a particular character
+  /// appearing in the output; this may appear multiple times in the alphabet, in
+  /// which case each occurrence is called a 'route'.
+  ///
+  /// This class is used for alphabets with conversionid 3 or 4; the former differs
+  /// in treating the route by which the user likes to enter a particular base symbol,
+  /// as not dependent on context.
+  class CRoutingAlphMgr : public CAlphabetManager {
+  public:
+    /// Create a RoutingAlphMgr! Changes are in InitMap() and CreateLanguageModel()...
+    CRoutingAlphMgr(CSettingsUser *pCreator, CDasherInterfaceBase *pInterface, CNodeCreationManager *pNCManager, const CAlphInfo *pAlphabet);
+    
+    ///Override to return a CRoutingTrainer
+    CTrainer *GetTrainer();
+    
+    ///Disable game mode. The target sentence might appear in several places...!!
+    CWordGeneratorBase *GetGameWords() {return NULL;}
+
+  protected:
+    ///Fills map w/ rehashed base symbols, filling m_vBaseSyms, m_vRoutes,
+    /// and m_vGroupsByRoute to record which symbols were identified together.
+    void InitMap();
+    ///Override to create a RoutingPPMLanguageModel
+    void CreateLanguageModel();
+
+    ///Creates a symbol, i.e. including route.
+    /// Both ctx and sym were reconstructed from m_map (filled by InitMap), so
+    /// are in terms of hashed base symbols; thus, this method identifies the best
+    /// route by which that base may have been entered, and creates a symbol node
+    /// for that.
+    CAlphNode *CreateSymbolRoot(int iOffset, CLanguageModel::Context ctx, symbol sym);
+    
+    /// Override to create a CRoutedSym and enter only base sym into the LM
+    ///\param iSymbol symbol number from the alphabet defn, i.e. identifies both
+    /// base symbol and route
+    virtual CDasherNode *CreateSymbolNode(CAlphNode *pParent, symbol iSymbol);
+
+    ///Subclass to override trainText
+    class CRoutedSym : public CSymbolNode {
+    public:
+      string trainText();
+      CRoutedSym(int iOffset, CDasherScreen::Label *pLabel, CRoutingAlphMgr *pMgr, symbol iSymbol);
+    protected:
+      CRoutingAlphMgr *mgr() const {return static_cast<CRoutingAlphMgr*>(m_pMgr);}
+    };
+    ///Override to provide different defaults! Otherwise as GetColour,
+    /// this uses the character data in the alphabet anyway.
+    int GetColour(symbol CHsym, int iOffset) const;
+  private:
+    ///for each (not necessarily unique) symbol in the alphabet, the id of the unique base symbol with that text
+    std::vector<symbol> m_vBaseSyms;
+    ///for each base symbol, the symbol#'s of all syms-with-routes with that text
+    std::vector<std::set<symbol> > m_vRoutes;
+    ///closest containing group for each route
+    std::vector<const SGroupInfo*> m_vGroupsByRoute;
+    
+    /// Trains a RoutingPPMLanguageModel. Just as for MandarinAlphMgr/PPMPY, the
+    /// training file is expected to consist of a sequence of CH syms (+ context
+    /// switch commands), where CH syms may be preceded by annotations <py> 
+    /// (angle brackets are the default delimiters, alternatives may be provided
+    /// in the start & stop attributes of the alphabet conversionid tag). The PY
+    /// should identify exactly one group containing the following CH symbol (or
+    /// will be ignored, but the LM handles ambiguous base symbols where no route
+    /// is specified, somewhat better than PPMPY).
+    class CRoutingTrainer : public CTrainer {
+    public:
+      CRoutingTrainer(CMessageDisplay *pMsgs, CRoutingAlphMgr *pMgr);
+    protected:
+      //override...
+      virtual void Train(CAlphabetMap::SymbolStream &syms);
+    private:
+      CRoutingAlphMgr * const m_pMgr;
+      ///Symbol # of the start-of-annotation, or 0 if out-of-alphabet
+      int m_iStartSym;
+      symbol getRoute(bool bHaveRoute, const string &strRoute, symbol baseSym);
+    };
+  };
+  /// @}
+  
+}
+
+#endif
diff --git a/Src/MacOSX/Dasher.xcodeproj/project.pbxproj b/Src/MacOSX/Dasher.xcodeproj/project.pbxproj
index e143703..9c368c9 100755
--- a/Src/MacOSX/Dasher.xcodeproj/project.pbxproj
+++ b/Src/MacOSX/Dasher.xcodeproj/project.pbxproj
@@ -394,6 +394,10 @@
 		E7B0BE301491E2D6003EFD33 /* alphabet.spyTonesNew.xml in Resources */ = {isa = PBXBuildFile; fileRef = E7B0BE2C1491E2D6003EFD33 /* alphabet.spyTonesNew.xml */; };
 		E7B0BE321491E305003EFD33 /* training_spyNew.txt in Resources */ = {isa = PBXBuildFile; fileRef = E7B0BE311491E305003EFD33 /* training_spyNew.txt */; };
 		E7C68E691430824D00440B5B /* Messages.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E7C68E681430824D00440B5B /* Messages.cpp */; };
+		E7DED58F1497599B005DE19D /* RoutingPPMLanguageModel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E7DED58E1497599B005DE19D /* RoutingPPMLanguageModel.cpp */; };
+		E7DED592149759AF005DE19D /* RoutingPPMLanguageModel.h in Headers */ = {isa = PBXBuildFile; fileRef = E7DED591149759AE005DE19D /* RoutingPPMLanguageModel.h */; };
+		E7DED59414976BC0005DE19D /* RoutingAlphMgr.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E7DED59314976BC0005DE19D /* RoutingAlphMgr.cpp */; };
+		E7DED59614976BD3005DE19D /* RoutingAlphMgr.h in Headers */ = {isa = PBXBuildFile; fileRef = E7DED59514976BD3005DE19D /* RoutingAlphMgr.h */; };
 /* End PBXBuildFile section */
 
 /* Begin PBXFileReference section */
@@ -802,6 +806,10 @@
 		E7B0BE2C1491E2D6003EFD33 /* alphabet.spyTonesNew.xml */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xml; path = alphabet.spyTonesNew.xml; sourceTree = "<group>"; };
 		E7B0BE311491E305003EFD33 /* training_spyNew.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = training_spyNew.txt; sourceTree = "<group>"; };
 		E7C68E681430824D00440B5B /* Messages.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Messages.cpp; sourceTree = "<group>"; };
+		E7DED58E1497599B005DE19D /* RoutingPPMLanguageModel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RoutingPPMLanguageModel.cpp; sourceTree = "<group>"; };
+		E7DED591149759AE005DE19D /* RoutingPPMLanguageModel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RoutingPPMLanguageModel.h; sourceTree = "<group>"; };
+		E7DED59314976BC0005DE19D /* RoutingAlphMgr.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RoutingAlphMgr.cpp; sourceTree = "<group>"; };
+		E7DED59514976BD3005DE19D /* RoutingAlphMgr.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RoutingAlphMgr.h; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 
 /* Begin PBXFrameworksBuildPhase section */
@@ -1030,6 +1038,8 @@
 				1948BE9B0C226CFD001DFA32 /* XMLUtil.h */,
 				3300835E120CB7F900C41FAA /* ConvertingAlphMgr.h */,
 				3300835F120CB7F900C41FAA /* ConvertingAlphMgr.cpp */,
+				E7DED59314976BC0005DE19D /* RoutingAlphMgr.cpp */,
+				E7DED59514976BD3005DE19D /* RoutingAlphMgr.h */,
 			);
 			name = DasherCore;
 			path = ../DasherCore;
@@ -1073,6 +1083,8 @@
 				1948BE630C226CFD001DFA32 /* PPMLanguageModel.h */,
 				1948BE650C226CFD001DFA32 /* WordLanguageModel.cpp */,
 				1948BE660C226CFD001DFA32 /* WordLanguageModel.h */,
+				E7DED58E1497599B005DE19D /* RoutingPPMLanguageModel.cpp */,
+				E7DED591149759AE005DE19D /* RoutingPPMLanguageModel.h */,
 			);
 			path = LanguageModelling;
 			sourceTree = "<group>";
@@ -1474,6 +1486,8 @@
 				33DDB9E113B8AF360001C52D /* DynamicButtons.h in Headers */,
 				333B409512088AFA00235721 /* DemoFilter.h in Headers */,
 				E7641878142A48C70031FC91 /* Globber.h in Headers */,
+				E7DED592149759AF005DE19D /* RoutingPPMLanguageModel.h in Headers */,
+				E7DED59614976BD3005DE19D /* RoutingAlphMgr.h in Headers */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -1832,6 +1846,8 @@
 				333B409412088AFA00235721 /* DemoFilter.cpp in Sources */,
 				E7641875142A48AD0031FC91 /* Globber.cpp in Sources */,
 				E7C68E691430824D00440B5B /* Messages.cpp in Sources */,
+				E7DED58F1497599B005DE19D /* RoutingPPMLanguageModel.cpp in Sources */,
+				E7DED59414976BC0005DE19D /* RoutingAlphMgr.cpp in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]