[dasher] Context-switch commands via escape sequences read from training files



commit 74b18bcd2e5e0da583cd0a578eb6c031312d0b23
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date:   Thu Mar 3 19:10:19 2011 +0000

    Context-switch commands via escape sequences read from training files
    
    Context-switch character (1 unicode) read from alphabet, defaults to §
    Trainer::readEscape parses double-escape = feed 1 escape char to LM as symbol;
      otherwise, next char is delimiter, and all chars until second occurrence
      of delimiter are entered as context only, after alphabet default ctx.
    
    MandarinTrainer passes CHAlphabet to superclass (4 ctx) and keeps PY for self;
      looks for escape chars only outside/between (delim,py,ch) triples, not inside.
    
    (Context-switch commands not yet written out into user training files!)

 Src/DasherCore/Alphabet/AlphIO.cpp    |    2 +
 Src/DasherCore/Alphabet/AlphInfo.cpp  |    1 +
 Src/DasherCore/Alphabet/AlphInfo.h    |    6 +++
 Src/DasherCore/Alphabet/AlphabetMap.h |   10 ++--
 Src/DasherCore/AlphabetManager.cpp    |    2 +-
 Src/DasherCore/MandarinAlphMgr.cpp    |    5 ++-
 Src/DasherCore/Trainer.cpp            |   67 +++++++++++++++++++++++++++++----
 Src/DasherCore/Trainer.h              |   22 +++++++++--
 8 files changed, 96 insertions(+), 19 deletions(-)
---
diff --git a/Src/DasherCore/Alphabet/AlphIO.cpp b/Src/DasherCore/Alphabet/AlphIO.cpp
index 1d96d23..17462e6 100644
--- a/Src/DasherCore/Alphabet/AlphIO.cpp
+++ b/Src/DasherCore/Alphabet/AlphIO.cpp
@@ -439,6 +439,8 @@ void CAlphIO::XML_StartElement(void *userData, const XML_Char *name, const XML_C
         atts--;
       } else if (strcmp(*atts, "hidden") == 0) {
         Me->InputInfo->m_bHidden = (strcmp(*(atts+1), "yes")==0);
+      } else if (strcmp(*atts, "escape") == 0) {
+        Me->InputInfo->m_strCtxChar = *(atts+1);
       }
       atts += 2;
     }
diff --git a/Src/DasherCore/Alphabet/AlphInfo.cpp b/Src/DasherCore/Alphabet/AlphInfo.cpp
index e9826b8..4abd0d3 100644
--- a/Src/DasherCore/Alphabet/AlphInfo.cpp
+++ b/Src/DasherCore/Alphabet/AlphInfo.cpp
@@ -79,6 +79,7 @@ CAlphInfo::CAlphInfo() {
   iNumChildNodes = 0;
   m_iConversionID = 0;
   m_strDefaultContext = ". ";
+  m_strCtxChar = "§";
   m_bHidden=false;
 }
 
diff --git a/Src/DasherCore/Alphabet/AlphInfo.h b/Src/DasherCore/Alphabet/AlphInfo.h
index 2d5f799..698baa4 100644
--- a/Src/DasherCore/Alphabet/AlphInfo.h
+++ b/Src/DasherCore/Alphabet/AlphInfo.h
@@ -106,6 +106,11 @@ public:
     
   const std::string &GetDefaultContext() const {return m_strDefaultContext;}
   
+  ///A single unicode character to use as an escape sequence in training files
+  ///to indicate context-switching commands; 0-length => don't use context-switching commands. 
+  /// Defaults to § if not specified in alphabet.
+  const std::string &GetContextEscapeChar() const {return m_strCtxChar;}
+  
   SGroupInfo *m_pBaseGroup;
   int iNumChildNodes;
   ///0 = normal alphabet, contains symbols to output
@@ -156,6 +161,7 @@ private:
   character *EndConvertCharacter;
   
   std::string m_strDefaultContext;
+  std::string m_strCtxChar;
 };
 
 
diff --git a/Src/DasherCore/Alphabet/AlphabetMap.h b/Src/DasherCore/Alphabet/AlphabetMap.h
index b9c5807..9939673 100644
--- a/Src/DasherCore/Alphabet/AlphabetMap.h
+++ b/Src/DasherCore/Alphabet/AlphabetMap.h
@@ -90,13 +90,15 @@ public:
     symbol next(const CAlphabetMap *map);
     
     ///Finds the next complete character in the stream,  but does not advance past it.
-    /// Hence, repeated calls will return the same string.
+    /// Hence, repeated calls will return the same string. (Always constructs a string,
+    /// which next() avoids for single-octet chars, so may be slower)
     std::string peekAhead();
     
     ///Returns the string representation of the previous symbol (i.e. that returned
     /// by the previous call to next()). Undefined if next() has not been called, or
     /// if peekAhead() has been called since the last call to next(). Does not change
-    /// the stream position. Useful for debugging.
+    /// the stream position. (Always constructs a string, which next() avoids for 
+    /// single-octet chars, so may be slower.)
     std::string peekBack();
   private:
     ///Finds beginning of next unicode character, at position 'pos' or later,
@@ -113,10 +115,8 @@ public:
   
   // Fills Symbols with the symbols corresponding to Input. {{{ Note that this
   // is not necessarily reversible by repeated use of GetText. Some text
-  // may not be recognised and so discarded. }}}
-  
+  // may not be recognised; any such will be turned into symbol number 0.}}}  
   void GetSymbols(std::vector<symbol> &Symbols, const std::string &Input) const;
-  //SymbolStream *GetSymbols(std::istream &in) const;
 
   CAlphabetMap(unsigned int InitialTableSize = 255);
   void AddParagraphSymbol(symbol Value);
diff --git a/Src/DasherCore/AlphabetManager.cpp b/Src/DasherCore/AlphabetManager.cpp
index 64f4a00..7e27b1f 100644
--- a/Src/DasherCore/AlphabetManager.cpp
+++ b/Src/DasherCore/AlphabetManager.cpp
@@ -76,7 +76,7 @@ void CAlphabetManager::CreateLanguageModel(CEventHandler *pEventHandler, CSettin
 }
 
 CTrainer *CAlphabetManager::GetTrainer() {
-  return new CTrainer(m_pLanguageModel, m_pAlphabetMap);
+  return new CTrainer(m_pLanguageModel, m_pAlphabet, m_pAlphabetMap);
 }
 
 const CAlphInfo *CAlphabetManager::GetAlphabet() const {
diff --git a/Src/DasherCore/MandarinAlphMgr.cpp b/Src/DasherCore/MandarinAlphMgr.cpp
index f4e464f..0052da7 100644
--- a/Src/DasherCore/MandarinAlphMgr.cpp
+++ b/Src/DasherCore/MandarinAlphMgr.cpp
@@ -123,7 +123,10 @@ void CMandarinAlphMgr::CreateLanguageModel(CEventHandler *pEventHandler, CSettin
 }
 
 CTrainer *CMandarinAlphMgr::GetTrainer() {
-  return new CMandarinTrainer(static_cast<CPPMPYLanguageModel*>(m_pLanguageModel), m_pAlphabetMap, &m_CHAlphabetMap, m_pAlphabet->m_strConversionTrainingDelimiter);
+  //We pass in the pinyin alphabet to define the context-switch escape character, and the default context.
+  // Although the default context will be symbolified via the _chinese_ alphabet, this seems reasonable
+  // as it is the Pinyin alphabet which defines the conversion mapping (i.e. m_strConversionTarget!)
+  return new CMandarinTrainer(static_cast<CPPMPYLanguageModel*>(m_pLanguageModel), m_pAlphabet, m_pAlphabetMap, &m_CHAlphabetMap, m_pAlphabet->m_strConversionTrainingDelimiter);
 }
 
 CAlphabetManager::CAlphNode *CMandarinAlphMgr::GetRoot(CDasherNode *pParent, unsigned int iLower, unsigned int iUpper, bool bEnteredLast, int iOffset) {
diff --git a/Src/DasherCore/Trainer.cpp b/Src/DasherCore/Trainer.cpp
index b34747d..a1a922f 100644
--- a/Src/DasherCore/Trainer.cpp
+++ b/Src/DasherCore/Trainer.cpp
@@ -17,21 +17,67 @@ static char THIS_FILE[] = __FILE__;
 #endif
 #endif
 
-CTrainer::CTrainer(CLanguageModel *pLanguageModel, const CAlphabetMap *pAlphabet)
-  : CTrainingHelper(pAlphabet), m_pLanguageModel(pLanguageModel) {
+CTrainer::CTrainer(CLanguageModel *pLanguageModel, const CAlphInfo *pInfo, const CAlphabetMap *pAlphabet)
+  : CTrainingHelper(pAlphabet), m_pLanguageModel(pLanguageModel), m_pInfo(pInfo) {
+    vector<symbol> syms;
+    pAlphabet->GetSymbols(syms,pInfo->GetContextEscapeChar());
+    if (syms.size()==1)
+      m_iCtxEsc = syms[0];
+    else {      
+#ifdef DEBUG
+      std::cout << "Warning: escape sequence " << pInfo->GetContextEscapeChar() << " must be a single unicode character; no context-switch commands will be executed." << std::endl;
+#endif
+      m_iCtxEsc = -1;
+    }
 }
 
 void CTrainer::Train(CAlphabetMap::SymbolStream &syms) {
   CLanguageModel::Context sContext = m_pLanguageModel->CreateEmptyContext();
 
   for(symbol sym; (sym=syms.next(m_pAlphabet))!=-1;) {
-      m_pLanguageModel->LearnSymbol(sContext, sym);
+    //check for context-switch commands.
+    // (Will only ever be triggered if the escape string, m_pInfo->GetContextEscapeChar(), is a single unicode character, hence warning in c'tor)
+    if (sym == m_iCtxEsc) {
+      //that was a quick check, to avoid calling slow peekBack() in most cases. Now make sure...
+      if (sym!=0 || syms.peekBack()==m_pInfo->GetContextEscapeChar()) {
+        //Yes, found escape character....
+        if (readEscape(sContext, syms)) continue;
+        //returns false, if there was a _double_ escape character - i.e. an actual
+        // occurrence of the character is wanted. In which case, fall through
+        // (sym is already set to the correct AlphabetMap symbol# for the first escape character)
+      }
+      else DASHER_ASSERT (sym==0); //symbol, and escape char, both out-of-alphabet. Fall through...
+      // (Or, TODO, should the Trainer be responsible for skipping unknown symbols, rather than the LM?)
+    }
+    //either a non-escapecharacter, or a double escapecharacter, was read
+    m_pLanguageModel->LearnSymbol(sContext, sym);
   }
   m_pLanguageModel->ReleaseContext(sContext);
 }
 
-CMandarinTrainer::CMandarinTrainer(CPPMPYLanguageModel *pLanguageModel, const CAlphabetMap *pAlphabet, const CAlphabetMap *pCHAlphabet, const std::string &strDelim)
-: CTrainer(pLanguageModel, pAlphabet), m_pCHAlphabet(pCHAlphabet), m_strDelim(strDelim) {
+bool CTrainer::readEscape(CLanguageModel::Context &sContext, CAlphabetMap::SymbolStream &syms) {
+  //Stream is positioned just after an escape char: either switch sContext (return true) or report a double escape (return false).
+  string delim=syms.peekAhead();
+  //Double escape = a literal escape char is wanted. peekAhead() did not advance, so consume the second one here, else the caller would re-read it as a new escape.
+  if (delim == m_pInfo->GetContextEscapeChar()) { syms.next(m_pAlphabet); return false; }
+  //ok, so switch context. release the old, start a new...
+  m_pLanguageModel->ReleaseContext(sContext);
+  sContext = m_pLanguageModel->CreateEmptyContext();
+  //enter the alphabet default context first...
+  vector<symbol> defCtx;
+  m_pAlphabet->GetSymbols(defCtx, m_pInfo->GetDefaultContext());
+  for (vector<symbol>::iterator it=defCtx.begin(); it!=defCtx.end(); it++) m_pLanguageModel->EnterSymbol(sContext, *it);
+  //and read the first delimiter; everything until the second occurrence of this, is _context_ only.
+  syms.next(m_pAlphabet); //skip it
+  for (symbol sym; (sym=syms.next(m_pAlphabet))!=-1; ) {
+    if (syms.peekBack()==delim) break;
+    m_pLanguageModel->EnterSymbol(sContext, sym);
+  }
+  return true;  
+}
+
+CMandarinTrainer::CMandarinTrainer(CPPMPYLanguageModel *pLanguageModel, const CAlphInfo *pInfo, const CAlphabetMap *pPYAlphabet, const CAlphabetMap *pCHAlphabet, const std::string &strDelim)
+: CTrainer(pLanguageModel, pInfo, pCHAlphabet), m_pPYAlphabet(pPYAlphabet), m_strDelim(strDelim) {
 }
 
 void CMandarinTrainer::Train(CAlphabetMap::SymbolStream &syms) {
@@ -39,16 +85,16 @@ void CMandarinTrainer::Train(CAlphabetMap::SymbolStream &syms) {
   CLanguageModel::Context trainContext = m_pLanguageModel->CreateEmptyContext();
   
   for (string s; (s=syms.peekAhead()).length();) {
-    syms.next(m_pAlphabet); //skip over character at which we just peeked (we don't need the symbol#)
+    syms.next(m_pPYAlphabet); //skip over character at which we just peeked (we don't need the symbol#)
     
     if (s == m_strDelim) { //found delimiter, so process next two characters
-      symbol Sympy = syms.next(m_pAlphabet);
+      symbol Sympy = syms.next(m_pPYAlphabet);
       if (Sympy==-1) break; //EOF
 #ifdef DEBUG
       if (Sympy==0)
         std::cout << "Unknown pinyin character " << syms.peekBack() << std::endl;
 #endif
-      symbol Symchar = syms.next(m_pCHAlphabet);
+      symbol Symchar = syms.next(m_pAlphabet);
       if (Symchar==-1) break; //EOF...ignore final Pinyin?
 #ifdef DEBUG
       if (Symchar==0)
@@ -57,6 +103,11 @@ void CMandarinTrainer::Train(CAlphabetMap::SymbolStream &syms) {
       static_cast<CPPMPYLanguageModel *>(m_pLanguageModel)->LearnPYSymbol(trainContext, Sympy);
       m_pLanguageModel->LearnSymbol(trainContext, Symchar);
       numberofchar++;    
+    } else if (s == m_pInfo->GetContextEscapeChar()) {
+      //we've already skipped over the (first) escape char
+      readEscape(trainContext, syms);
+      //a double escape-char will be ignored: it means "don't switch context,
+      // here's an (escape-char)", but we are only looking for m_strDelim.
     } //else, keep looking for delimiter
   }
   m_pLanguageModel->ReleaseContext(trainContext);
diff --git a/Src/DasherCore/Trainer.h b/Src/DasherCore/Trainer.h
index 407bd1f..0e87714 100644
--- a/Src/DasherCore/Trainer.h
+++ b/Src/DasherCore/Trainer.h
@@ -3,17 +3,29 @@
 
 #include "LanguageModelling/PPMPYLanguageModel.h"
 #include "TrainingHelper.h"
+#include "Alphabet/AlphInfo.h"
 
 namespace Dasher {
   class CDasherInterfaceBase;
 	
   class CTrainer : public CTrainingHelper {
   public:
-    CTrainer(CLanguageModel *pLanguageModel, const CAlphabetMap *pAlphabet);
+    CTrainer(CLanguageModel *pLanguageModel, const CAlphInfo *pInfo, const CAlphabetMap *pAlphabet);
 
   protected:
     virtual void Train(CAlphabetMap::SymbolStream &syms);
     CLanguageModel *m_pLanguageModel;
+    
+    ///Try to read a context-switch escape sequence from the symbolstream.
+    /// \param sContext context to be reinitialized if a context-switch command is found
+    /// \param syms symbolstream to read, should be positioned just after the first occurrence of the escape character.
+    /// \return true if a context-switch command was found (=> sContext reinitialized);
+    ///  false, if instead a double-escape-character (=encoding of that actual symbol) was read
+    bool readEscape(CLanguageModel::Context &sContext, CAlphabetMap::SymbolStream &syms);
+    
+    const CAlphInfo *m_pInfo;
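+    // symbol number in alphabet of the context-switch character (maybe 0 if not in alphabet!); -1 if the escape string is not exactly one unicode character, in which case no context-switch commands are executed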
+    int m_iCtxEsc;
   };
 	
   /// Trains a PPMPYLanguageModel (dual alphabet), as for e.g. MandarinDasher.
@@ -25,17 +37,19 @@ namespace Dasher {
   class CMandarinTrainer : public CTrainer {
   public:
     /// Construct a new MandarinTrainer
-    /// \param pAlphabet mapping from text to symbol# in PY alphabet
+    /// \param pInfo used for GetContextEscapeChar and GetDefaultContext (only), both as strings
+    /// \param pPYAlphabet mapping from text to symbol# in PY alphabet
     /// \param pCHAlphabet mapping from text to symbol# (rehashed by MandarinAlphMgr) in CHAlphabet
     /// \param strDelim delimiter character (1 unicode, maybe >1 octet; if not, will never be matched)
-    CMandarinTrainer(CPPMPYLanguageModel *pLanguageModel, const CAlphabetMap *pAlphabet, const CAlphabetMap *pCHAlphabet, const std::string &strDelim);
+    CMandarinTrainer(CPPMPYLanguageModel *pLanguageModel, const CAlphInfo *pInfo, const CAlphabetMap *pPYAlphabet, const CAlphabetMap *pCHAlphabet, const std::string &strDelim);
 
   protected:
     //override...
     virtual void Train(CAlphabetMap::SymbolStream &syms);
     
   private:
-    const CAlphabetMap *m_pCHAlphabet;
+    ///The pinyin alphabet (the chinese alphabet is passed into the superclass)
+    const CAlphabetMap *m_pPYAlphabet;
     ///Delimiter, as above. 
     const std::string m_strDelim;
   };



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]