[dasher] Generalize MandarinDasher: SPY alph stores alphID to convert to, +delimiter char

From: Patrick Welche <pwelche src gnome org>
To: commits-list gnome org
Cc:
Subject: [dasher] Generalize MandarinDasher: SPY alph stores alphID to convert to, +delimiter char
Date: Tue, 15 Mar 2011 17:12:52 +0000 (UTC)
commit 8a59cc3c4cc1f0d828fe368c99f724c00662efa6
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date:   Tue Feb 22 11:52:43 2011 +0000

    Generalize MandarinDasher: SPY alph stores alphID to convert to, +delimiter char
    
    Add hidden attr to AlphInfo => CHAlph not shown in list (unusable as duplicates)
    MandarinTrainer uses SymbolStream => faster, support !=3byte unicode chars.
    Delimiter character read from (PY) alphabet definition; training looks for this,
         then reads 1 character PY, 1 CH. (then repeat)
    
    CAlphabetMap::SymbolStream takes AlphabetMap as param to next(), so can use
       different map for each call/symbol; also add peekAhead() and peekBack().
    
    Add hidden attr to AlphInfo => CHAlph not shown in list (unusable as duplicates)
    
    => No data specific to Mandarin in the code anymore; same Mandarin/PPMPY
     scheme/framework should be able to be used for other languages too just
     w/ appropriate alphabet(*2) + training files.

 Data/alphabets/alphabet.chineseRuby.xml |    2 +-
 Data/alphabets/alphabet.spyDict.xml     |    2 +-
 Src/DasherCore/Alphabet/AlphIO.cpp      |   21 +++++-
 Src/DasherCore/Alphabet/AlphInfo.cpp    |    1 +
 Src/DasherCore/Alphabet/AlphInfo.h      |   31 ++++++----
 Src/DasherCore/Alphabet/AlphabetMap.cpp |   99 ++++++++++++++++++++-----------
 Src/DasherCore/Alphabet/AlphabetMap.h   |   25 +++++++-
 Src/DasherCore/MandarinAlphMgr.cpp      |    6 +-
 Src/DasherCore/MandarinAlphMgr.h        |   37 +++++++++++-
 Src/DasherCore/Trainer.cpp              |   91 +++++++---------------------
 Src/DasherCore/Trainer.h                |   19 +++++-
 Src/DasherCore/TrainingHelper.cpp       |    4 +-
 12 files changed, 204 insertions(+), 134 deletions(-)
---
diff --git a/Data/alphabets/alphabet.chineseRuby.xml b/Data/alphabets/alphabet.chineseRuby.xml
index e5aeee3..0e40825 100644
--- a/Data/alphabets/alphabet.chineseRuby.xml
+++ b/Data/alphabets/alphabet.chineseRuby.xml
@@ -2,7 +2,7 @@
 <!DOCTYPE alphabets SYSTEM "alphabet.dtd">
 <?xml-stylesheet type="text/xsl" href="alphabet.xsl"?>
 <alphabets>
-<alphabet name="Chinese &#31616;&#20307;&#20013;&#25991; (simplified chinese, in pin yin groups)"> <!-- Alphabet created by David MacKay using write.p.  Thanks to Juan K Lin for help -->
+<alphabet name="Chinese &#31616;&#20307;&#20013;&#25991; (simplified chinese, in pin yin groups)" hidden="yes"> <!-- Alphabet created by David MacKay using write.p.  Thanks to Juan K Lin for help -->
 <!--ACL 10Feb2011: Pinyin characters removed (but pinyin groups retained), and "punctuation" inc. roman letters + numerals added, as per rewrite of MandarinDasher to include punctuation in context-->
 <orientation type="LR"/>
 <encoding type="Western"/>
diff --git a/Data/alphabets/alphabet.spyDict.xml b/Data/alphabets/alphabet.spyDict.xml
index f4b9fea..4e971fe 100644
--- a/Data/alphabets/alphabet.spyDict.xml
+++ b/Data/alphabets/alphabet.spyDict.xml
@@ -5,7 +5,7 @@
 <alphabet name="Chinese Super Pin Yin, grouped by Dictionary">
 <!--The nature of this alphabet is to represent all possible pin yin combinations in a list. This list is then grouped in alphabetical order so that the interface is able to produce nodes automatically. Such grouping enables the user to find the specific super pin yin (pronouciation) by indexing on the graphical interface-->
 <!--Note: the treatment of pin yin symbol uu or v in this alphabet is 'uu', (only occurs in luu, lue, nuu, lue) and grouped with the starting u strictly according to roman alphabetical order (so to type luu, one must go for 'l', then 'u', then another 'u', finally the tone)--> 
-<conversionmode id="2"/>
+<conversionmode id="2" target="Chinese &#31616;&#20307;&#20013;&#25991; (simplified chinese, in pin yin groups)" delim="&#x300B;"/>
 <orientation type="LR"/>
 <encoding type="Western"/>
 <palette>European/Asian</palette>
diff --git a/Src/DasherCore/Alphabet/AlphIO.cpp b/Src/DasherCore/Alphabet/AlphIO.cpp
index 3f2f731..1d96d23 100644
--- a/Src/DasherCore/Alphabet/AlphIO.cpp
+++ b/Src/DasherCore/Alphabet/AlphIO.cpp
@@ -115,8 +115,15 @@ void CAlphIO::GetAlphabets(std::vector <std::string >*AlphabetList) const {
   typedef std::map < std::string, const CAlphInfo* >::const_iterator CI;
   CI End = Alphabets.end();
 
-  for(CI Cur = Alphabets.begin(); Cur != End; Cur++)
+  for(CI Cur = Alphabets.begin(); Cur != End; Cur++) {
+    //skip "hidden" alphabets
+    if (Cur->second->m_bHidden) continue;
+    //for Mandarin-converting alphabets, only display if the conversion target is available too.
+    if (Cur->second->m_iConversionID==2) {
+      if (Alphabets.count(Cur->second->m_strConversionTarget)==0) continue;
+    }
     AlphabetList->push_back(Cur->second->AlphID);
+  }
 }
 
 std::string CAlphIO::GetDefault() {
@@ -430,6 +437,8 @@ void CAlphIO::XML_StartElement(void *userData, const XML_Char *name, const XML_C
         atts++;
         Me->InputInfo->AlphID = *atts;
         atts--;
+      } else if (strcmp(*atts, "hidden") == 0) {
+        Me->InputInfo->m_bHidden = (strcmp(*(atts+1), "yes")==0);
       }
       atts += 2;
     }
@@ -558,9 +567,13 @@ void CAlphIO::XML_StartElement(void *userData, const XML_Char *name, const XML_C
   if(!strcmp(name, "conversionmode")) {
     while(*atts != 0) {
       if(strcmp(*atts, "id") == 0) {
-        atts++;
-        Me->InputInfo->m_iConversionID = atoi(*atts);
-        atts--;
+        Me->InputInfo->m_iConversionID = atoi(*(atts+1));
+      } else if (strcmp(*atts, "target") == 0) {
+        Me->InputInfo->m_strConversionTarget = *(atts+1);
+      } else if (strcmp(*atts, "delim") == 0) {
+        //TODO, should check this is only a single unicode character;
+        // no training will occur, if not...
+        Me->InputInfo->m_strConversionTrainingDelimiter = *(atts+1);
       }
       atts += 2;
     }
diff --git a/Src/DasherCore/Alphabet/AlphInfo.cpp b/Src/DasherCore/Alphabet/AlphInfo.cpp
index aca380c..e9826b8 100644
--- a/Src/DasherCore/Alphabet/AlphInfo.cpp
+++ b/Src/DasherCore/Alphabet/AlphInfo.cpp
@@ -79,6 +79,7 @@ CAlphInfo::CAlphInfo() {
   iNumChildNodes = 0;
   m_iConversionID = 0;
   m_strDefaultContext = ". ";
+  m_bHidden=false;
 }
 
 void DeleteGroups(SGroupInfo *Info) {
diff --git a/Src/DasherCore/Alphabet/AlphInfo.h b/Src/DasherCore/Alphabet/AlphInfo.h
index 5b1a08f..2d5f799 100644
--- a/Src/DasherCore/Alphabet/AlphInfo.h
+++ b/Src/DasherCore/Alphabet/AlphInfo.h
@@ -108,8 +108,22 @@ public:
   
   SGroupInfo *m_pBaseGroup;
   int iNumChildNodes;
+  ///0 = normal alphabet, contains symbols to output
+  ///1 = Japanese (defunct)
+  ///2 = Mandarin: symbols are merely phonemes, and match up (via displaytext)
+  /// with groups in a second alphabet, identified by strConversionTarget,
+  /// which contains actual output symbols possibly including duplicates;
+  /// all this handled by MandarinAlphMgr (+MandarinTrainer, PPMPYLanguageModel).
   int m_iConversionID;
   
+  ///The name of the alphabet containing the actual text symbols into which
+  /// this alphabet will be converted. Only used (atm) if m_iConversionID==2.
+  std::string m_strConversionTarget;
+
+  ///Single-unicode character used to indicate an upcoming PY-then-CH pair
+  /// in the training file (see MandarinTrainer). Only used if m_iConversionID==2.
+  std::string m_strConversionTrainingDelimiter;
+  
   CAlphabetMap *MakeMap() const;
   
   ~CAlphInfo();
@@ -127,17 +141,12 @@ private:
   Opts::AlphabetTypes Encoding;
   Opts::AlphabetTypes Type;
   Opts::ScreenOrientations Orientation;
-    
-  /*     // Obsolete groups stuff */
-  /*     struct group { */
-  /*       std::string Description; */
-  /*       std::vector < character > Characters; */
-  /*       int Colour; */
-  /*       std::string Label; */
-  /*     }; */
-  /*     std::vector < group > Groups; */
-  /*     // --- */
-    
+  
+  ///If true, alphabet should not be displayed in list of available alphabets;
+  /// it exists only for internal use, e.g. as a target for conversion from
+  /// another alphabet (a la MandarinDasher).
+  bool m_bHidden;
+  
   std::vector<character> m_vCharacters;
   
   symbol iParagraphCharacter;       // symbol number (index into m_vCharacters +1) of paragraph char (for display and default edit-text), 0 for none.
diff --git a/Src/DasherCore/Alphabet/AlphabetMap.cpp b/Src/DasherCore/Alphabet/AlphabetMap.cpp
index 2017cf6..c30f0f5 100644
--- a/Src/DasherCore/Alphabet/AlphabetMap.cpp
+++ b/Src/DasherCore/Alphabet/AlphabetMap.cpp
@@ -32,6 +32,8 @@ static char THIS_FILE[] = __FILE__;
 #endif
 #endif
 
+#define UNKNOWN_SYMBOL 0
+
 class utf8_length
 {
 public:
@@ -73,8 +75,8 @@ int utf8_length::operator[](const unsigned char i) const
 
 ////////////////////////////////////////////////////////////////////////////
 
-CAlphabetMap::SymbolStream::SymbolStream(const CAlphabetMap &_map, std::istream &_in)
-: map(_map), in(_in), pos(0), len(0) {
+CAlphabetMap::SymbolStream::SymbolStream(std::istream &_in)
+: in(_in), pos(0), len(0) {
   readMore();
 }
 
@@ -91,52 +93,79 @@ void CAlphabetMap::SymbolStream::readMore() {
   }
 }
 
-symbol CAlphabetMap::SymbolStream::next()
-{
-  int numChars;
-  
+inline int CAlphabetMap::SymbolStream::findNext() {
   for (;;) {
     if (pos + m_utf8_count_array.max_length > len && len==1024) {
       //may need more bytes for next char; and input not yet exhausted.
-      
       if (pos) {
         //shift remaining bytes to beginning
         len-=pos; //len of them
+        //memcpy isn't safe for overlapping regions of memory...
+        DASHER_ASSERT(len<pos); //...but they really shouldn't overlap!
         memcpy(buf, &buf[pos], len);
         pos=0;
       }
       readMore();
     }
     //if still don't have any chars after attempting to read more...EOF!
-    if (pos==len) return -1;
-    numChars = m_utf8_count_array[buf[pos]];
-    if (numChars != 0) break;
+    if (pos==len) return 0; //EOF
+    if (int numChars = m_utf8_count_array[buf[pos]]) {
+      if (pos+numChars > len) {
+        //no more bytes in file (would have tried to read earlier), but not enough for char
+#ifdef DEBUG
+        std::cerr << "Incomplete UTF-8 character beginning 0x" << hex << buf[pos] << dec;
+        std::cerr << "(expecting " << numChars << " bytes but only " << (len-pos) << ")" << std::endl;
+#endif
+        pos=len;
+        return 0;
+      }
+      return numChars;
+    }
 #ifdef DEBUG
     std::cerr << "Read invalid UTF-8 character 0x" << hex << buf[pos]
     << dec << std::endl;
 #endif
-    ++pos;
+    ++pos;    
   }
+}
+
+string CAlphabetMap::SymbolStream::peekAhead() {
+  int numChars=findNext();
+  return string(&buf[pos],numChars);
+}
+
+string CAlphabetMap::SymbolStream::peekBack() {
+  for(int i=pos-1; i>=0; i--) {
+    if (buf[i] & 0x80) {
+      //multibyte character...
+      if (buf[i] & 0x40) {
+        //START of multibyte character
+        int numChars = m_utf8_count_array[buf[i]];
+        DASHER_ASSERT(i+numChars==pos);
+        return string(&buf[i],numChars);
+      }
+      //in middle of multibyte, keep going back...
+    } else return string(&buf[i],1); //high bit not set -> single-byte char
+  }
+  //fail...relatively gracefully ;-)
+  return "";
+}
+
+symbol CAlphabetMap::SymbolStream::next(const CAlphabetMap *map)
+{
+  int numChars=findNext();
+  if (numChars==0) return -1; //EOF
   if (numChars == 1) {
-    if (map.m_ParagraphSymbol!=map.Undefined && buf[pos]=='\r') {
+    if (map->m_ParagraphSymbol!=UNKNOWN_SYMBOL && buf[pos]=='\r') {
       DASHER_ASSERT(pos+1<len || len<1024); //there are more characters (we should have read utf8...max_length), or else input is exhausted
       if (pos+1<len && buf[pos+1]=='\n') {
         pos+=2;
-        return map.m_ParagraphSymbol;
+        return map->m_ParagraphSymbol;
       }
     }
-    return map.GetSingleChar(buf[pos++]);
-  }
-  if (pos+numChars > len) {
-    //no more bytes in file (would have tried to read earlier), but not enough for char
-#ifdef DEBUG
-    std::cerr << "Incomplete UTF-8 character beginning 0x" << hex << buf[pos] << dec;
-    std::cerr << "(expecting " << numChars << " bytes but only " << (len-pos) << ")" << std::endl;
-#endif
-    pos=len;
-    return -1;
+    return map->GetSingleChar(buf[pos++]);
   }
-  int sym=map.Get(string(&buf[pos], numChars));
+  int sym=map->Get(string(&buf[pos], numChars));
   pos+=numChars;
   return sym;
 }
@@ -144,19 +173,19 @@ symbol CAlphabetMap::SymbolStream::next()
 void CAlphabetMap::GetSymbols(std::vector<symbol>& Symbols, const std::string& Input) const
 {
   std::istringstream in(Input);
-  SymbolStream syms(*this, in);
-  for (symbol sym; (sym=syms.next())!=-1;)
+  SymbolStream syms(in);
+  for (symbol sym; (sym=syms.next(this))!=-1;)
     Symbols.push_back(sym);
 }
 
 
 CAlphabetMap::CAlphabetMap(unsigned int InitialTableSize)
-:HashTable(InitialTableSize <<1), Undefined(0), m_ParagraphSymbol(Undefined) {
+:HashTable(InitialTableSize <<1), m_ParagraphSymbol(UNKNOWN_SYMBOL) {
   Entries.reserve(InitialTableSize);
 
   const int numChars = numeric_limits<char>::max() + 1;
   m_pSingleChars = new symbol[numChars];
-  for (int i = 0; i<numChars; i++) m_pSingleChars[i] = Undefined;
+  for (int i = 0; i<numChars; i++) m_pSingleChars[i] = UNKNOWN_SYMBOL;
 }
 
 CAlphabetMap::~CAlphabetMap() {
@@ -164,9 +193,9 @@ CAlphabetMap::~CAlphabetMap() {
 }
 
 void CAlphabetMap::AddParagraphSymbol(symbol Value) {
-  DASHER_ASSERT (m_ParagraphSymbol==Undefined);
-  DASHER_ASSERT (m_pSingleChars['\r'] == Undefined);
-  DASHER_ASSERT (m_pSingleChars['\n'] == Undefined);
+  DASHER_ASSERT (m_ParagraphSymbol==UNKNOWN_SYMBOL);
+  DASHER_ASSERT (m_pSingleChars['\r'] == UNKNOWN_SYMBOL);
+  DASHER_ASSERT (m_pSingleChars['\n'] == UNKNOWN_SYMBOL);
   m_pSingleChars['\n'] = m_ParagraphSymbol = Value;
 }
 
@@ -174,8 +203,8 @@ void CAlphabetMap::Add(const std::string &Key, symbol Value) {
   //Only single unicode-characters should be added...
   DASHER_ASSERT(m_utf8_count_array[Key[0]]==Key.length());
   if (Key.length() == 1) {
-    DASHER_ASSERT(m_pSingleChars[Key[0]]==Undefined);
-    DASHER_ASSERT(Key[0]!='\r' || m_ParagraphSymbol==Undefined);
+    DASHER_ASSERT(m_pSingleChars[Key[0]]==UNKNOWN_SYMBOL);
+    DASHER_ASSERT(Key[0]!='\r' || m_ParagraphSymbol==UNKNOWN_SYMBOL);
     m_pSingleChars[Key[0]] = Value;
     return;
   }
@@ -212,7 +241,7 @@ void CAlphabetMap::Add(const std::string &Key, symbol Value) {
 }
 
 symbol CAlphabetMap::Get(const std::string &Key) const {
-  if (m_ParagraphSymbol!=Undefined && Key=="\r\n")
+  if (m_ParagraphSymbol!=UNKNOWN_SYMBOL && Key=="\r\n")
     return m_ParagraphSymbol;
   DASHER_ASSERT(m_utf8_count_array[Key[0]]==Key.length());
   if (Key.length() == 1) {
@@ -225,7 +254,7 @@ symbol CAlphabetMap::Get(const std::string &Key) const {
     }
   }
 
-  return Undefined;
+  return UNKNOWN_SYMBOL;
 }
 
 symbol CAlphabetMap::GetSingleChar(char key) const {return m_pSingleChars[key];}
diff --git a/Src/DasherCore/Alphabet/AlphabetMap.h b/Src/DasherCore/Alphabet/AlphabetMap.h
index a35470b..b9c5807 100644
--- a/Src/DasherCore/Alphabet/AlphabetMap.h
+++ b/Src/DasherCore/Alphabet/AlphabetMap.h
@@ -83,11 +83,29 @@ public:
 
   class SymbolStream {
   public:
-    SymbolStream(const CAlphabetMap &_map, std::istream &_in);
-    symbol next();
+    SymbolStream(std::istream &_in);
+    ///Gets the next symbol in the stream, using the specified AlphabetMap
+    /// to convert unicode characters to symbols.
+    /// \return 0 for unknown symbol (not in map); -1 for EOF; else symbol#.
+    symbol next(const CAlphabetMap *map);
+    
+    ///Finds the next complete character in the stream,  but does not advance past it.
+    /// Hence, repeated calls will return the same string.
+    std::string peekAhead();
+    
+    ///Returns the string representation of the previous symbol (i.e. that returned
+    /// by the previous call to next()). Undefined if next() has not been called, or
+    /// if peekAhead() has been called since the last call to next(). Does not change
+    /// the stream position. Useful for debugging.
+    std::string peekBack();
   private:
+    ///Finds beginning of next unicode character, at position 'pos' or later,
+    /// filling buffer and skipping invalid characters as necessary.
+    /// Leaves 'pos' pointing at beginning of said character.
+    /// \return the number of octets representing the next character, or 0 for EOF
+    /// (inc. where the file ends with an incomplete character)
+    inline int findNext();
     void readMore();
-    const CAlphabetMap &map;
     char buf[1024];
     int pos, len;
     std::istream &in;
@@ -142,7 +160,6 @@ private:
      */
   } std::vector < Entry > Entries;
   std::vector < Entry * >HashTable;
-  const symbol Undefined;
   symbol *m_pSingleChars;
   /// both "\r\n" and "\n" are mapped to this (if not Undefined).
   /// This is the only case where >1 character can map to a symbol.
diff --git a/Src/DasherCore/MandarinAlphMgr.cpp b/Src/DasherCore/MandarinAlphMgr.cpp
index 7a1822e..a387a9d 100644
--- a/Src/DasherCore/MandarinAlphMgr.cpp
+++ b/Src/DasherCore/MandarinAlphMgr.cpp
@@ -49,9 +49,11 @@ static char THIS_FILE[] = __FILE__;
 CMandarinAlphMgr::CMandarinAlphMgr(CDasherInterfaceBase *pInterface, CNodeCreationManager *pNCManager, const CAlphInfo *pAlphabet, const CAlphabetMap *pAlphMap)
   : CAlphabetManager(pInterface, pNCManager, pAlphabet, pAlphMap),
     m_pConversionsBySymbol(new set<symbol>[GetAlphabet()->GetNumberTextSymbols()+1]) {
+  DASHER_ASSERT(pAlphabet->m_iConversionID==2);
       
   //the CHAlphabet contains a group for each SPY syllable+tone, with symbols being chinese characters.      
-  const CAlphInfo *pCHAlphabet = pInterface->GetInfo("Chinese 简体中文 (simplified chinese, in pin yin groups)");
+  const CAlphInfo *pCHAlphabet = pInterface->GetInfo(pAlphabet->m_strConversionTarget);
+      
   //Build a map from SPY group label, to set of chinese chars (represented as start & end of group in pCHAlphabet)
   map<string,pair<symbol,symbol> > conversions;
   //Dasher's alphabet format means that space and paragraph can't be put into groups,
@@ -124,7 +126,7 @@ void CMandarinAlphMgr::CreateLanguageModel(CEventHandler *pEventHandler, CSettin
 }
 
 CTrainer *CMandarinAlphMgr::GetTrainer() {
-  return new CMandarinTrainer(m_pLanguageModel, m_pAlphabetMap, &m_CHAlphabetMap);
+  return new CMandarinTrainer(static_cast<CPPMPYLanguageModel*>(m_pLanguageModel), m_pAlphabetMap, &m_CHAlphabetMap, m_pAlphabet->m_strConversionTrainingDelimiter);
 }
 
 CAlphabetManager::CAlphNode *CMandarinAlphMgr::GetRoot(CDasherNode *pParent, unsigned int iLower, unsigned int iUpper, bool bEnteredLast, int iOffset) {
diff --git a/Src/DasherCore/MandarinAlphMgr.h b/Src/DasherCore/MandarinAlphMgr.h
index 7a35a0b..b904179 100644
--- a/Src/DasherCore/MandarinAlphMgr.h
+++ b/Src/DasherCore/MandarinAlphMgr.h
@@ -30,11 +30,42 @@ namespace Dasher {
   /// \ingroup Model
   /// @{
 
-  /// Overides methods of AlphabetManager for changes needed for Mandarin Dasher
-  ///
+  /// Subclass of AlphabetManager, generalizing what's needed for Mandarin Dasher.
+  /// This class, along with PPMPYLanguageModel and MandarinTrainer, implements
+  /// a two-layer / dual-alphabet system for writing symbols in one alphabet (the
+  /// "Chinese" or CH alphabet) by first selecting a symbol in a different, e.g.
+  /// phonetic, alphabet (here, the "Pinyin" or PY alphabet).
+  /// The idea is to relax the usual idea that each sentence in the target (CH) alphabet
+  /// appears in exactly one place on the real line [0,1]; instead, it may appear in arbitrarily
+  /// many different (sub)intervals, but with different PY.
+  /// The possible mappings between PY and CH must be (a) fixed regardless of context, and
+  /// (b) enter precisely one CH symbol for each PY selected; but may be many-many, i.e. the same CH symbol
+  /// may appear under multiple PY symbols. It is defined by two alphabet files, tho only the
+  /// PY alphabet need be a legal alphabet definition - see constructor.
+  /// The language model treats all occurrences of the same CH symbol the same regardless of PY,
+  /// and builds a context of CH symbols only; however, for a given CH context, it predicts
+  /// both the next PY symbol, and the next CH symbol, using distinct counts. (See GetConversions).
+  /// In use, the user first navigates into a PY symbol, but this may not enter any text:
+  /// instead it may offer a choice between multiple CH symbols or "conversions";
+  /// the user navigates into one of these, which is then written, and the process repeats
+  /// (PY-CH-PY-CH...). Some PY symbols offer no choice, i.e. only a single CH symbol, in
+  /// which case the "navigate into CH" step disappears.
+  /// This class is used for alphabets (e.g. PY) with conversionid==2; the conversion target
+  /// attribute of that alphabet identifies the CH alphabet (probably hidden itself).
   class CMandarinAlphMgr : public CAlphabetManager {
   public:
-
+    /// Create a MandarinAlphabetManager!
+    /// \param pAlphabet the Pinyin alphabet. This should have a hierarchy of groups to be
+    /// displayed to the user, and symbols to be predicted by the LM, whose text attributes
+    /// are used to parse the training text. However the symbol display texts are NOT presented
+    /// to the user; instead, for each PY symbol, the CH alphabet (i.e. the PY alphabet's
+    /// "conversion target") must contain exactly one group with the same _displaytext_.
+    /// All (CH) symbols within this (inc. in subgroups, CH hierarchy ignored) are presented to the
+    /// user; however, CH symbols are identified by their _text_, i.e. (unlike normal alphabets)
+    /// the same text may appear in multiple places in the CHAlphabet, and these will be identified
+    /// together (i.e. by hashing on text). Hence, it is not possible to call makeMap() on
+    /// the CHAlphabet (this requires the text attributes to be all different), so we rehash here.
+    /// \param pAlphabetMap mapping from text to symbol# of the PY alphabet; used for training files.
     CMandarinAlphMgr(CDasherInterfaceBase *pInterface, CNodeCreationManager *pNCManager, const CAlphInfo *pAlphabet, const CAlphabetMap *pAlphMap);
     ~CMandarinAlphMgr();
     
diff --git a/Src/DasherCore/Trainer.cpp b/Src/DasherCore/Trainer.cpp
index 9c84991..b34747d 100644
--- a/Src/DasherCore/Trainer.cpp
+++ b/Src/DasherCore/Trainer.cpp
@@ -24,85 +24,40 @@ CTrainer::CTrainer(CLanguageModel *pLanguageModel, const CAlphabetMap *pAlphabet
 void CTrainer::Train(CAlphabetMap::SymbolStream &syms) {
   CLanguageModel::Context sContext = m_pLanguageModel->CreateEmptyContext();
 
-  for(symbol sym; (sym=syms.next())!=-1;) {
+  for(symbol sym; (sym=syms.next(m_pAlphabet))!=-1;) {
       m_pLanguageModel->LearnSymbol(sContext, sym);
   }
   m_pLanguageModel->ReleaseContext(sContext);
 }
 
-CMandarinTrainer::CMandarinTrainer(CLanguageModel *pLanguageModel, const CAlphabetMap *pAlphabet, const CAlphabetMap *pCHAlphabet)
-: CTrainer(pLanguageModel, pAlphabet), m_pCHAlphabet(pCHAlphabet) {
+CMandarinTrainer::CMandarinTrainer(CPPMPYLanguageModel *pLanguageModel, const CAlphabetMap *pAlphabet, const CAlphabetMap *pCHAlphabet, const std::string &strDelim)
+: CTrainer(pLanguageModel, pAlphabet), m_pCHAlphabet(pCHAlphabet), m_strDelim(strDelim) {
 }
 
-//TrainMandarin is used to train Mandarin Dasher: PPMPYLanguageModel
-//Mandarin training is distinct from normal PPM training in that it uses two separate alphabets, and trains with py-character pairs. Despite so, implementation here may seem out of structure, and it could be necessary to revise later, particularly on robustness to deal with non-unicode chars
-//The training of Mandarin Dasher may evolve in to possible paths: 1.Include punctuation (more work); 2.User defined training files (not sure how); 3.Learning as one types (more work)
-//As Manager is produced, training happens in AlphabetManagerFactory
-
-void CMandarinTrainer::LoadFile(const std::string &strPath) {
-  //TrainMandarin takes in the Super Pin Yin Alphabet, and uses the Mandarin Character alphabet stored in private AlphabetManagerFactory
-  FILE * fpTrain = fopen(strPath.c_str(), "rb");
-  
-  if(!fpTrain) {
-    std::cout << "Mandarin Training File: cannot open file or incorrect directory" << std::endl;
-    return;
-  }
+void CMandarinTrainer::Train(CAlphabetMap::SymbolStream &syms) {
   unsigned numberofchar = 0;
-
-
-  const size_t charsize = 1024;
-  const size_t trainBufferSize = 3*charsize*3;
-  char szBuffer[trainBufferSize];
-    
-  std::string strChar;
-  std::string strPY;
   CLanguageModel::Context trainContext = m_pLanguageModel->CreateEmptyContext();
-  std::string pyID = "》";
-  std::vector<symbol> Symchar;
-  std::vector<symbol> Sympy;
-
-  while(!feof(fpTrain)){
-    
-    strPY.clear();
-    strChar.clear();
- 
-    size_t iNumBytes = fread(szBuffer, 1, trainBufferSize, fpTrain);
-    std::string strBuffer = std::string(szBuffer, iNumBytes);
-
-    size_t lim;
-    if(iNumBytes<9*charsize)
-      lim = iNumBytes/9;
-    else
-      lim = charsize;
+  
+  for (string s; (s=syms.peekAhead()).length();) {
+    syms.next(m_pAlphabet); //skip over character at which we just peeked (we don't need the symbol#)
     
-    for (size_t pos=0;;) { //position in 3's counting on 
-
-      while(pos<lim*3)
-        if (pyID.compare(strBuffer.substr(3*pos++,3))==0) break;
-      //leave pos just after the pyID symbol
-      
-      if (pos+1>=lim*3) break;
-      strPY=strBuffer.substr(3*pos++,3);
- 
-      //strBuffer.copy(ctemp,3,3*pos);
-      strChar=strBuffer.substr(3*pos++,3);
-
-      Symchar.clear();
-      Sympy.clear();
-
-      m_pCHAlphabet->GetSymbols(Symchar, strChar);
-      m_pAlphabet->GetSymbols(Sympy, strPY);      
-      DASHER_ASSERT(Symchar.size()==1);
-      DASHER_ASSERT(Sympy.size()==1);
+    if (s == m_strDelim) { //found delimiter, so process next two characters
+      symbol Sympy = syms.next(m_pAlphabet);
+      if (Sympy==-1) break; //EOF
 #ifdef DEBUG
-      if (Symchar[0]<=0)
-        std::cout << "Unknown chinese character " << strChar << std::endl;
+      if (Sympy==0)
+        std::cout << "Unknown pinyin character " << syms.peekBack() << std::endl;
 #endif
-
-      static_cast<CPPMPYLanguageModel *>(m_pLanguageModel)->LearnPYSymbol(trainContext, Sympy[0]); 
-      m_pLanguageModel->LearnSymbol(trainContext, Symchar[0]);
-      numberofchar++;
-    }
+      symbol Symchar = syms.next(m_pCHAlphabet);
+      if (Symchar==-1) break; //EOF...ignore final Pinyin?
+#ifdef DEBUG
+      if (Symchar==0)
+        std::cout << "Unknown chinese character " << syms.peekBack() << std::endl;
+#endif
+      static_cast<CPPMPYLanguageModel *>(m_pLanguageModel)->LearnPYSymbol(trainContext, Sympy);
+      m_pLanguageModel->LearnSymbol(trainContext, Symchar);
+      numberofchar++;    
+    } //else, keep looking for delimiter
   }
-  //std::cout<<"The Length of Training file is  "<<numberofchar<<" bytes/py characters"<<std::endl;  
+  m_pLanguageModel->ReleaseContext(trainContext);
 }
diff --git a/Src/DasherCore/Trainer.h b/Src/DasherCore/Trainer.h
index 017be35..407bd1f 100644
--- a/Src/DasherCore/Trainer.h
+++ b/Src/DasherCore/Trainer.h
@@ -1,7 +1,7 @@
 #ifndef __trainer_h__
 #define __trainer_h__
 
-#include "LanguageModelling/LanguageModel.h"
+#include "LanguageModelling/PPMPYLanguageModel.h"
 #include "TrainingHelper.h"
 
 namespace Dasher {
@@ -16,15 +16,28 @@ namespace Dasher {
     CLanguageModel *m_pLanguageModel;
   };
 	
+  /// Trains a PPMPYLanguageModel (dual alphabet), as for e.g. MandarinDasher.
+  /// The training file is broken down into (delimiter, PY, CH) triples, each
+  /// one unicode character. Every time a delimiter is seen,  we take the next
+  /// unicode character as a symbol (syllable+tone) in the PinYin alphabet
+  /// (identified by symbol _text_), and the character after that, as a symbol
+  /// in the final=converted=Chinese alphabet. We then skip until the next delimiter.
   class CMandarinTrainer : public CTrainer {
   public:
-    CMandarinTrainer(CLanguageModel *pLanguageModel, const CAlphabetMap *pAlphabet, const CAlphabetMap *pCHAlphabet);
+    /// Construct a new MandarinTrainer
+    /// \param pAlphabet mapping from text to symbol# in PY alphabet
+    /// \param pCHAlphabet mapping from text to symbol# (rehashed by MandarinAlphMgr) in CHAlphabet
+    /// \param strDelim delimiter character (1 unicode, maybe >1 octet; if not, will never be matched)
+    CMandarinTrainer(CPPMPYLanguageModel *pLanguageModel, const CAlphabetMap *pAlphabet, const CAlphabetMap *pCHAlphabet, const std::string &strDelim);
 
+  protected:
     //override...
-    virtual void LoadFile(const std::string &strPath);
+    virtual void Train(CAlphabetMap::SymbolStream &syms);
     
   private:
     const CAlphabetMap *m_pCHAlphabet;
+    ///Delimiter, as above. 
+    const std::string m_strDelim;
   };
 
 }
diff --git a/Src/DasherCore/TrainingHelper.cpp b/Src/DasherCore/TrainingHelper.cpp
index 3b376c3..de72c95 100644
--- a/Src/DasherCore/TrainingHelper.cpp
+++ b/Src/DasherCore/TrainingHelper.cpp
@@ -73,7 +73,7 @@ Dasher::CTrainingHelper::LoadPlain(const std::string &strFileName) {
       return;
     }
 
-  CAlphabetMap::SymbolStream syms(*m_pAlphabet, in);
+  CAlphabetMap::SymbolStream syms(in);
   Train(syms);
 
   in.close();
@@ -128,7 +128,7 @@ void
 Dasher::CTrainingHelper::HandleEndElement(const XML_Char *szName) {
   if(!strcmp(szName, "segment")) {
     std::istringstream in(m_strCurrentText);
-    CAlphabetMap::SymbolStream syms(*m_pAlphabet,in);
+    CAlphabetMap::SymbolStream syms(in);
     Train(syms);
     
     m_bInSegment = false;
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]