[dasher] Special-case paragraph symbol to match \n _or_ \r\n in existing text



commit 90f77a44df52e6a4f0d49579508d5206e3dc64d0
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date:   Fri Nov 26 15:12:36 2010 +0000

    Special-case paragraph symbol to match \n _or_ \r\n in existing text
    
    (Output always uses the platform-default line ending.)
    
    This is the only case in which multiple unicode chars map to one Dasher symbol.

 Src/DasherCore/Alphabet/AlphIO.cpp      |    2 -
 Src/DasherCore/Alphabet/AlphInfo.cpp    |    3 +-
 Src/DasherCore/Alphabet/AlphInfo.h      |    4 +-
 Src/DasherCore/Alphabet/AlphabetMap.cpp |   26 +++++++++++++++++++++-
 Src/DasherCore/Alphabet/AlphabetMap.h   |   15 ++++++++-----
 Src/DasherCore/AlphabetManager.cpp      |   34 ++++++++++++++++++++++++------
 Src/DasherCore/AlphabetManager.h        |    8 +++++-
 Src/DasherCore/DasherModel.cpp          |    4 +-
 8 files changed, 72 insertions(+), 24 deletions(-)
---
diff --git a/Src/DasherCore/Alphabet/AlphIO.cpp b/Src/DasherCore/Alphabet/AlphIO.cpp
index 342d7e3..4d7637f 100644
--- a/Src/DasherCore/Alphabet/AlphIO.cpp
+++ b/Src/DasherCore/Alphabet/AlphIO.cpp
@@ -476,13 +476,11 @@ void CAlphIO::XML_StartElement(void *userData, const XML_Char *name, const XML_C
   if(strcmp(name, "paragraph") == 0) {
     if (!Me->ParagraphCharacter) Me->ParagraphCharacter=new CAlphInfo::character();
     Me->ReadCharAtts(atts,*(Me->ParagraphCharacter));
-    if(Me->ParagraphCharacter->Display != "") {
 #ifdef WIN32
         Me->ParagraphCharacter->Text = "\r\n";
 #else
         Me->ParagraphCharacter->Text = "\n";
 #endif
-    }
     return;
   }
   if(strcmp(name, "control") == 0) {
diff --git a/Src/DasherCore/Alphabet/AlphInfo.cpp b/Src/DasherCore/Alphabet/AlphInfo.cpp
index 89aa9e4..74fd688 100644
--- a/Src/DasherCore/Alphabet/AlphInfo.cpp
+++ b/Src/DasherCore/Alphabet/AlphInfo.cpp
@@ -72,9 +72,10 @@ CAlphInfo::GetColour(symbol i, int iPhase) const {
 
 CAlphabetMap *CAlphInfo::MakeMap() const {
   CAlphabetMap *map = new CAlphabetMap();
+  if (iParagraphCharacter!=0) map->AddParagraphSymbol(iParagraphCharacter);
   int i;
   for(i = 0; i < m_vCharacters.size(); i++) {
-    map->Add(m_vCharacters[i].Text, i+1); //1-indexed
+    if (i+1!=iParagraphCharacter) map->Add(m_vCharacters[i].Text, i+1); //1-indexed
   }
   //ACL I'm really not sure where conversion characters should/shouldn't be included.
   // They seemed to be included in the Alphabet Map, i.e. for reading training text via GetSymbols;
diff --git a/Src/DasherCore/Alphabet/AlphInfo.h b/Src/DasherCore/Alphabet/AlphInfo.h
index 06bc7d1..585e197 100644
--- a/Src/DasherCore/Alphabet/AlphInfo.h
+++ b/Src/DasherCore/Alphabet/AlphInfo.h
@@ -68,8 +68,8 @@ private:
   };  
 public:
   /// Return number of text symbols - inc space and para, but no control/conversion start/end
-  /// Note symbol numbers are 1-indexed; 0 is reserved (for the root symbol, or for
-  /// element 0 of the probability array to contain a 0)
+  /// Note symbol numbers are 1-indexed; 0 is reserved to indicate an "unknown symbol" (-1 = End-Of-Stream),
+  /// and for element 0 of the probability array to contain a 0.
   int GetNumberTextSymbols() const {return m_vCharacters.size();}
   
   Opts::ScreenOrientations GetOrientation() const {return Orientation;} 
diff --git a/Src/DasherCore/Alphabet/AlphabetMap.cpp b/Src/DasherCore/Alphabet/AlphabetMap.cpp
index 4ff8a6d..f27705e 100644
--- a/Src/DasherCore/Alphabet/AlphabetMap.cpp
+++ b/Src/DasherCore/Alphabet/AlphabetMap.cpp
@@ -117,8 +117,16 @@ symbol CAlphabetMap::SymbolStream::next()
 #endif
     ++pos;
   }
-  if (numChars == 1)
+  if (numChars == 1) {
+    if (map.m_ParagraphSymbol!=map.Undefined && buf[pos]=='\r') {
+      DASHER_ASSERT(pos+1<len || len<1024); //there are more characters (we should have read utf8...max_length), or else input is exhausted
+      if (pos+1<len && buf[pos+1]=='\n') {
+        pos+=2;
+        return map.m_ParagraphSymbol;
+      }
+    }
     return map.GetSingleChar(buf[pos++]);
+  }
   if (pos+numChars > len) {
     //no more bytes in file (would have tried to read earlier), but not enough for char
 #ifdef DEBUG
@@ -143,7 +151,7 @@ void CAlphabetMap::GetSymbols(std::vector<symbol>& Symbols, const std::string& I
 
 
 CAlphabetMap::CAlphabetMap(unsigned int InitialTableSize)
-:HashTable(InitialTableSize <<1), Undefined(0) {
+:HashTable(InitialTableSize <<1), Undefined(0), m_ParagraphSymbol(Undefined) {
   Entries.reserve(InitialTableSize);
 
   const int numChars = numeric_limits<char>::max() + 1;
@@ -155,8 +163,19 @@ CAlphabetMap::~CAlphabetMap() {
   delete m_pSingleChars;
 }
 
+void CAlphabetMap::AddParagraphSymbol(symbol Value) {
+  DASHER_ASSERT (m_ParagraphSymbol==Undefined);
+  DASHER_ASSERT (m_pSingleChars['\r'] == Undefined);
+  DASHER_ASSERT (m_pSingleChars['\n'] == Undefined);
+  m_pSingleChars['\n'] = m_ParagraphSymbol = Value;
+}
+
 void CAlphabetMap::Add(const std::string &Key, symbol Value) {
+  //Only single unicode-characters should be added...
+  DASHER_ASSERT(m_utf8_count_array[Key[0]]==Key.length());
   if (Key.length() == 1) {
+    DASHER_ASSERT(m_pSingleChars[Key[0]]==Undefined);
+    DASHER_ASSERT(Key[0]!='\r' || m_ParagraphSymbol==Undefined);
     m_pSingleChars[Key[0]] = Value;
     return;
   }
@@ -195,6 +214,9 @@ void CAlphabetMap::Add(const std::string &Key, symbol Value) {
 }
 
 symbol CAlphabetMap::Get(const std::string &Key) const {
+  if (m_ParagraphSymbol!=Undefined && Key=="\r\n")
+    return m_ParagraphSymbol;
+  DASHER_ASSERT(m_utf8_count_array[Key[0]]==Key.length());
   if (Key.length() == 1) {
 	return GetSingleChar(Key[0]);
   }
diff --git a/Src/DasherCore/Alphabet/AlphabetMap.h b/Src/DasherCore/Alphabet/AlphabetMap.h
index 6aa2773..fedad4f 100644
--- a/Src/DasherCore/Alphabet/AlphabetMap.h
+++ b/Src/DasherCore/Alphabet/AlphabetMap.h
@@ -34,12 +34,11 @@ namespace Dasher {
 /// class, to allow creation/setup of the map).
 ///
 /// Ian clearly had reservations about this system, as follows; and I'd add
-/// that support for multi-unicode-character symbols (such as the "asdf"
-/// suggested below) is extremely dubious - both here and elsewhere (e.g.
-/// what if "asd" is also a symbol) - but we really need to clarify whether
-/// such symbols are supposed to be supported or not. Most of the fun here
-/// comes from supporting single unicode characters which are multiple
-/// octets,as we use  std::string (which works in octets) for everything...
+/// that much of the fun comes from supporting single unicode characters
+/// which are multiple octets, as we use std::string (which works in octets)
+/// for everything... Note that we do *not* support multi-unicode-character
+/// symbols (such as the "asdf" suggested below) except in the case of "\r\n"
+/// for the paragraph symbol.
 /// Anyway, Ian writes:
 ///
 /// If I were just using GCC, which comes with the CGI "STL" implementation, I would
@@ -108,6 +107,7 @@ public:
 private:
   friend class CAlphInfo;
   CAlphabetMap(unsigned int InitialTableSize = 255);
+  void AddParagraphSymbol(symbol Value);
   void Add(const std::string & Key, symbol Value);
 
   class Entry {
@@ -145,6 +145,9 @@ private:
   std::vector < Entry * >HashTable;
   const symbol Undefined;
   symbol *m_pSingleChars;
+  /// both "\r\n" and "\n" are mapped to this (if not Undefined).
+  /// This is the only case where >1 character can map to a symbol.
+  symbol m_ParagraphSymbol;
 };
 /// \}
 
diff --git a/Src/DasherCore/AlphabetManager.cpp b/Src/DasherCore/AlphabetManager.cpp
index 1c760e5..ca47332 100644
--- a/Src/DasherCore/AlphabetManager.cpp
+++ b/Src/DasherCore/AlphabetManager.cpp
@@ -170,7 +170,7 @@ CLanguageModel::Context CAlphabetManager::CAlphNode::CloneAlphContext(CLanguageM
 
 void CAlphabetManager::CSymbolNode::GetContext(CDasherInterfaceBase *pInterface, const CAlphabetMap *pAlphabetMap, vector<symbol> &vContextSymbols, int iOffset, int iLength) {
   if (!GetFlag(NF_SEEN) && iOffset+iLength-1 == offset()) {
-    if (iLength > 1) Parent()->GetContext(pInterface, pAlphabetMap, vContextSymbols, iOffset, iLength-1);
+    if (iLength > 1) Parent()->GetContext(pInterface, pAlphabetMap, vContextSymbols, iOffset, iLength-numChars());
     vContextSymbols.push_back(iSymbol);
   } else {
     CDasherNode::GetContext(pInterface, pAlphabetMap, vContextSymbols, iOffset, iLength);
@@ -303,8 +303,12 @@ CLanguageModel::Context CAlphabetManager::CreateSymbolContext(CAlphNode *pParent
 CDasherNode *CAlphabetManager::CreateSymbolNode(CAlphNode *pParent, symbol iSymbol, unsigned int iLbnd, unsigned int iHbnd) {
 
     // TODO: Exceptions / error handling in general
-
-    CAlphNode *pAlphNode = makeSymbol(pParent, pParent->offset()+1, iLbnd, iHbnd, iSymbol);
+    
+    // Uniquely, a paragraph symbol can be two characters
+    // (and we can't call numChars() on the symbol before we've constructed it!)
+    int iNewOffset = pParent->offset()+1;
+    if (m_pAlphabet->GetText(iSymbol)=="\r\n") iNewOffset++;
+    CSymbolNode *pAlphNode = makeSymbol(pParent, iNewOffset, iLbnd, iHbnd, iSymbol);
 
     //     std::stringstream ssLabel;
 
@@ -321,7 +325,7 @@ CDasherNode *CAlphabetManager::CSymbolNode::RebuildSymbol(CAlphNode *pParent, sy
   if(iSymbol == this->iSymbol) {
     SetRange(iLbnd, iHbnd);
     SetParent(pParent);
-    DASHER_ASSERT(offset() == pParent->offset() + 1);
+    DASHER_ASSERT(offset() == pParent->offset() + numChars());
     return this;
   }
   return m_pMgr->CreateSymbolNode(pParent, iSymbol, iLbnd, iHbnd);
@@ -416,16 +420,31 @@ void CAlphabetManager::AddExtras(CAlphNode *pParent, vector<unsigned int> *pCPro
   }
 }
 
-
 CAlphabetManager::CAlphNode::~CAlphNode() {
   delete m_pProbInfo;
   m_pMgr->m_pLanguageModel->ReleaseContext(iContext);
 }
 
 const std::string &CAlphabetManager::CSymbolNode::outputText() {
+  if (iSymbol == m_pMgr->m_pAlphabet->GetParagraphSymbol() && GetFlag(NF_SEEN)) {
+    //Regardless of this particular platform's definition of a newline,
+    // which is what we'd _output_, when reversing back over text
+    // which may have been produced elsewhere, we represent occurrences
+    // of _either_ \n or \r\n by a single paragraph symbol.
+    //If the alphabet has a paragraph symbol, \r is not a symbol on its own
+    // (and \n isn't a symbol other than paragraph). So look for a
+    // \r before the \n.
+    DASHER_ASSERT(m_pMgr->m_pInterface->GetContext(offset(),1)=="\n");
+    static std::string rn("\r\n"),n("\n"); //must store strings somewhere to return by reference!
+    return (m_pMgr->m_pInterface->GetContext(offset()-1,2)=="\r\n") ? rn : n;
+  }
   return mgr()->m_pAlphabet->GetText(iSymbol);
 }
 
+int CAlphabetManager::CSymbolNode::numChars() {
+  return (outputText()=="\r\n") ? 2 : 1;
+}
+
 void CAlphabetManager::CSymbolNode::Output(Dasher::VECTOR_SYMBOL_PROB* pAdded, int iNormalization) {
   //std::cout << this << " " << Parent() << ": Output at offset " << m_iOffset << " *" << m_pMgr->m_pAlphabet->GetText(t) << "* " << std::endl;
 
@@ -439,6 +458,7 @@ void CAlphabetManager::CSymbolNode::Output(Dasher::VECTOR_SYMBOL_PROB* pAdded, i
 }
 
 void CAlphabetManager::CSymbolNode::Undo(int *pNumDeleted) {
+  DASHER_ASSERT(GetFlag(NF_SEEN));
   Dasher::CEditEvent oEvent(2, outputText(), offset());
   m_pMgr->m_pNCManager->InsertEvent(&oEvent);
   if (pNumDeleted) (*pNumDeleted)++;
@@ -454,8 +474,8 @@ CDasherNode *CAlphabetManager::CGroupNode::RebuildParent() {
 }
 
 CDasherNode *CAlphabetManager::CSymbolNode::RebuildParent() {
-  //parent's offset is one less than this.
-  return CAlphNode::RebuildParent(offset()-1);
+  //parent's offset is usually one less than this, but can be two less for the paragraph symbol.
+  return CAlphNode::RebuildParent(offset()-numChars());
 }
 
 CDasherNode *CAlphabetManager::CAlphNode::RebuildParent(int iNewOffset) {
diff --git a/Src/DasherCore/AlphabetManager.h b/Src/DasherCore/AlphabetManager.h
index 3d7a3b7..7777bbd 100644
--- a/Src/DasherCore/AlphabetManager.h
+++ b/Src/DasherCore/AlphabetManager.h
@@ -92,9 +92,13 @@ namespace Dasher {
       const symbol iSymbol;
       virtual CDasherNode *RebuildSymbol(CAlphNode *pParent, symbol iSymbol, unsigned int iLbnd, unsigned int iHbnd);
       virtual CGroupNode *RebuildGroup(CAlphNode *pParent, const SGroupInfo *pInfo, unsigned int iLbnd, unsigned int iHbnd);
-    private:
-      virtual const std::string &outputText();
     protected:
+      virtual const std::string &outputText();
+      /// Number of unicode _characters_ (not octets) for this symbol.
+      /// Uniquely, a paragraph symbol can enter two distinct unicode characters
+      /// (i.e. '\r' and '\n'); every other symbol enters only a single 
+      /// unicode char, even if that might take >1 octet.
+      int numChars();
       ///Compatibility constructor, so that subclasses can specify their own colour & label
       CSymbolNode(CDasherNode *pParent, int iOffset, unsigned int iLbnd, unsigned int iHbnd, int iColour, const std::string &strDisplayText, CAlphabetManager *pMgr, symbol _iSymbol);
     };
diff --git a/Src/DasherCore/DasherModel.cpp b/Src/DasherCore/DasherModel.cpp
index 2265bf2..d703d72 100644
--- a/Src/DasherCore/DasherModel.cpp
+++ b/Src/DasherCore/DasherModel.cpp
@@ -492,9 +492,9 @@ void CDasherModel::OutputTo(CDasherNode *pNewNode, Dasher::VECTOR_SYMBOL_PROB* p
     pNewNode->Enter();
     
     m_pLastOutput = pNewNode;
-    pNewNode->SetFlag(NF_SEEN, true);
     pNewNode->Output(pAdded, GetLongParameter(LP_NORMALIZATION));
-    
+    pNewNode->SetFlag(NF_SEEN, true); //becomes NF_SEEN after output.
+
     // If the node we are outputting is the last one in a game target sentence, then
     // notify the game mode teacher.
     if(m_bGameMode)



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]