[dasher] Special-case paragraph symbol to match \n _or_ \r\n in existing text
- From: Patrick Welche <pwelche src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [dasher] Special-case paragraph symbol to match \n _or_ \r\n in existing text
- Date: Tue, 18 Jan 2011 17:19:51 +0000 (UTC)
commit 90f77a44df52e6a4f0d49579508d5206e3dc64d0
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date: Fri Nov 26 15:12:36 2010 +0000
Special-case paragraph symbol to match \n _or_ \r\n in existing text
(Always outputs the platform default).
This is the only case in which multiple unicode chars map to one Dasher symbol.
Src/DasherCore/Alphabet/AlphIO.cpp | 2 -
Src/DasherCore/Alphabet/AlphInfo.cpp | 3 +-
Src/DasherCore/Alphabet/AlphInfo.h | 4 +-
Src/DasherCore/Alphabet/AlphabetMap.cpp | 26 +++++++++++++++++++++-
Src/DasherCore/Alphabet/AlphabetMap.h | 15 ++++++++-----
Src/DasherCore/AlphabetManager.cpp | 34 ++++++++++++++++++++++++------
Src/DasherCore/AlphabetManager.h | 8 +++++-
Src/DasherCore/DasherModel.cpp | 4 +-
8 files changed, 72 insertions(+), 24 deletions(-)
---
diff --git a/Src/DasherCore/Alphabet/AlphIO.cpp b/Src/DasherCore/Alphabet/AlphIO.cpp
index 342d7e3..4d7637f 100644
--- a/Src/DasherCore/Alphabet/AlphIO.cpp
+++ b/Src/DasherCore/Alphabet/AlphIO.cpp
@@ -476,13 +476,11 @@ void CAlphIO::XML_StartElement(void *userData, const XML_Char *name, const XML_C
if(strcmp(name, "paragraph") == 0) {
if (!Me->ParagraphCharacter) Me->ParagraphCharacter=new CAlphInfo::character();
Me->ReadCharAtts(atts,*(Me->ParagraphCharacter));
- if(Me->ParagraphCharacter->Display != "") {
#ifdef WIN32
Me->ParagraphCharacter->Text = "\r\n";
#else
Me->ParagraphCharacter->Text = "\n";
#endif
- }
return;
}
if(strcmp(name, "control") == 0) {
diff --git a/Src/DasherCore/Alphabet/AlphInfo.cpp b/Src/DasherCore/Alphabet/AlphInfo.cpp
index 89aa9e4..74fd688 100644
--- a/Src/DasherCore/Alphabet/AlphInfo.cpp
+++ b/Src/DasherCore/Alphabet/AlphInfo.cpp
@@ -72,9 +72,10 @@ CAlphInfo::GetColour(symbol i, int iPhase) const {
CAlphabetMap *CAlphInfo::MakeMap() const {
CAlphabetMap *map = new CAlphabetMap();
+ if (iParagraphCharacter!=0) map->AddParagraphSymbol(iParagraphCharacter);
int i;
for(i = 0; i < m_vCharacters.size(); i++) {
- map->Add(m_vCharacters[i].Text, i+1); //1-indexed
+ if (i+1!=iParagraphCharacter) map->Add(m_vCharacters[i].Text, i+1); //1-indexed
}
//ACL I'm really not sure where conversion characters should/shouldn't be included.
// They seemed to be included in the Alphabet Map, i.e. for reading training text via GetSymbols;
diff --git a/Src/DasherCore/Alphabet/AlphInfo.h b/Src/DasherCore/Alphabet/AlphInfo.h
index 06bc7d1..585e197 100644
--- a/Src/DasherCore/Alphabet/AlphInfo.h
+++ b/Src/DasherCore/Alphabet/AlphInfo.h
@@ -68,8 +68,8 @@ private:
};
public:
/// Return number of text symbols - inc space and para, but no control/conversion start/end
- /// Note symbol numbers are 1-indexed; 0 is reserved (for the root symbol, or for
- /// element 0 of the probability array to contain a 0)
+ /// Note symbol numbers are 1-indexed; 0 is reserved to indicate an "unknown symbol" (-1 = End-Of-Stream),
+ /// and for element 0 of the probability array to contain a 0.
int GetNumberTextSymbols() const {return m_vCharacters.size();}
Opts::ScreenOrientations GetOrientation() const {return Orientation;}
diff --git a/Src/DasherCore/Alphabet/AlphabetMap.cpp b/Src/DasherCore/Alphabet/AlphabetMap.cpp
index 4ff8a6d..f27705e 100644
--- a/Src/DasherCore/Alphabet/AlphabetMap.cpp
+++ b/Src/DasherCore/Alphabet/AlphabetMap.cpp
@@ -117,8 +117,16 @@ symbol CAlphabetMap::SymbolStream::next()
#endif
++pos;
}
- if (numChars == 1)
+ if (numChars == 1) {
+ if (map.m_ParagraphSymbol!=map.Undefined && buf[pos]=='\r') {
+ DASHER_ASSERT(pos+1<len || len<1024); //there are more characters (we should have read utf8...max_length), or else input is exhausted
+ if (pos+1<len && buf[pos+1]=='\n') {
+ pos+=2;
+ return map.m_ParagraphSymbol;
+ }
+ }
return map.GetSingleChar(buf[pos++]);
+ }
if (pos+numChars > len) {
//no more bytes in file (would have tried to read earlier), but not enough for char
#ifdef DEBUG
@@ -143,7 +151,7 @@ void CAlphabetMap::GetSymbols(std::vector<symbol>& Symbols, const std::string& I
CAlphabetMap::CAlphabetMap(unsigned int InitialTableSize)
-:HashTable(InitialTableSize <<1), Undefined(0) {
+:HashTable(InitialTableSize <<1), Undefined(0), m_ParagraphSymbol(Undefined) {
Entries.reserve(InitialTableSize);
const int numChars = numeric_limits<char>::max() + 1;
@@ -155,8 +163,19 @@ CAlphabetMap::~CAlphabetMap() {
delete m_pSingleChars;
}
+void CAlphabetMap::AddParagraphSymbol(symbol Value) {
+ DASHER_ASSERT (m_ParagraphSymbol==Undefined);
+ DASHER_ASSERT (m_pSingleChars['\r'] == Undefined);
+ DASHER_ASSERT (m_pSingleChars['\n'] == Undefined);
+ m_pSingleChars['\n'] = m_ParagraphSymbol = Value;
+}
+
void CAlphabetMap::Add(const std::string &Key, symbol Value) {
+ //Only single unicode-characters should be added...
+ DASHER_ASSERT(m_utf8_count_array[Key[0]]==Key.length());
if (Key.length() == 1) {
+ DASHER_ASSERT(m_pSingleChars[Key[0]]==Undefined);
+ DASHER_ASSERT(Key[0]!='\r' || m_ParagraphSymbol==Undefined);
m_pSingleChars[Key[0]] = Value;
return;
}
@@ -195,6 +214,9 @@ void CAlphabetMap::Add(const std::string &Key, symbol Value) {
}
symbol CAlphabetMap::Get(const std::string &Key) const {
+ if (m_ParagraphSymbol!=Undefined && Key=="\r\n")
+ return m_ParagraphSymbol;
+ DASHER_ASSERT(m_utf8_count_array[Key[0]]==Key.length());
if (Key.length() == 1) {
return GetSingleChar(Key[0]);
}
diff --git a/Src/DasherCore/Alphabet/AlphabetMap.h b/Src/DasherCore/Alphabet/AlphabetMap.h
index 6aa2773..fedad4f 100644
--- a/Src/DasherCore/Alphabet/AlphabetMap.h
+++ b/Src/DasherCore/Alphabet/AlphabetMap.h
@@ -34,12 +34,11 @@ namespace Dasher {
/// class, to allow creation/setup of the map).
///
/// Ian clearly had reservations about this system, as follows; and I'd add
-/// that support for multi-unicode-character symbols (such as the "asdf"
-/// suggested below) is extremely dubious - both here and elsewhere (e.g.
-/// what if "asd" is also a symbol) - but we really need to clarify whether
-/// such symbols are supposed to be supported or not. Most of the fun here
-/// comes from supporting single unicode characters which are multiple
-/// octets,as we use std::string (which works in octets) for everything...
+/// that much of the fun comes from supporting single unicode characters
+/// which are multiple octets, as we use std::string (which works in octets)
+/// for everything... Note that we do *not* support multi-unicode-character
+/// symbols (such as the "asdf" suggested below) except in the case of "\r\n"
+/// for the paragraph symbol.
/// Anyway, Ian writes:
///
/// If I were just using GCC, which comes with the CGI "STL" implementation, I would
@@ -108,6 +107,7 @@ public:
private:
friend class CAlphInfo;
CAlphabetMap(unsigned int InitialTableSize = 255);
+ void AddParagraphSymbol(symbol Value);
void Add(const std::string & Key, symbol Value);
class Entry {
@@ -145,6 +145,9 @@ private:
std::vector < Entry * >HashTable;
const symbol Undefined;
symbol *m_pSingleChars;
+ /// both "\r\n" and "\n" are mapped to this (if not Undefined).
+ /// This is the only case where >1 character can map to a symbol.
+ symbol m_ParagraphSymbol;
};
/// \}
diff --git a/Src/DasherCore/AlphabetManager.cpp b/Src/DasherCore/AlphabetManager.cpp
index 1c760e5..ca47332 100644
--- a/Src/DasherCore/AlphabetManager.cpp
+++ b/Src/DasherCore/AlphabetManager.cpp
@@ -170,7 +170,7 @@ CLanguageModel::Context CAlphabetManager::CAlphNode::CloneAlphContext(CLanguageM
void CAlphabetManager::CSymbolNode::GetContext(CDasherInterfaceBase *pInterface, const CAlphabetMap *pAlphabetMap, vector<symbol> &vContextSymbols, int iOffset, int iLength) {
if (!GetFlag(NF_SEEN) && iOffset+iLength-1 == offset()) {
- if (iLength > 1) Parent()->GetContext(pInterface, pAlphabetMap, vContextSymbols, iOffset, iLength-1);
+ if (iLength > 1) Parent()->GetContext(pInterface, pAlphabetMap, vContextSymbols, iOffset, iLength-numChars());
vContextSymbols.push_back(iSymbol);
} else {
CDasherNode::GetContext(pInterface, pAlphabetMap, vContextSymbols, iOffset, iLength);
@@ -303,8 +303,12 @@ CLanguageModel::Context CAlphabetManager::CreateSymbolContext(CAlphNode *pParent
CDasherNode *CAlphabetManager::CreateSymbolNode(CAlphNode *pParent, symbol iSymbol, unsigned int iLbnd, unsigned int iHbnd) {
// TODO: Exceptions / error handling in general
-
- CAlphNode *pAlphNode = makeSymbol(pParent, pParent->offset()+1, iLbnd, iHbnd, iSymbol);
+
+ // Uniquely, a paragraph symbol can be two characters
+ // (and we can't call numChars() on the symbol before we've constructed it!)
+ int iNewOffset = pParent->offset()+1;
+ if (m_pAlphabet->GetText(iSymbol)=="\r\n") iNewOffset++;
+ CSymbolNode *pAlphNode = makeSymbol(pParent, iNewOffset, iLbnd, iHbnd, iSymbol);
// std::stringstream ssLabel;
@@ -321,7 +325,7 @@ CDasherNode *CAlphabetManager::CSymbolNode::RebuildSymbol(CAlphNode *pParent, sy
if(iSymbol == this->iSymbol) {
SetRange(iLbnd, iHbnd);
SetParent(pParent);
- DASHER_ASSERT(offset() == pParent->offset() + 1);
+ DASHER_ASSERT(offset() == pParent->offset() + numChars());
return this;
}
return m_pMgr->CreateSymbolNode(pParent, iSymbol, iLbnd, iHbnd);
@@ -416,16 +420,31 @@ void CAlphabetManager::AddExtras(CAlphNode *pParent, vector<unsigned int> *pCPro
}
}
-
CAlphabetManager::CAlphNode::~CAlphNode() {
delete m_pProbInfo;
m_pMgr->m_pLanguageModel->ReleaseContext(iContext);
}
const std::string &CAlphabetManager::CSymbolNode::outputText() {
+ if (iSymbol == m_pMgr->m_pAlphabet->GetParagraphSymbol() && GetFlag(NF_SEEN)) {
+ //Regardless of this particular platform's definition of a newline,
+ // which is what we'd _output_, when reversing back over text
+ // which may have been produced elsewhere, we represent occurrences
+ // of _either_ \n or \r\n by a single paragraph symbol.
+ //If the alphabet has a paragraph symbol, \r is not a symbol on its own
+ // (and \n isn't a symbol other than paragraph). So look for a
+ // \r before the \n.
+ DASHER_ASSERT(m_pMgr->m_pInterface->GetContext(offset(),1)=="\n");
+ static std::string rn("\r\n"),n("\n"); //must store strings somewhere to return by reference!
+ return (m_pMgr->m_pInterface->GetContext(offset()-1,2)=="\r\n") ? rn : n;
+ }
return mgr()->m_pAlphabet->GetText(iSymbol);
}
+int CAlphabetManager::CSymbolNode::numChars() {
+ return (outputText()=="\r\n") ? 2 : 1;
+}
+
void CAlphabetManager::CSymbolNode::Output(Dasher::VECTOR_SYMBOL_PROB* pAdded, int iNormalization) {
//std::cout << this << " " << Parent() << ": Output at offset " << m_iOffset << " *" << m_pMgr->m_pAlphabet->GetText(t) << "* " << std::endl;
@@ -439,6 +458,7 @@ void CAlphabetManager::CSymbolNode::Output(Dasher::VECTOR_SYMBOL_PROB* pAdded, i
}
void CAlphabetManager::CSymbolNode::Undo(int *pNumDeleted) {
+ DASHER_ASSERT(GetFlag(NF_SEEN));
Dasher::CEditEvent oEvent(2, outputText(), offset());
m_pMgr->m_pNCManager->InsertEvent(&oEvent);
if (pNumDeleted) (*pNumDeleted)++;
@@ -454,8 +474,8 @@ CDasherNode *CAlphabetManager::CGroupNode::RebuildParent() {
}
CDasherNode *CAlphabetManager::CSymbolNode::RebuildParent() {
- //parent's offset is one less than this.
- return CAlphNode::RebuildParent(offset()-1);
+ //parent's offset is usually one less than this, but can be two less for the paragraph symbol.
+ return CAlphNode::RebuildParent(offset()-numChars());
}
CDasherNode *CAlphabetManager::CAlphNode::RebuildParent(int iNewOffset) {
diff --git a/Src/DasherCore/AlphabetManager.h b/Src/DasherCore/AlphabetManager.h
index 3d7a3b7..7777bbd 100644
--- a/Src/DasherCore/AlphabetManager.h
+++ b/Src/DasherCore/AlphabetManager.h
@@ -92,9 +92,13 @@ namespace Dasher {
const symbol iSymbol;
virtual CDasherNode *RebuildSymbol(CAlphNode *pParent, symbol iSymbol, unsigned int iLbnd, unsigned int iHbnd);
virtual CGroupNode *RebuildGroup(CAlphNode *pParent, const SGroupInfo *pInfo, unsigned int iLbnd, unsigned int iHbnd);
- private:
- virtual const std::string &outputText();
protected:
+ virtual const std::string &outputText();
+ /// Number of unicode _characters_ (not octets) for this symbol.
+ /// Uniquely, a paragraph symbol can enter two distinct unicode characters
+ /// (i.e. '\r' and '\n'); every other symbol enters only a single
+ /// unicode char, even if that might take >1 octet.
+ int numChars();
///Compatibility constructor, so that subclasses can specify their own colour & label
CSymbolNode(CDasherNode *pParent, int iOffset, unsigned int iLbnd, unsigned int iHbnd, int iColour, const std::string &strDisplayText, CAlphabetManager *pMgr, symbol _iSymbol);
};
diff --git a/Src/DasherCore/DasherModel.cpp b/Src/DasherCore/DasherModel.cpp
index 2265bf2..d703d72 100644
--- a/Src/DasherCore/DasherModel.cpp
+++ b/Src/DasherCore/DasherModel.cpp
@@ -492,9 +492,9 @@ void CDasherModel::OutputTo(CDasherNode *pNewNode, Dasher::VECTOR_SYMBOL_PROB* p
pNewNode->Enter();
m_pLastOutput = pNewNode;
- pNewNode->SetFlag(NF_SEEN, true);
pNewNode->Output(pAdded, GetLongParameter(LP_NORMALIZATION));
-
+ pNewNode->SetFlag(NF_SEEN, true); //becomes NF_SEEN after output.
+
// If the node we are outputting is the last one in a game target sentence, then
// notify the game mode teacher.
if(m_bGameMode)
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]