[dasher: 3/5] Removed Alphabet::GetSymbols(vector, stream) - use SymbolStream instead
- From: Patrick Welche <pwelche src gnome org>
- To: svn-commits-list gnome org
- Cc:
- Subject: [dasher: 3/5] Removed Alphabet::GetSymbols(vector, stream) - use SymbolStream instead
- Date: Thu, 7 Jan 2010 14:20:46 +0000 (UTC)
commit a449ba6d0d9c958d9b56e87afd0926992182d9ec
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date: Tue Dec 22 10:34:54 2009 +0000
Removed Alphabet::GetSymbols(vector, stream) - use SymbolStream instead
Converts one UTF8-character at a time, avoiding creating a huge vector<symbol>
Src/DasherCore/Alphabet/Alphabet.cpp | 87 +++++++++++++++++++++------------
Src/DasherCore/Alphabet/Alphabet.h | 15 +++++-
Src/DasherCore/Trainer.cpp | 6 +-
Src/DasherCore/Trainer.h | 5 +-
Src/DasherCore/TrainingHelper.cpp | 13 ++---
Src/DasherCore/TrainingHelper.h | 2 +-
6 files changed, 81 insertions(+), 47 deletions(-)
---
diff --git a/Src/DasherCore/Alphabet/Alphabet.cpp b/Src/DasherCore/Alphabet/Alphabet.cpp
index 328a1ab..ef6db05 100644
--- a/Src/DasherCore/Alphabet/Alphabet.cpp
+++ b/Src/DasherCore/Alphabet/Alphabet.cpp
@@ -149,46 +149,69 @@ int CAlphabet::utf8_length::operator[](const int i) const
return utf8_count_array[i];
}
-void CAlphabet::GetSymbols(std::vector<symbol> &symbols, std::istream &in) const
+CAlphabet::SymbolStream::SymbolStream(const CAlphabet *pAlph, std::istream &_in)
+: map(pAlph->TextMap), in(_in), pos(0), len(0) {
+ readMore();
+}
+
+void CAlphabet::SymbolStream::readMore() {
+ //len is first unfilled byte
+ in.read(&buf[len], 1024-len);
+ if (in.good()) {
+ DASHER_ASSERT(in.gcount() == 1024-len);
+ len = 1024;
+ } else {
+ len+=in.gcount();
+ DASHER_ASSERT(len<1024);
+ //next attempt to read more will fail.
+ }
+}
+
+symbol CAlphabet::SymbolStream::next()
{
- char skip, *utfchar = new char[m_utf8_count_array.max_length + 1];
- symbol sym;
- int len, ch = in.peek();
- while (!in.eof())
- {
- len = m_utf8_count_array[ch];
- if (len == 0)
- {
+ if (pos + m_utf8_count_array.max_length > len && len==1024) {
+ //may need more bytes for next char; and input not yet exhausted.
+
+ if (pos) {
+ //shift remaining bytes to beginning
+ len-=pos; //len of them
+ memcpy(buf, &buf[pos], len);
+ pos=0;
+ }
+ readMore();
+ }
+ //if still don't have any chars after attempting to read more...EOF!
+ if (pos==len) return -1;
+ int numChars;
+ for (;;) {
+ numChars = m_utf8_count_array[buf[pos]];
+ if (numChars != 0) break;
#ifdef DEBUG
- std::cerr << "Read invalid UTF-8 character 0x" << hex << ch
- << dec << std::endl;
+ std::cerr << "Read invalid UTF-8 character 0x" << hex << buf[pos] << dec << std::endl;
#endif
- in >> skip;
- }
- else
- {
- if (len == 1)
- {
- in.ignore(1);
- sym = TextMap.GetSingleChar(ch);
- }
- else
- {
- in.read(utfchar, len);
- utfchar[len] = '\0';
- sym = TextMap.Get(string(utfchar));
- }
- symbols.push_back(sym);
- }
- ch = in.peek();
- }
- delete [] utfchar;
+ }
+ if (numChars == 1)
+ return map.GetSingleChar(buf[pos++]);
+ if (pos+numChars > len) {
+ //no more bytes in file (would have tried to read earlier), but not enough for char
+#ifdef DEBUG
+ std::cerr << "Incomplete UTF-8 character beginning 0x" << hex << buf[pos] << dec;
+ std::cerr << "(expecting " << numChars << " bytes but only " << (len-pos) << ")" << std::endl;
+#endif
+ pos=len;
+ return -1;
+ }
+ int sym=map.Get(string(&buf[pos], numChars));
+ pos+=numChars;
+ return sym;
}
void CAlphabet::GetSymbols(std::vector<symbol>& Symbols, const std::string& Input) const
{
std::istringstream in(Input);
- GetSymbols(Symbols, in);
+ SymbolStream syms(this, in);
+ for (symbol sym; (sym=syms.next())!=-1;)
+ Symbols.push_back(sym);
}
// add single char to the character set
diff --git a/Src/DasherCore/Alphabet/Alphabet.h b/Src/DasherCore/Alphabet/Alphabet.h
index da5fe49..3bc0fe0 100644
--- a/Src/DasherCore/Alphabet/Alphabet.h
+++ b/Src/DasherCore/Alphabet/Alphabet.h
@@ -105,12 +105,25 @@ namespace Dasher {
//int get_group(symbol i) const {return m_Group[i];}
// return group membership of i'th symbol
+
+ class SymbolStream {
+ public:
+ SymbolStream(const CAlphabet *pAlph, std::istream &_in);
+ symbol next();
+ private:
+ void readMore();
+ const alphabet_map &map;
+ char buf[1024];
+ int pos, len;
+ std::istream &in;
+ };
+
// Fills Symbols with the symbols corresponding to Input. {{{ Note that this
// is not necessarily reversible by repeated use of GetText. Some text
// may not be recognised and so discarded. }}}
void GetSymbols(std::vector<symbol> &Symbols, const std::string &Input) const;
- void GetSymbols(std::vector<symbol> &symbols, std::istream &in) const;
+ //SymbolStream *GetSymbols(std::istream &in) const;
void Trace() const; // diagnostic
diff --git a/Src/DasherCore/Trainer.cpp b/Src/DasherCore/Trainer.cpp
index ad26e90..85123fd 100644
--- a/Src/DasherCore/Trainer.cpp
+++ b/Src/DasherCore/Trainer.cpp
@@ -21,11 +21,11 @@ CTrainer::CTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet)
: CTrainingHelper(pAlphabet), m_pLanguageModel(pLanguageModel) {
}
-void CTrainer::Train(const std::vector<symbol> &vSymbols) {
+void CTrainer::Train(CAlphabet::SymbolStream &syms) {
CLanguageModel::Context sContext = m_pLanguageModel->CreateEmptyContext();
- for(std::vector<symbol>::const_iterator it(vSymbols.begin()); it != vSymbols.end(); ++it) {
- m_pLanguageModel->LearnSymbol(sContext, *it);
+ for(symbol sym; (sym=syms.next())!=-1;) {
+ m_pLanguageModel->LearnSymbol(sContext, sym);
}
m_pLanguageModel->ReleaseContext(sContext);
}
diff --git a/Src/DasherCore/Trainer.h b/Src/DasherCore/Trainer.h
index d8622c9..4ae11cf 100644
--- a/Src/DasherCore/Trainer.h
+++ b/Src/DasherCore/Trainer.h
@@ -5,7 +5,6 @@
#include "TrainingHelper.h"
namespace Dasher {
- class CAlphabet;
class CAlphIO;
class CDasherInterfaceBase;
@@ -14,8 +13,8 @@ namespace Dasher {
CTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet);
protected:
- virtual void Train(const std::vector<symbol> &vSymbols);
- CLanguageModel *m_pLanguageModel;
+ virtual void Train(CAlphabet::SymbolStream &syms);
+ CLanguageModel *m_pLanguageModel;
};
class CMandarinTrainer : public CTrainer {
diff --git a/Src/DasherCore/TrainingHelper.cpp b/Src/DasherCore/TrainingHelper.cpp
index cf26ffa..c8f9936 100644
--- a/Src/DasherCore/TrainingHelper.cpp
+++ b/Src/DasherCore/TrainingHelper.cpp
@@ -27,6 +27,7 @@
#include <ios>
#include <iostream>
#include <vector>
+#include <sstream>
//using namespace Dasher;
@@ -72,10 +73,8 @@ Dasher::CTrainingHelper::LoadPlain(const std::string &strFileName) {
return;
}
- std::vector<Dasher::symbol> vSymbols;
- vSymbols.clear();
- m_pAlphabet->GetSymbols(vSymbols, in);
- Train(vSymbols);
+ CAlphabet::SymbolStream syms(m_pAlphabet, in);
+ Train(syms);
in.close();
}
@@ -128,9 +127,9 @@ Dasher::CTrainingHelper::HandleStartElement(const XML_Char *szName,
void
Dasher::CTrainingHelper::HandleEndElement(const XML_Char *szName) {
if(!strcmp(szName, "segment")) {
- std::vector<Dasher::symbol> vSymbols;
- m_pAlphabet->GetSymbols(vSymbols, m_strCurrentText);
- Train(vSymbols);
+ std::istringstream in(m_strCurrentText);
+ CAlphabet::SymbolStream syms(m_pAlphabet,in);
+ Train(syms);
m_bInSegment = false;
}
diff --git a/Src/DasherCore/TrainingHelper.h b/Src/DasherCore/TrainingHelper.h
index 750cb6f..02d0b41 100644
--- a/Src/DasherCore/TrainingHelper.h
+++ b/Src/DasherCore/TrainingHelper.h
@@ -44,7 +44,7 @@ namespace Dasher {
protected:
const Dasher::CAlphabet *m_pAlphabet;
- virtual void Train(const std::vector<symbol> &symbols)=0;
+ virtual void Train(CAlphabet::SymbolStream &syms)=0;
private:
void LoadPlain(const std::string &strFileName);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]