[dasher: 3/5] Removed Alphabet::GetSymbols(vector, stream) - use SymbolStream instead



commit a449ba6d0d9c958d9b56e87afd0926992182d9ec
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date:   Tue Dec 22 10:34:54 2009 +0000

    Removed Alphabet::GetSymbols(vector, stream) - use SymbolStream instead
    
    Converts one UTF-8 character at a time, which avoids creating a huge vector<symbol>

 Src/DasherCore/Alphabet/Alphabet.cpp |   87 +++++++++++++++++++++------------
 Src/DasherCore/Alphabet/Alphabet.h   |   15 +++++-
 Src/DasherCore/Trainer.cpp           |    6 +-
 Src/DasherCore/Trainer.h             |    5 +-
 Src/DasherCore/TrainingHelper.cpp    |   13 ++---
 Src/DasherCore/TrainingHelper.h      |    2 +-
 6 files changed, 81 insertions(+), 47 deletions(-)
---
diff --git a/Src/DasherCore/Alphabet/Alphabet.cpp b/Src/DasherCore/Alphabet/Alphabet.cpp
index 328a1ab..ef6db05 100644
--- a/Src/DasherCore/Alphabet/Alphabet.cpp
+++ b/Src/DasherCore/Alphabet/Alphabet.cpp
@@ -149,46 +149,69 @@ int CAlphabet::utf8_length::operator[](const int i) const
   return utf8_count_array[i];
 }
 
-void CAlphabet::GetSymbols(std::vector<symbol> &symbols, std::istream &in) const
+CAlphabet::SymbolStream::SymbolStream(const CAlphabet *pAlph, std::istream &_in)
+: map(pAlph->TextMap), in(_in), pos(0), len(0) {
+  readMore();
+}
+
+void CAlphabet::SymbolStream::readMore() {
+  //len is first unfilled byte
+  in.read(&buf[len], 1024-len);
+  if (in.good()) {
+    DASHER_ASSERT(in.gcount() == 1024-len);
+    len = 1024;
+  } else {
+    len+=in.gcount();
+    DASHER_ASSERT(len<1024);
+    //next attempt to read more will fail.
+  }
+}
+
+symbol CAlphabet::SymbolStream::next()
 {
-  char skip, *utfchar = new char[m_utf8_count_array.max_length + 1];
-  symbol sym;
-  int len, ch = in.peek();
-  while (!in.eof())
-    {
-      len = m_utf8_count_array[ch];
-      if (len == 0)
-        {
+  if (pos + m_utf8_count_array.max_length > len && len==1024) {
+    //may need more bytes for next char; and input not yet exhausted.
+
+    if (pos) {
+      //shift remaining bytes to beginning
+      len-=pos; //len of them
+      memcpy(buf, &buf[pos], len);
+      pos=0;
+    }
+    readMore();
+  }
+  //if still don't have any chars after attempting to read more...EOF!
+  if (pos==len) return -1;
+  int numChars;
+  for (;;) {
+    numChars = m_utf8_count_array[buf[pos]];
+    if (numChars != 0) break;
 #ifdef DEBUG
-          std::cerr << "Read invalid UTF-8 character 0x" << hex << ch
-                    << dec << std::endl;
+    std::cerr << "Read invalid UTF-8 character 0x" << hex << buf[pos] << dec << std::endl;
 #endif
-          in >> skip;
-        }
-      else
-        {
-          if (len == 1)
-            {
-              in.ignore(1);
-              sym = TextMap.GetSingleChar(ch);
-            }
-          else
-            {
-              in.read(utfchar, len);
-              utfchar[len] = '\0';
-              sym = TextMap.Get(string(utfchar));
-            }
-          symbols.push_back(sym);
-        }
-      ch = in.peek();
-    }
-  delete [] utfchar;
+  }
+  if (numChars == 1)
+    return map.GetSingleChar(buf[pos++]);
+  if (pos+numChars > len) {
+    //no more bytes in file (would have tried to read earlier), but not enough for char
+#ifdef DEBUG
+    std::cerr << "Incomplete UTF-8 character beginning 0x" << hex << buf[pos] << dec;
+    std::cerr << "(expecting " << numChars << " bytes but only " << (len-pos) << ")" << std::endl;
+#endif
+    pos=len;
+    return -1;
+  }
+  int sym=map.Get(string(&buf[pos], numChars));
+  pos+=numChars;
+  return sym;
 }
 
 void CAlphabet::GetSymbols(std::vector<symbol>& Symbols, const std::string& Input) const
 {
   std::istringstream in(Input);
-  GetSymbols(Symbols, in);
+  SymbolStream syms(this, in);
+  for (symbol sym; (sym=syms.next())!=-1;)
+    Symbols.push_back(sym);
 }
 
 // add single char to the character set
diff --git a/Src/DasherCore/Alphabet/Alphabet.h b/Src/DasherCore/Alphabet/Alphabet.h
index da5fe49..3bc0fe0 100644
--- a/Src/DasherCore/Alphabet/Alphabet.h
+++ b/Src/DasherCore/Alphabet/Alphabet.h
@@ -105,12 +105,25 @@ namespace Dasher {
 
     //int get_group(symbol i) const {return m_Group[i];}                
     // return group membership of i'th symbol
+    
+    class SymbolStream {
+    public:
+      SymbolStream(const CAlphabet *pAlph, std::istream &_in);
+      symbol next();
+    private:
+      void readMore();
+      const alphabet_map &map;
+      char buf[1024];
+      int pos, len;
+      std::istream &in;
+    };
+    
     // Fills Symbols with the symbols corresponding to Input. {{{ Note that this
     // is not necessarily reversible by repeated use of GetText. Some text
     // may not be recognised and so discarded. }}}
 
     void GetSymbols(std::vector<symbol> &Symbols, const std::string &Input) const;
-    void GetSymbols(std::vector<symbol> &symbols, std::istream &in) const;
+    //SymbolStream *GetSymbols(std::istream &in) const;
 
     void Trace() const;         // diagnostic
 
diff --git a/Src/DasherCore/Trainer.cpp b/Src/DasherCore/Trainer.cpp
index ad26e90..85123fd 100644
--- a/Src/DasherCore/Trainer.cpp
+++ b/Src/DasherCore/Trainer.cpp
@@ -21,11 +21,11 @@ CTrainer::CTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet)
   : CTrainingHelper(pAlphabet), m_pLanguageModel(pLanguageModel) {
 }
 
-void CTrainer::Train(const std::vector<symbol> &vSymbols) {
+void CTrainer::Train(CAlphabet::SymbolStream &syms) {
   CLanguageModel::Context sContext = m_pLanguageModel->CreateEmptyContext();
 
-  for(std::vector<symbol>::const_iterator it(vSymbols.begin()); it != vSymbols.end(); ++it) {
-      m_pLanguageModel->LearnSymbol(sContext, *it);
+  for(symbol sym; (sym=syms.next())!=-1;) {
+      m_pLanguageModel->LearnSymbol(sContext, sym);
   }
   m_pLanguageModel->ReleaseContext(sContext);
 }
diff --git a/Src/DasherCore/Trainer.h b/Src/DasherCore/Trainer.h
index d8622c9..4ae11cf 100644
--- a/Src/DasherCore/Trainer.h
+++ b/Src/DasherCore/Trainer.h
@@ -5,7 +5,6 @@
 #include "TrainingHelper.h"
 
 namespace Dasher {
-  class CAlphabet;
   class CAlphIO;
   class CDasherInterfaceBase;
 	
@@ -14,8 +13,8 @@ namespace Dasher {
     CTrainer(CLanguageModel *pLanguageModel, CAlphabet *pAlphabet);
 
   protected:
-	virtual void Train(const std::vector<symbol> &vSymbols);
-	CLanguageModel *m_pLanguageModel;
+    virtual void Train(CAlphabet::SymbolStream &syms);
+    CLanguageModel *m_pLanguageModel;
   };
 	
   class CMandarinTrainer : public CTrainer {
diff --git a/Src/DasherCore/TrainingHelper.cpp b/Src/DasherCore/TrainingHelper.cpp
index cf26ffa..c8f9936 100644
--- a/Src/DasherCore/TrainingHelper.cpp
+++ b/Src/DasherCore/TrainingHelper.cpp
@@ -27,6 +27,7 @@
 #include <ios>
 #include <iostream>
 #include <vector>
+#include <sstream>
 
 //using namespace Dasher;
 
@@ -72,10 +73,8 @@ Dasher::CTrainingHelper::LoadPlain(const std::string &strFileName) {
       return;
     }
 
-  std::vector<Dasher::symbol> vSymbols;
-  vSymbols.clear();
-  m_pAlphabet->GetSymbols(vSymbols, in);
-  Train(vSymbols);
+  CAlphabet::SymbolStream syms(m_pAlphabet, in);
+  Train(syms);
 
   in.close();
 }
@@ -128,9 +127,9 @@ Dasher::CTrainingHelper::HandleStartElement(const XML_Char *szName,
 void 
 Dasher::CTrainingHelper::HandleEndElement(const XML_Char *szName) {
   if(!strcmp(szName, "segment")) {
-    std::vector<Dasher::symbol> vSymbols;
-    m_pAlphabet->GetSymbols(vSymbols, m_strCurrentText);
-    Train(vSymbols);
+    std::istringstream in(m_strCurrentText);
+    CAlphabet::SymbolStream syms(m_pAlphabet,in);
+    Train(syms);
     
     m_bInSegment = false;
   }
diff --git a/Src/DasherCore/TrainingHelper.h b/Src/DasherCore/TrainingHelper.h
index 750cb6f..02d0b41 100644
--- a/Src/DasherCore/TrainingHelper.h
+++ b/Src/DasherCore/TrainingHelper.h
@@ -44,7 +44,7 @@ namespace Dasher {
   protected:
     const Dasher::CAlphabet *m_pAlphabet;
 
-	virtual void Train(const std::vector<symbol> &symbols)=0;
+    virtual void Train(CAlphabet::SymbolStream &syms)=0;
 	  
   private:  
 	void LoadPlain(const std::string &strFileName);



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]