[dasher: 8/16] Store PPMnode's children in array/hash



commit bbce9fa27e57104d63101ca003fb0c65a1e34b36
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date:   Wed Nov 4 18:10:59 2009 +0000

    Store PPMnode's children in array/hash
    
    Single child stored as direct pointer; then brute-force through unordered array;
    then closed hash-table; then direct indexing. (Chosen according to #children).

 .../LanguageModelling/PPMLanguageModel.cpp         |   91 ++++++++++++++++++--
 .../LanguageModelling/PPMLanguageModel.h           |   69 ++++++++++-----
 2 files changed, 131 insertions(+), 29 deletions(-)
---
diff --git a/Src/DasherCore/LanguageModelling/PPMLanguageModel.cpp b/Src/DasherCore/LanguageModelling/PPMLanguageModel.cpp
index 0f5b1c8..39b47ec 100644
--- a/Src/DasherCore/LanguageModelling/PPMLanguageModel.cpp
+++ b/Src/DasherCore/LanguageModelling/PPMLanguageModel.cpp
@@ -10,6 +10,7 @@
 #include "PPMLanguageModel.h"
 
 #include <math.h>
+#include <string.h>
 #include <stack>
 #include <sstream>
 #include <iostream>
@@ -368,21 +369,95 @@ bool CPPMLanguageModel::CPPMnode::eq(CPPMLanguageModel::CPPMnode *other, std::ma
   return true;
 }
 
+#define MAX_RUN 4
+
 CPPMLanguageModel::CPPMnode * CPPMLanguageModel::CPPMnode::find_symbol(symbol sym) const
 // see if symbol is a child of node
 {
+  if (m_iNumChildSlots < 0) //negative to mean "full alphabet", use direct indexing
+    return m_ppChildren[sym];
+  if (m_iNumChildSlots == 1) {
+    if (m_pChild->sym == sym)
+      return m_pChild;
+    return 0;
+  }
+  if (m_iNumChildSlots <= MAX_RUN) {
+    for (int i = 0; i < m_iNumChildSlots && m_ppChildren[i]; i++)
+      if (m_ppChildren[i]->sym == sym) return m_ppChildren[i];
+    return 0;
+  }
   //  printf("finding symbol %d at node %d\n",sym,node->id);
-  for (ChildIterator search = children(); search != end(); search++) {
-    if((*search)->sym == sym) {
-      return *search;
+
+  for (int i = sym; ; i++) { //search through elements which have overflowed into subsequent slots
+    CPPMnode *found = this->m_ppChildren[i % m_iNumChildSlots]; //wrap round
+    if (!found) return 0; //null element
+    if(found->sym == sym) {
+      return found;
     }
   }
   return 0;
 }
 
-void CPPMLanguageModel::CPPMnode::AddChild(CPPMnode *pNewChild) {
-  pNewChild->next = this->child;
-  this->child = pNewChild;
+void CPPMLanguageModel::CPPMnode::AddChild(CPPMnode *pNewChild, int numSymbols) {
+  if (m_iNumChildSlots < 0) {
+    m_ppChildren[pNewChild->sym] = pNewChild;
+  }
+  else 
+  {
+    if (m_iNumChildSlots == 0) {
+      m_iNumChildSlots = 1;
+      m_pChild = pNewChild;
+      return;
+    } else if (m_iNumChildSlots == 1) {
+      //no room, have to resize...
+    } else if (m_iNumChildSlots<=MAX_RUN) {
+      for (int i = 0; i < m_iNumChildSlots; i++)
+        if (!m_ppChildren[i]) {
+          m_ppChildren[i] = pNewChild;
+          return;
+        }
+    } else {
+      
+
+      int start = pNewChild->sym;
+      //find length of run (including to-be-inserted element)....
+      while (m_ppChildren[start = (start + m_iNumChildSlots - 1) % m_iNumChildSlots]);
+
+      int idx = pNewChild->sym;
+      while (m_ppChildren[idx %= m_iNumChildSlots]) ++idx;
+      //found NULL
+      int stop = idx;
+      while (m_ppChildren[stop = (stop + 1) % m_iNumChildSlots]);
+      //start and idx point to NULLs (with inserted element somewhere inbetween)
+      
+      int runLen = (m_iNumChildSlots + stop - (start+1)) % m_iNumChildSlots;
+      if (runLen <= MAX_RUN) {
+        //ok, maintain size
+        m_ppChildren[idx] = pNewChild;
+        return;
+      }
+    }
+    //resize!
+    CPPMnode **oldChildren = m_ppChildren;
+    int oldSlots = m_iNumChildSlots;
+    int newNumElems;
+    if (m_iNumChildSlots >= numSymbols/4) {
+      m_iNumChildSlots = -numSymbols; // negative = "use direct indexing"
+      newNumElems = numSymbols;
+    } else {
+      m_iNumChildSlots+=m_iNumChildSlots+1;
+      newNumElems = m_iNumChildSlots;
+    }
+    m_ppChildren = new CPPMnode *[newNumElems]; //null terminator
+    memset (m_ppChildren, 0, sizeof(CPPMnode *)*newNumElems);
+    if (oldSlots == 1)
+      AddChild((CPPMnode *)oldChildren, numSymbols);
+    else {
+      while (oldSlots-- > 0) if (oldChildren[oldSlots]) AddChild(oldChildren[oldSlots], numSymbols);
+      delete oldChildren;
+    }
+    AddChild(pNewChild, numSymbols);
+  }
 }
 
 CPPMLanguageModel::CPPMnode * CPPMLanguageModel::AddSymbolToNode(CPPMnode *pNode, symbol sym, int *update) {
@@ -405,7 +480,7 @@ CPPMLanguageModel::CPPMnode * CPPMLanguageModel::AddSymbolToNode(CPPMnode *pNode
 
   pReturn = m_NodeAlloc.Alloc();        // count is initialized to 1
   pReturn->sym = sym;
-  pNode->AddChild(pReturn);
+  pNode->AddChild(pReturn,GetSize());
 
   ++NodesAllocated;
 
@@ -509,7 +584,7 @@ bool CPPMLanguageModel::ReadFromFile(std::string strFilename) {
     std::map<int,CPPMnode *>::iterator it(parentMap.find(sBR.m_iIndex));
     if (it != parentMap.end()) {
       CPPMnode *parent = it->second;
-      parent->AddChild(pCurrent);
+      parent->AddChild(pCurrent,GetSize());
       //erase the record of parent hood, now we've realized it
       parentMap.erase(it);
       //add mapping for the _next_ sibling; since siblings will be read in the order
diff --git a/Src/DasherCore/LanguageModelling/PPMLanguageModel.h b/Src/DasherCore/LanguageModelling/PPMLanguageModel.h
index 6445900..4edac69 100644
--- a/Src/DasherCore/LanguageModelling/PPMLanguageModel.h
+++ b/Src/DasherCore/LanguageModelling/PPMLanguageModel.h
@@ -14,6 +14,7 @@
 
 #include "LanguageModel.h"
 
+#include "stdlib.h"
 #include <vector>
 #include <fstream>
 #include <set>
@@ -31,33 +32,48 @@ namespace Dasher {
     class ChildIterator;
     class CPPMnode {
     private:
-      CPPMnode *child;
-      CPPMnode *next;
-      friend class ChildIterator;
+      union {
+        CPPMnode **m_ppChildren;
+        CPPMnode *m_pChild;
+      };
+      ///Elements in above array, including nulls, as follows:
+      /// (a) negative -> absolute value is number of elems in m_ppChildren, but use direct indexing
+      /// (b) 1 -> use m_pChild as direct pointer to CPPMnode (no array)
+      /// (c) 2-MAX_RUN -> m_ppChildren is unordered array of that many elems
+      /// (d) >MAX_RUN ->  m_ppChildren is an inline hash (overflow to next elem) with that many slots
+      int m_iNumChildSlots;
+      friend class CPPMLanguageModel;
 	  public:
       ChildIterator children() const;
-      ChildIterator end() const;
-      void AddChild(CPPMnode *pNewChild);
+      const ChildIterator end() const;
+      void AddChild(CPPMnode *pNewChild, int numSymbols);
       CPPMnode * find_symbol(symbol sym)const;
       CPPMnode *vine;
       unsigned short int count;
       symbol sym;
       CPPMnode(symbol sym);
       CPPMnode();
+      ~CPPMnode();
       bool eq(CPPMnode *other, std::map<CPPMnode *,CPPMnode *> &equivs);
 	  };
     class ChildIterator {
+    private:
+      void nxt() {
+        if (m_ppChild == m_ppStop) return;
+        while ((--m_ppChild) != m_ppStop)
+          if (*m_ppChild) break;
+      }
     public:
-      bool operator==(const ChildIterator &other) const {return this->node == other.node;}
-      bool operator!=(const ChildIterator &other) const {return this->node != other.node;}
-      CPPMnode *operator*() {return node;};
-      ChildIterator &operator++() {node=node->next; return *this;} //prefix
-      ChildIterator operator++(int) {ChildIterator temp(*this); node=node->next; return temp;}
+      bool operator==(const ChildIterator &other) const {return m_ppChild==other.m_ppChild && m_ppStop == other.m_ppStop;}
+      bool operator!=(const ChildIterator &other) const {return m_ppChild!=other.m_ppChild || m_ppStop!=other.m_ppStop;}
+      CPPMnode *operator*() const {return (m_ppChild == m_ppStop) ? NULL : *m_ppChild;}
+      ChildIterator &operator++() {nxt(); return *this;} //prefix
+      ChildIterator operator++(int) {ChildIterator temp(*this); nxt(); return temp;}
       //operator CPPMnode *() {return node;} //implicit conversion
       //operator bool();                     //implicit conversion 2
-      ChildIterator(CPPMnode *_node);
+      ChildIterator(CPPMnode *const *ppChild, CPPMnode *const *ppStop) : m_ppChild(ppChild), m_ppStop(ppStop) {nxt();}
     private:
-      CPPMnode *node;
+      CPPMnode *const *m_ppChild, *const *m_ppStop;
     };
 
     class CPPMContext {
@@ -119,26 +135,37 @@ namespace Dasher {
   };
 
   /// @}
-  inline CPPMLanguageModel::ChildIterator::ChildIterator(CPPMnode *_node) : node(_node) {}
-
   inline CPPMLanguageModel::ChildIterator CPPMLanguageModel::CPPMnode::children() const {
-    return ChildIterator(this->child);
+    //if m_iNumChildSlots = 0 / 1, m_ppChildren is direct pointer, else ptr to array (of pointers)
+    CPPMnode *const *ppChild = (m_iNumChildSlots == 0 || m_iNumChildSlots == 1) ? &m_pChild : m_ppChildren;
+    return ChildIterator(ppChild + abs(m_iNumChildSlots), ppChild - 1);
   }
   
-  inline CPPMLanguageModel::ChildIterator CPPMLanguageModel::CPPMnode::end() const {
-    static ChildIterator c(NULL);
-    return c;
+  inline const CPPMLanguageModel::ChildIterator CPPMLanguageModel::CPPMnode::end() const {
+    //if m_iNumChildSlots = 0 / 1, m_ppChildren is direct pointer, else ptr to array (of pointers)
+    CPPMnode *const *ppChild = (m_iNumChildSlots == 0 || m_iNumChildSlots == 1) ? &m_pChild : m_ppChildren;
+    return ChildIterator(ppChild, ppChild - 1);
   }
 
-  inline Dasher::CPPMLanguageModel::CPPMnode::CPPMnode(symbol _sym):sym(_sym) {
-    child = next = vine = 0;
+  inline Dasher::CPPMLanguageModel::CPPMnode::CPPMnode(symbol _sym): sym(_sym) {
+    vine = 0;
+    m_iNumChildSlots = 0;
+    m_ppChildren = NULL;
     count = 1;
   }
 
   inline CPPMLanguageModel::CPPMnode::CPPMnode() {
-    child = next = vine = 0;
+    vine = 0;
+    m_iNumChildSlots = 0;
+    m_ppChildren = NULL;
     count = 1;
   }
+  
+  inline CPPMLanguageModel::CPPMnode::~CPPMnode() {
+    //single child = is direct pointer to node, not array...
+    if (m_iNumChildSlots != 1)
+      delete m_ppChildren;
+  }
 
   inline CLanguageModel::Context CPPMLanguageModel::CreateEmptyContext() {
     CPPMContext *pCont = m_ContextAlloc.Alloc();



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]