[dasher: 8/16] Store PPMnode's children in array/hash
- From: Patrick Welche <pwelche src gnome org>
- To: svn-commits-list gnome org
- Cc:
- Subject: [dasher: 8/16] Store PPMnode's children in array/hash
- Date: Tue, 1 Dec 2009 16:14:48 +0000 (UTC)
commit bbce9fa27e57104d63101ca003fb0c65a1e34b36
Author: Alan Lawrence <acl33 inf phy cam ac uk>
Date: Wed Nov 4 18:10:59 2009 +0000
Store PPMnode's children in array/hash
Single child stored as direct pointer; then brute-force through unordered array;
then closed hash-table; then direct indexing. (Chosen according to #children).
.../LanguageModelling/PPMLanguageModel.cpp | 91 ++++++++++++++++++--
.../LanguageModelling/PPMLanguageModel.h | 69 ++++++++++-----
2 files changed, 131 insertions(+), 29 deletions(-)
---
diff --git a/Src/DasherCore/LanguageModelling/PPMLanguageModel.cpp b/Src/DasherCore/LanguageModelling/PPMLanguageModel.cpp
index 0f5b1c8..39b47ec 100644
--- a/Src/DasherCore/LanguageModelling/PPMLanguageModel.cpp
+++ b/Src/DasherCore/LanguageModelling/PPMLanguageModel.cpp
@@ -10,6 +10,7 @@
#include "PPMLanguageModel.h"
#include <math.h>
+#include <string.h>
#include <stack>
#include <sstream>
#include <iostream>
@@ -368,21 +369,95 @@ bool CPPMLanguageModel::CPPMnode::eq(CPPMLanguageModel::CPPMnode *other, std::ma
return true;
}
+#define MAX_RUN 4
+
CPPMLanguageModel::CPPMnode * CPPMLanguageModel::CPPMnode::find_symbol(symbol sym) const
// see if symbol is a child of node
{
+ if (m_iNumChildSlots < 0) //negative to mean "full alphabet", use direct indexing
+ return m_ppChildren[sym];
+ if (m_iNumChildSlots == 1) {
+ if (m_pChild->sym == sym)
+ return m_pChild;
+ return 0;
+ }
+ if (m_iNumChildSlots <= MAX_RUN) {
+ for (int i = 0; i < m_iNumChildSlots && m_ppChildren[i]; i++)
+ if (m_ppChildren[i]->sym == sym) return m_ppChildren[i];
+ return 0;
+ }
// printf("finding symbol %d at node %d\n",sym,node->id);
- for (ChildIterator search = children(); search != end(); search++) {
- if((*search)->sym == sym) {
- return *search;
+
+ for (int i = sym; ; i++) { //search through elements which have overflowed into subsequent slots
+ CPPMnode *found = this->m_ppChildren[i % m_iNumChildSlots]; //wrap round
+ if (!found) return 0; //null element
+ if(found->sym == sym) {
+ return found;
}
}
return 0;
}
-void CPPMLanguageModel::CPPMnode::AddChild(CPPMnode *pNewChild) {
- pNewChild->next = this->child;
- this->child = pNewChild;
+void CPPMLanguageModel::CPPMnode::AddChild(CPPMnode *pNewChild, int numSymbols) {
+ if (m_iNumChildSlots < 0) {
+ m_ppChildren[pNewChild->sym] = pNewChild;
+ }
+ else
+ {
+ if (m_iNumChildSlots == 0) {
+ m_iNumChildSlots = 1;
+ m_pChild = pNewChild;
+ return;
+ } else if (m_iNumChildSlots == 1) {
+ //no room, have to resize...
+ } else if (m_iNumChildSlots<=MAX_RUN) {
+ for (int i = 0; i < m_iNumChildSlots; i++)
+ if (!m_ppChildren[i]) {
+ m_ppChildren[i] = pNewChild;
+ return;
+ }
+ } else {
+
+
+ int start = pNewChild->sym;
+ //find length of run (including to-be-inserted element)....
+ while (m_ppChildren[start = (start + m_iNumChildSlots - 1) % m_iNumChildSlots]);
+
+ int idx = pNewChild->sym;
+ while (m_ppChildren[idx %= m_iNumChildSlots]) ++idx;
+ //found NULL
+ int stop = idx;
+ while (m_ppChildren[stop = (stop + 1) % m_iNumChildSlots]);
+ //start and idx point to NULLs (with inserted element somewhere inbetween)
+
+ int runLen = (m_iNumChildSlots + stop - (start+1)) % m_iNumChildSlots;
+ if (runLen <= MAX_RUN) {
+ //ok, maintain size
+ m_ppChildren[idx] = pNewChild;
+ return;
+ }
+ }
+ //resize!
+ CPPMnode **oldChildren = m_ppChildren;
+ int oldSlots = m_iNumChildSlots;
+ int newNumElems;
+ if (m_iNumChildSlots >= numSymbols/4) {
+ m_iNumChildSlots = -numSymbols; // negative = "use direct indexing"
+ newNumElems = numSymbols;
+ } else {
+ m_iNumChildSlots+=m_iNumChildSlots+1;
+ newNumElems = m_iNumChildSlots;
+ }
+ m_ppChildren = new CPPMnode *[newNumElems]; //null terminator
+ memset (m_ppChildren, 0, sizeof(CPPMnode *)*newNumElems);
+ if (oldSlots == 1)
+ AddChild((CPPMnode *)oldChildren, numSymbols);
+ else {
+ while (oldSlots-- > 0) if (oldChildren[oldSlots]) AddChild(oldChildren[oldSlots], numSymbols);
+ delete oldChildren;
+ }
+ AddChild(pNewChild, numSymbols);
+ }
}
CPPMLanguageModel::CPPMnode * CPPMLanguageModel::AddSymbolToNode(CPPMnode *pNode, symbol sym, int *update) {
@@ -405,7 +480,7 @@ CPPMLanguageModel::CPPMnode * CPPMLanguageModel::AddSymbolToNode(CPPMnode *pNode
pReturn = m_NodeAlloc.Alloc(); // count is initialized to 1
pReturn->sym = sym;
- pNode->AddChild(pReturn);
+ pNode->AddChild(pReturn,GetSize());
++NodesAllocated;
@@ -509,7 +584,7 @@ bool CPPMLanguageModel::ReadFromFile(std::string strFilename) {
std::map<int,CPPMnode *>::iterator it(parentMap.find(sBR.m_iIndex));
if (it != parentMap.end()) {
CPPMnode *parent = it->second;
- parent->AddChild(pCurrent);
+ parent->AddChild(pCurrent,GetSize());
//erase the record of parent hood, now we've realized it
parentMap.erase(it);
//add mapping for the _next_ sibling; since siblings will be read in the order
diff --git a/Src/DasherCore/LanguageModelling/PPMLanguageModel.h b/Src/DasherCore/LanguageModelling/PPMLanguageModel.h
index 6445900..4edac69 100644
--- a/Src/DasherCore/LanguageModelling/PPMLanguageModel.h
+++ b/Src/DasherCore/LanguageModelling/PPMLanguageModel.h
@@ -14,6 +14,7 @@
#include "LanguageModel.h"
+#include "stdlib.h"
#include <vector>
#include <fstream>
#include <set>
@@ -31,33 +32,48 @@ namespace Dasher {
class ChildIterator;
class CPPMnode {
private:
- CPPMnode *child;
- CPPMnode *next;
- friend class ChildIterator;
+ union {
+ CPPMnode **m_ppChildren;
+ CPPMnode *m_pChild;
+ };
+ ///Elements in above array, including nulls, as follows:
+ /// (a) negative -> absolute value is number of elems in m_ppChildren, but use direct indexing
+ /// (b) 1 -> use m_pChild as direct pointer to CPPMnode (no array)
+ /// (c) 2-MAX_RUN -> m_ppChildren is unordered array of that many elems
+ /// (d) >MAX_RUN -> m_ppChildren is an inline hash (overflow to next elem) with that many slots
+ int m_iNumChildSlots;
+ friend class CPPMLanguageModel;
public:
ChildIterator children() const;
- ChildIterator end() const;
- void AddChild(CPPMnode *pNewChild);
+ const ChildIterator end() const;
+ void AddChild(CPPMnode *pNewChild, int numSymbols);
CPPMnode * find_symbol(symbol sym)const;
CPPMnode *vine;
unsigned short int count;
symbol sym;
CPPMnode(symbol sym);
CPPMnode();
+ ~CPPMnode();
bool eq(CPPMnode *other, std::map<CPPMnode *,CPPMnode *> &equivs);
};
class ChildIterator {
+ private:
+ void nxt() {
+ if (m_ppChild == m_ppStop) return;
+ while ((--m_ppChild) != m_ppStop)
+ if (*m_ppChild) break;
+ }
public:
- bool operator==(const ChildIterator &other) const {return this->node == other.node;}
- bool operator!=(const ChildIterator &other) const {return this->node != other.node;}
- CPPMnode *operator*() {return node;};
- ChildIterator &operator++() {node=node->next; return *this;} //prefix
- ChildIterator operator++(int) {ChildIterator temp(*this); node=node->next; return temp;}
+ bool operator==(const ChildIterator &other) const {return m_ppChild==other.m_ppChild && m_ppStop == other.m_ppStop;}
+ bool operator!=(const ChildIterator &other) const {return m_ppChild!=other.m_ppChild || m_ppStop!=other.m_ppStop;}
+ CPPMnode *operator*() const {return (m_ppChild == m_ppStop) ? NULL : *m_ppChild;}
+ ChildIterator &operator++() {nxt(); return *this;} //prefix
+ ChildIterator operator++(int) {ChildIterator temp(*this); nxt(); return temp;}
//operator CPPMnode *() {return node;} //implicit conversion
//operator bool(); //implicit conversion 2
- ChildIterator(CPPMnode *_node);
+ ChildIterator(CPPMnode *const *ppChild, CPPMnode *const *ppStop) : m_ppChild(ppChild), m_ppStop(ppStop) {nxt();}
private:
- CPPMnode *node;
+ CPPMnode *const *m_ppChild, *const *m_ppStop;
};
class CPPMContext {
@@ -119,26 +135,37 @@ namespace Dasher {
};
/// @}
- inline CPPMLanguageModel::ChildIterator::ChildIterator(CPPMnode *_node) : node(_node) {}
-
inline CPPMLanguageModel::ChildIterator CPPMLanguageModel::CPPMnode::children() const {
- return ChildIterator(this->child);
+ //if m_iNumChildSlots = 0 / 1, m_ppChildren is direct pointer, else ptr to array (of pointers)
+ CPPMnode *const *ppChild = (m_iNumChildSlots == 0 || m_iNumChildSlots == 1) ? &m_pChild : m_ppChildren;
+ return ChildIterator(ppChild + abs(m_iNumChildSlots), ppChild - 1);
}
- inline CPPMLanguageModel::ChildIterator CPPMLanguageModel::CPPMnode::end() const {
- static ChildIterator c(NULL);
- return c;
+ inline const CPPMLanguageModel::ChildIterator CPPMLanguageModel::CPPMnode::end() const {
+ //if m_iNumChildSlots = 0 / 1, m_ppChildren is direct pointer, else ptr to array (of pointers)
+ CPPMnode *const *ppChild = (m_iNumChildSlots == 0 || m_iNumChildSlots == 1) ? &m_pChild : m_ppChildren;
+ return ChildIterator(ppChild, ppChild - 1);
}
- inline Dasher::CPPMLanguageModel::CPPMnode::CPPMnode(symbol _sym):sym(_sym) {
- child = next = vine = 0;
+ inline Dasher::CPPMLanguageModel::CPPMnode::CPPMnode(symbol _sym): sym(_sym) {
+ vine = 0;
+ m_iNumChildSlots = 0;
+ m_ppChildren = NULL;
count = 1;
}
inline CPPMLanguageModel::CPPMnode::CPPMnode() {
- child = next = vine = 0;
+ vine = 0;
+ m_iNumChildSlots = 0;
+ m_ppChildren = NULL;
count = 1;
}
+
+ inline CPPMLanguageModel::CPPMnode::~CPPMnode() {
+ //single child = is direct pointer to node, not array...
+ if (m_iNumChildSlots != 1)
+ delete m_ppChildren;
+ }
inline CLanguageModel::Context CPPMLanguageModel::CreateEmptyContext() {
CPPMContext *pCont = m_ContextAlloc.Alloc();
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]