summaryrefslogtreecommitdiff
path: root/library/qdawg.cpp
Side-by-side diff
Diffstat (limited to 'library/qdawg.cpp') (more/less context) (show whitespace changes)
-rw-r--r--library/qdawg.cpp121
1 files changed, 119 insertions, 2 deletions
diff --git a/library/qdawg.cpp b/library/qdawg.cpp
index 3615693..af5dc82 100644
--- a/library/qdawg.cpp
+++ b/library/qdawg.cpp
@@ -1,510 +1,627 @@
/**********************************************************************
-** Copyright (C) 2000 Trolltech AS. All rights reserved.
+** Copyright (C) 2000-2002 Trolltech AS. All rights reserved.
**
-** This file is part of Qtopia Environment.
+** This file is part of the Qtopia Environment.
**
** This file may be distributed and/or modified under the terms of the
** GNU General Public License version 2 as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL included in the
** packaging of this file.
**
** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
**
** See http://www.trolltech.com/gpl/ for GPL licensing information.
**
** Contact info@trolltech.com if any conditions of this licensing are
** not clear to you.
**
**********************************************************************/
#include "qdawg.h"
#include <qintdict.h>
#include <qvaluelist.h>
#include <qtextstream.h>
#include <qfile.h>
#include <qtl.h>
#include <limits.h>
#include <stdio.h>
// for mmap
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
class QDawgPrivate;
class QTrie;
typedef QValueList<QTrie*> TrieClub;
typedef QIntDict<TrieClub> TrieClubDirectory;
class TriePtr {
public:
QChar letter;
QTrie* p;
int operator <(const TriePtr& o) const;
int operator >(const TriePtr& o) const;
int operator <=(const TriePtr& o) const;
};
class TrieList : public QValueList<TriePtr> {
bool sorted;
public:
TrieList()
{
sorted=TRUE;
}
QTrie* findAdd(QChar c);
bool equal(TrieList& l);
void sort()
{
if ( !sorted ) {
qHeapSort(*this);
sorted = TRUE;
}
}
};
// A fast but memory-wasting temporary class. The Dawg is the goal.
class QTrie {
public:
QTrie();
~QTrie();
void insertWord(const QString& s, uint index=0);
bool equal(QTrie* o);
void dump(int indent=0);
private:
TrieList children;
bool isword;
friend class QDawgPrivate;
int maxdepth;
int decendants;
int key;
void distributeKeys(TrieClubDirectory& directory);
QTrie* clubLeader(TrieClubDirectory& directory);
int collectKeys();
friend class TriePtr;
friend class TrieList;
};
QTrie::QTrie()
{
key = 0;
isword = FALSE;
}
QTrie::~QTrie()
{
// NOTE: we do not delete the children - after conversion to DAWG
// it's too difficult. The QTrie's are deleted via the directory.
}
void QTrie::insertWord(const QString& s, uint index)
{
if ( index == s.length() ) {
isword = TRUE;
} else {
QTrie* t = children.findAdd(s[index]);
t->insertWord(s,index+1);
}
}
bool QTrie::equal(QTrie* o)
{
if ( o == this ) return TRUE;
if ( isword != o->isword )
return FALSE;
return children.equal(o->children);
}
void QTrie::dump(int indent)
{
for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) {
QTrie* s = (*it).p;
for (int in=0; in<indent; in++)
fputc(' ',stderr);
fprintf(stderr," %c %d %s %p\n",(*it).letter.unicode(),
s->key,s->isword?"word":"",s);
s->dump(indent+2);
}
}
void QTrie::distributeKeys(TrieClubDirectory& directory)
{
maxdepth = INT_MIN;
decendants = children.count();
key = 0;
for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) {
QTrie* s = (*it).p;
QChar l = (*it).letter;
s->distributeKeys(directory);
key = key*64+l.unicode()+s->key*5;
decendants += s->decendants;
if ( s->maxdepth+1 > maxdepth )
maxdepth = s->maxdepth+1;
}
if ( decendants ) {
key += decendants + maxdepth*256 + children.count() * 65536;
if ( !key ) key++; // unlikely
}
TrieClub* c = directory[key];
if ( !c ) directory.insert(key, (c = new TrieClub) );
c->prepend(this);
}
QTrie* QTrie::clubLeader(TrieClubDirectory& directory)
{
if ( !key ) return directory[0]->first();
for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) {
QTrie* t= (*it).p->clubLeader(directory);
(*it).p = t;
}
TrieClub *club = directory[key];
for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) {
QTrie* o = *it;
if ( o->equal(this) )
return o;
}
return this;
}
int QTrie::collectKeys()
{
int n=0;
if ( key ) key=0,n+=children.count();
for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it)
n += (*it).p->collectKeys();
return n;
}
int TriePtr::operator <(const TriePtr& o) const
{ return letter < o.letter; }
int TriePtr::operator >(const TriePtr& o) const
{ return letter > o.letter; }
int TriePtr::operator <=(const TriePtr& o) const
{ return letter <= o.letter; }
bool TrieList::equal(TrieList& l)
{
if ( count() != l.count() )
return FALSE;
sort(); l.sort();
ConstIterator it2 = begin();
ConstIterator it = l.begin();
for( ; it != l.end(); ++it, ++it2 )
if ( (*it).letter != (*it2).letter || ! (*it).p->equal((*it2).p) )
return FALSE;
return TRUE;
}
QTrie* TrieList::findAdd(QChar c)
{
for (Iterator it=begin(); it!=end(); ++it) {
if ( (*it).letter == c )
return (*it).p;
}
TriePtr p;
p.p = new QTrie;
p.letter = c;
prepend(p);
sorted=FALSE;
sort();
return p.p;
}
static const char* dawg_sig = "QDAWG100";
class QDawgPrivate {
public:
QDawgPrivate(QIODevice* dev)
{
QDataStream ds(dev);
char sig[8];
ds.readRawBytes(sig,8);
if ( !strncmp(dawg_sig,sig,8) ) {
uint n;
char* nn;
ds.readBytes(nn,n);
// #### endianness problem ignored.
node = (QDawg::Node*)nn;
nodes = n / sizeof(QDawg::Node);
} else {
node = 0;
}
}
bool ok() const { return node; }
QDawgPrivate(uchar* mem)
{
if ( !strncmp(dawg_sig,(char*)mem,8) ) {
mem += 8;
int n = ((mem[0]*256+mem[1])*256+mem[2])*256+mem[3];
mem += 4;
// #### endianness problem ignored.
node = (QDawg::Node*)((char*)mem);
nodes = n / sizeof(QDawg::Node);
}
}
QDawgPrivate(QTrie* t) // destroys the QTrie.
{
TrieClubDirectory directory(9973);
t->distributeKeys(directory);
QTrie* l = t->clubLeader(directory);
ASSERT(l==t);
generateArray(t);
TrieClub *club;
for (QIntDictIterator<TrieClub> dit(directory); (club=dit); ++dit)
{
for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) {
delete *it;
}
delete club;
}
}
bool write(QIODevice* dev)
{
QDataStream ds(dev);
ds.writeRawBytes(dawg_sig,8);
// #### endianness problem ignored.
ds.writeBytes((char*)node,sizeof(QDawg::Node)*nodes);
return dev->state() == IO_Ok;
}
void dumpWords(int nid=0, int index=0)
{
static char word[256]; // ick latin1
int i=0;
do {
QDawg::Node& n = node[nid+i];
word[index] = n.let;
if ( n.isword )
fprintf(stderr,"%.*s\n",index+1,word);
if ( n.offset ) dumpWords(n.offset+nid+i,index+1);
} while (!node[nid+i++].islast);
}
void dump(int nid=0, int indent=0)
{
int i=0;
do {
QDawg::Node& n = node[nid+i];
fprintf(stderr,"%d: ",nid+i);
for (int in=0; in<indent; in++)
fputc(' ',stderr);
fprintf(stderr," %c %d %d %d\n",n.let,
n.isword,n.islast,n.offset);
if ( n.offset ) dump(n.offset+nid+i,indent+2);
} while (!node[nid+i++].islast);
}
int countWords(int nid=0)
{
int t=0;
int i=0;
do {
QDawg::Node& n = node[nid+i];
if ( n.isword )
t++;
if ( n.offset )
t+=countWords(n.offset+nid+i);
} while (!node[nid+i++].islast);
return t;
}
bool contains(const QString& s, int nid=0, int index=0) const
{
int i=0;
do {
QDawg::Node& n = node[nid+i];
if ( s[index] == QChar((ushort)n.let) ) {
if ( n.isword && index == (int)s.length()-1 )
return TRUE;
if ( n.offset )
return contains(s,n.offset+nid+i,index+1);
}
} while (!node[nid+i++].islast);
return FALSE;
}
void appendAllWords(QStringList& list, int nid=0, QString s="") const
{
int i=0;
int next = s.length();
do {
QDawg::Node& n = node[nid+i];
s[next] = QChar((ushort)n.let);
if ( n.isword )
list.append(s);
if ( n.offset )
appendAllWords(list, n.offset+nid+i, s);
} while (!node[nid+i++].islast);
}
const QDawg::Node* root() { return node; }
private:
void generateArray(QTrie* t)
{
nodes = 0;
int n = t->collectKeys();
node = new QDawg::Node[n];
appendToArray(t);
ASSERT(n == nodes);
}
int appendToArray(QTrie* t)
{
if ( !t->key ) {
if ( !t->children.count() )
return 0;
t->key = nodes;
nodes += t->children.count();
QDawg::Node* n = &node[t->key-1];
int here = t->key;
for (TrieList::Iterator it=t->children.begin(); it!=t->children.end(); ++it) {
QTrie* s = (*it).p;
++n;
n->let = (*it).letter.unicode();
n->isword = s->isword;
n->islast = 0;
n->offset = appendToArray(s);
if ( n->offset ) {
int t = n->offset-here;
n->offset=t;
if ( n->offset != t )
qWarning("Overflow: too many words");
}
here++;
}
n->islast = 1;
}
return t->key;
}
private:
int nodes;
QDawg::Node *node;
};
+/*!
+ \class QDawg qdawg.h
+ \brief The QDawg class provides an implementation of a Directed Acyclic Word Graph.
+ A DAWG provides very fast look-up of words in a word list.
+
+ The word list is created using readFile(), read() or
+ createFromWords(). A list of all the DAWG's words is returned by
+ allWords(), and the total number of words is returned by
+ countWords(). Use contains() to see if a particular word is in the
+ DAWG. The root \link qdawg-node.html node\endlink is returned by root().
+
+ A global DAWG is maintained for the current locale. See the
+ \l Global class for details.
+
+ The structure of a DAWG is a graph of \link qdawg-node.html
+ Nodes\endlink. There are no cycles in the graph (since there are no
+ inifinitely repeating words). Each \link qdawg-node.html
+ Node\endlink is a member of a list of \link qdawg-node.html
+ Nodes\endlink called a child list. Each \link qdawg-node.html
+ Node\endlink in the child list has a \e letter, an \e isWord flag,
+ at most one \e jump arc, and at most one arc to the next child in
+ the list.
+
+ If you traverse the \link qdawg-node.html Nodes\endlink in a DAWG,
+ starting from the root(), and you concatenate all the letters from
+ the single child in each child list that you visit, at every \link
+ qdawg-node.html Node\endlink which has the isWord flag set your
+ concatenation will be a word in the list represented by the DAWG.
+
+ For example, the DAWG below represents the word list:
+ ban, band, can, cane, cans, pan, pane, pans.
+
+ This structuring not only provides O(1) lookup of words in the word list,
+ but also produces a smaller storage file than a plain text file word list.
+
+ \img qdawg.png
+*/
+
+/*!
+ Constructs a new empty DAWG.
+*/
QDawg::QDawg()
{
d = 0;
}
+/*!
+ Deletes the DAWG.
+*/
QDawg::~QDawg()
{
delete d;
}
+/*!
+ \overload
+ Replaces all the DAWG's words with words read from \a dev.
+*/
bool QDawg::createFromWords(QIODevice* dev)
{
delete d;
QTextStream i(dev);
QTrie* trie = new QTrie;
int n=0;
while (!i.atEnd()) {
trie->insertWord(QString::fromUtf8(i.readLine()));
n++;
}
if ( n )
d = new QDawgPrivate(trie);
else
d = 0;
return TRUE;
}
+/*!
+ Replaces all the DAWG's words with the words in the \a list.
+*/
void QDawg::createFromWords(const QStringList& list)
{
delete d;
if ( list.count() ) {
QTrie* trie = new QTrie;
for (QStringList::ConstIterator it = list.begin(); it != list.end(); ++it) {
trie->insertWord(*it);
}
d = new QDawgPrivate(trie);
} else {
d = 0;
}
}
+/*!
+ Returns a list of all the words in the DAWG.
+*/
QStringList QDawg::allWords() const
{
QStringList result;
if ( d ) d->appendAllWords(result);
return result;
}
+/*!
+ Replaces the DAWG with the DAWG in \a filename.
+ The file is memory-mapped.
+
+ \sa write()
+*/
bool QDawg::readFile(const QString& filename)
{
delete d;
d = 0;
int f = ::open( QFile::encodeName(filename), O_RDONLY );
if ( f < 0 )
return FALSE;
struct stat st;
if ( !fstat( f, &st ) ) {
char * tmp = (char*)mmap( 0, st.st_size, // any address, whole file
PROT_READ, // read-only memory
MAP_FILE | MAP_PRIVATE, // swap-backed map from file
f, 0 ); // from offset 0 of f
if ( tmp && tmp != (char*)MAP_FAILED )
d = new QDawgPrivate((uchar*)tmp);
}
::close( f );
return d;
}
+/*!
+ Replaces the DAWG with the DAWG in \a dev.
+ The file is memory-mapped.
+
+ \sa write()
+*/
bool QDawg::read(QIODevice* dev)
{
delete d;
d = new QDawgPrivate(dev);
if ( d->ok() )
return TRUE;
delete d;
d = 0;
return FALSE;
}
+/*!
+ Writes the DAWG to \a dev, in a custom QDAWG format.
+*/
bool QDawg::write(QIODevice* dev) const
{
return d ? d->write(dev) : TRUE;
}
+/*!
+ Returns the number of words in the DAWG.
+*/
int QDawg::countWords() const
{
return d ? d->countWords() : 0;
}
+/*!
+ Returns the root \link qdawg-node.html Node\endlink of the DAWG.
+*/
const QDawg::Node* QDawg::root() const
{
return d ? d->root() : 0;
}
+/*!
+ Returns TRUE if the DAWG contains the word \a s; otherwise returns
+ FALSE.
+*/
bool QDawg::contains(const QString& s) const
{
return d ? d->contains(s) : FALSE;
}
+/*!
+ \internal
+
+ For debugging: prints out the DAWG contents.
+*/
void QDawg::dump() const
{
if ( d ) d->dump();
}
+/*!
+ \class QDawg::Node qdawg.h
+ \brief The QDawg::Node class represents one node of a QDawg.
+*/
+
+/*!
+ \fn QChar QDawg::Node::letter() const
+
+ Returns this Node's letter.
+*/
+/*!
+ \fn bool QDawg::Node::isWord() const
+
+ Returns TRUE if this Node is the end of a word; otherwise returns
+ FALSE.
+*/
+/*!
+ \fn bool QDawg::Node::isLast() const
+
+ Returns TRUE if this Node is the last in the child list; otherwise
+ returns FALSE.
+*/
+/*!
+ \fn const Node* QDawg::Node::next() const
+
+ Returns the next child Node in the child list or 0 if the current
+ Node isLast().
+*/
+/*!
+ \fn const Node* QDawg::Node::jump() const
+
+ Returns the node connected to this Node.
+*/