/********************************************************************** ** Copyright (C) 2000-2002 Trolltech AS. All rights reserved. ** ** This file is part of the Qtopia Environment. ** ** This file may be distributed and/or modified under the terms of the ** GNU General Public License version 2 as published by the Free Software ** Foundation and appearing in the file LICENSE.GPL included in the ** packaging of this file. ** ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. ** ** See http://www.trolltech.com/gpl/ for GPL licensing information. ** ** Contact info@trolltech.com if any conditions of this licensing are ** not clear to you. ** **********************************************************************/ #include "qdawg.h" #include <qintdict.h> #include <qfile.h> #include <qtl.h> #include <limits.h> #include <stdio.h> // for mmap #include <sys/types.h> #include <sys/stat.h> #include <sys/mman.h> #include <fcntl.h> #include <errno.h> #include <unistd.h> class QDawgPrivate; class QTrie; typedef QValueList<QTrie*> TrieClub; typedef QIntDict<TrieClub> TrieClubDirectory; class TriePtr { public: QChar letter; QTrie* p; int operator <(const TriePtr& o) const; int operator >(const TriePtr& o) const; int operator <=(const TriePtr& o) const; }; class TrieList : public QValueList<TriePtr> { bool sorted; public: TrieList() { sorted=TRUE; } QTrie* findAdd(QChar c); bool equal(TrieList& l); void sort() { if ( !sorted ) { qHeapSort(*this); sorted = TRUE; } } }; // A fast but memory-wasting temporary class. The Dawg is the goal. class QTrie { public: QTrie(); ~QTrie(); void insertWord(const QString& s, uint index=0); bool equal(QTrie* o); void dump(int indent=0); private: TrieList children; bool isword; friend class QDawgPrivate; int maxdepth; int decendants; int key; void distributeKeys(TrieClubDirectory& directory); QTrie* clubLeader(TrieClubDirectory& directory); int collectKeys(); friend class TriePtr; friend class TrieList; }; QTrie::QTrie() { key = 0; isword = FALSE; } QTrie::~QTrie() { // NOTE: we do not delete the children - after conversion to DAWG // it's too difficult. The QTrie's are deleted via the directory. } void QTrie::insertWord(const QString& s, uint index) { if ( index == s.length() ) { isword = TRUE; } else { QTrie* t = children.findAdd(s[index]); t->insertWord(s,index+1); } } bool QTrie::equal(QTrie* o) { if ( o == this ) return TRUE; if ( isword != o->isword ) return FALSE; return children.equal(o->children); } void QTrie::dump(int indent) { for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) { QTrie* s = (*it).p; for (int in=0; in<indent; in++) fputc(' ',stderr); fprintf(stderr," %c %d %s %p\n",(*it).letter.unicode(), s->key,s->isword?"word":"",s); s->dump(indent+2); } } void QTrie::distributeKeys(TrieClubDirectory& directory) { maxdepth = INT_MIN; decendants = children.count(); key = 0; for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) { QTrie* s = (*it).p; QChar l = (*it).letter; s->distributeKeys(directory); key = key*64+l.unicode()+s->key*5; decendants += s->decendants; if ( s->maxdepth+1 > maxdepth ) maxdepth = s->maxdepth+1; } if ( decendants ) { key += decendants + maxdepth*256 + children.count() * 65536; if ( !key ) key++; // unlikely } TrieClub* c = directory[key]; if ( !c ) directory.insert(key, (c = new TrieClub) ); c->prepend(this); } QTrie* QTrie::clubLeader(TrieClubDirectory& directory) { if ( !key ) return directory[0]->first(); for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) { QTrie* t= (*it).p->clubLeader(directory); (*it).p = t; } TrieClub *club = directory[key]; for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) { QTrie* o = *it; if ( o->equal(this) ) return o; } return this; } int QTrie::collectKeys() { int n=0; if ( key ) key=0,n+=children.count(); for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) n += (*it).p->collectKeys(); return n; } int TriePtr::operator <(const TriePtr& o) const { return letter < o.letter; } int TriePtr::operator >(const TriePtr& o) const { return letter > o.letter; } int TriePtr::operator <=(const TriePtr& o) const { return letter <= o.letter; } bool TrieList::equal(TrieList& l) { if ( count() != l.count() ) return FALSE; sort(); l.sort(); ConstIterator it2 = begin(); ConstIterator it = l.begin(); for( ; it != l.end(); ++it, ++it2 ) if ( (*it).letter != (*it2).letter || ! (*it).p->equal((*it2).p) ) return FALSE; return TRUE; } QTrie* TrieList::findAdd(QChar c) { for (Iterator it=begin(); it!=end(); ++it) { if ( (*it).letter == c ) return (*it).p; } TriePtr p; p.p = new QTrie; p.letter = c; prepend(p); sorted=FALSE; sort(); return p.p; } static const char* dawg_sig = "QDAWG100"; class QDawgPrivate { public: QDawgPrivate(QIODevice* dev) { QDataStream ds(dev); char sig[8]; ds.readRawBytes(sig,8); if ( !strncmp(dawg_sig,sig,8) ) { uint n; char* nn; ds.readBytes(nn,n); // #### endianness problem ignored. node = (QDawg::Node*)nn; nodes = n / sizeof(QDawg::Node); } else { node = 0; } } bool ok() const { return node; } QDawgPrivate(uchar* mem) { if ( !strncmp(dawg_sig,(char*)mem,8) ) { mem += 8; int n = ((mem[0]*256+mem[1])*256+mem[2])*256+mem[3]; mem += 4; // #### endianness problem ignored. node = (QDawg::Node*)((char*)mem); nodes = n / sizeof(QDawg::Node); } } QDawgPrivate(QTrie* t) // destroys the QTrie. { TrieClubDirectory directory(9973); t->distributeKeys(directory); QTrie* l = t->clubLeader(directory); ASSERT(l==t); generateArray(t); TrieClub *club; for (QIntDictIterator<TrieClub> dit(directory); (club=dit); ++dit) { for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) { delete *it; } delete club; } } bool write(QIODevice* dev) { QDataStream ds(dev); ds.writeRawBytes(dawg_sig,8); // #### endianness problem ignored. ds.writeBytes((char*)node,sizeof(QDawg::Node)*nodes); return dev->state() == IO_Ok; } void dumpWords(int nid=0, int index=0) { static char word[256]; // ick latin1 int i=0; do { QDawg::Node& n = node[nid+i]; word[index] = n.let; if ( n.isword ) fprintf(stderr,"%.*s\n",index+1,word); if ( n.offset ) dumpWords(n.offset+nid+i,index+1); } while (!node[nid+i++].islast); } void dump(int nid=0, int indent=0) { int i=0; do { QDawg::Node& n = node[nid+i]; fprintf(stderr,"%d: ",nid+i); for (int in=0; in<indent; in++) fputc(' ',stderr); fprintf(stderr," %c %d %d %d\n",n.let, n.isword,n.islast,n.offset); if ( n.offset ) dump(n.offset+nid+i,indent+2); } while (!node[nid+i++].islast); } int countWords(int nid=0) { int t=0; int i=0; do { QDawg::Node& n = node[nid+i]; if ( n.isword ) t++; if ( n.offset ) t+=countWords(n.offset+nid+i); } while (!node[nid+i++].islast); return t; } bool contains(const QString& s, int nid=0, int index=0) const { int i=0; do { QDawg::Node& n = node[nid+i]; if ( s[index] == QChar((ushort)n.let) ) { if ( n.isword && index == (int)s.length()-1 ) return TRUE; if ( n.offset ) return contains(s,n.offset+nid+i,index+1); } } while (!node[nid+i++].islast); return FALSE; } void appendAllWords(QStringList& list, int nid=0, QString s="") const { int i=0; int next = s.length(); do { QDawg::Node& n = node[nid+i]; s[next] = QChar((ushort)n.let); if ( n.isword ) list.append(s); if ( n.offset ) appendAllWords(list, n.offset+nid+i, s); } while (!node[nid+i++].islast); } const QDawg::Node* root() { return node; } private: void generateArray(QTrie* t) { nodes = 0; int n = t->collectKeys(); node = new QDawg::Node[n]; appendToArray(t); ASSERT(n == nodes); } int appendToArray(QTrie* t) { if ( !t->key ) { if ( !t->children.count() ) return 0; t->key = nodes; nodes += t->children.count(); QDawg::Node* n = &node[t->key-1]; int here = t->key; for (TrieList::Iterator it=t->children.begin(); it!=t->children.end(); ++it) { QTrie* s = (*it).p; ++n; n->let = (*it).letter.unicode(); n->isword = s->isword; n->islast = 0; n->offset = appendToArray(s); if ( n->offset ) { int t = n->offset-here; n->offset=t; if ( n->offset != t ) qWarning("Overflow: too many words"); } here++; } n->islast = 1; } return t->key; } private: int nodes; QDawg::Node *node; }; /*! \class QDawg qdawg.h \brief The QDawg class provides an implementation of a Directed Acyclic Word Graph. A DAWG provides very fast look-up of words in a word list. The word list is created using readFile(), read() or createFromWords(). A list of all the DAWG's words is returned by allWords(), and the total number of words is returned by countWords(). Use contains() to see if a particular word is in the DAWG. The root \link qdawg-node.html node\endlink is returned by root(). A global DAWG is maintained for the current locale. See the \l Global class for details. The structure of a DAWG is a graph of \link qdawg-node.html Nodes\endlink. There are no cycles in the graph (since there are no inifinitely repeating words). Each \link qdawg-node.html Node\endlink is a member of a list of \link qdawg-node.html Nodes\endlink called a child list. Each \link qdawg-node.html Node\endlink in the child list has a \e letter, an \e isWord flag, at most one \e jump arc, and at most one arc to the next child in the list. If you traverse the \link qdawg-node.html Nodes\endlink in a DAWG, starting from the root(), and you concatenate all the letters from the single child in each child list that you visit, at every \link qdawg-node.html Node\endlink which has the isWord flag set your concatenation will be a word in the list represented by the DAWG. For example, the DAWG below represents the word list: ban, band, can, cane, cans, pan, pane, pans. This structuring not only provides O(1) lookup of words in the word list, but also produces a smaller storage file than a plain text file word list. \img qdawg.png */ /*! Constructs a new empty DAWG. */ QDawg::QDawg() { d = 0; } /*! Deletes the DAWG. */ QDawg::~QDawg() { delete d; } /*! \overload Replaces all the DAWG's words with words read from \a dev. */ bool QDawg::createFromWords(QIODevice* dev) { delete d; QTextStream i(dev); QTrie* trie = new QTrie; int n=0; while (!i.atEnd()) { trie->insertWord(QString::fromUtf8(i.readLine())); n++; } if ( n ) d = new QDawgPrivate(trie); else d = 0; return TRUE; } /*! Replaces all the DAWG's words with the words in the \a list. */ void QDawg::createFromWords(const QStringList& list) { delete d; if ( list.count() ) { QTrie* trie = new QTrie; for (QStringList::ConstIterator it = list.begin(); it != list.end(); ++it) { trie->insertWord(*it); } d = new QDawgPrivate(trie); } else { d = 0; } } /*! Returns a list of all the words in the DAWG. */ QStringList QDawg::allWords() const { QStringList result; if ( d ) d->appendAllWords(result); return result; } /*! Replaces the DAWG with the DAWG in \a filename. The file is memory-mapped. \sa write() */ bool QDawg::readFile(const QString& filename) { delete d; d = 0; int f = ::open( QFile::encodeName(filename), O_RDONLY ); if ( f < 0 ) return FALSE; struct stat st; if ( !fstat( f, &st ) ) { char * tmp = (char*)mmap( 0, st.st_size, // any address, whole file PROT_READ, // read-only memory MAP_FILE | MAP_PRIVATE, // swap-backed map from file f, 0 ); // from offset 0 of f if ( tmp && tmp != (char*)MAP_FAILED ) d = new QDawgPrivate((uchar*)tmp); } ::close( f ); return d; } /*! Replaces the DAWG with the DAWG in \a dev. The file is memory-mapped. \sa write() */ bool QDawg::read(QIODevice* dev) { delete d; d = new QDawgPrivate(dev); if ( d->ok() ) return TRUE; delete d; d = 0; return FALSE; } /*! Writes the DAWG to \a dev, in a custom QDAWG format. */ bool QDawg::write(QIODevice* dev) const { return d ? d->write(dev) : TRUE; } /*! Returns the number of words in the DAWG. */ int QDawg::countWords() const { return d ? d->countWords() : 0; } /*! Returns the root \link qdawg-node.html Node\endlink of the DAWG. */ const QDawg::Node* QDawg::root() const { return d ? d->root() : 0; } /*! Returns TRUE if the DAWG contains the word \a s; otherwise returns FALSE. */ bool QDawg::contains(const QString& s) const { return d ? d->contains(s) : FALSE; } /*! \internal For debugging: prints out the DAWG contents. */ void QDawg::dump() const { if ( d ) d->dump(); } /*! \class QDawg::Node qdawg.h \brief The QDawg::Node class represents one node of a QDawg. */ /*! \fn QChar QDawg::Node::letter() const Returns this Node's letter. */ /*! \fn bool QDawg::Node::isWord() const Returns TRUE if this Node is the end of a word; otherwise returns FALSE. */ /*! \fn bool QDawg::Node::isLast() const Returns TRUE if this Node is the last in the child list; otherwise returns FALSE. */ /*! \fn const Node* QDawg::Node::next() const Returns the next child Node in the child list or 0 if the current Node isLast(). */ /*! \fn const Node* QDawg::Node::jump() const Returns the node connected to this Node. */