author | kergoth <kergoth> | 2002-01-25 22:14:26 (UTC) |
---|---|---|
committer | kergoth <kergoth> | 2002-01-25 22:14:26 (UTC) |
commit | 15318cad33835e4e2dc620d033e43cd930676cdd (patch) (unidiff) | |
tree | c2fa0399a2c47fda8e2cd0092c73a809d17f68eb /library/qdawg.cpp | |
download | opie-15318cad33835e4e2dc620d033e43cd930676cdd.zip opie-15318cad33835e4e2dc620d033e43cd930676cdd.tar.gz opie-15318cad33835e4e2dc620d033e43cd930676cdd.tar.bz2 |
Initial revision
-rw-r--r-- | library/qdawg.cpp | 510 |
1 files changed, 510 insertions, 0 deletions
diff --git a/library/qdawg.cpp b/library/qdawg.cpp new file mode 100644 index 0000000..3615693 --- a/dev/null +++ b/library/qdawg.cpp | |||
@@ -0,0 +1,510 @@ | |||
1 | /********************************************************************** | ||
2 | ** Copyright (C) 2000 Trolltech AS. All rights reserved. | ||
3 | ** | ||
4 | ** This file is part of Qtopia Environment. | ||
5 | ** | ||
6 | ** This file may be distributed and/or modified under the terms of the | ||
7 | ** GNU General Public License version 2 as published by the Free Software | ||
8 | ** Foundation and appearing in the file LICENSE.GPL included in the | ||
9 | ** packaging of this file. | ||
10 | ** | ||
11 | ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE | ||
12 | ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. | ||
13 | ** | ||
14 | ** See http://www.trolltech.com/gpl/ for GPL licensing information. | ||
15 | ** | ||
16 | ** Contact info@trolltech.com if any conditions of this licensing are | ||
17 | ** not clear to you. | ||
18 | ** | ||
19 | **********************************************************************/ | ||
20 | #include "qdawg.h" | ||
21 | #include <qintdict.h> | ||
22 | #include <qvaluelist.h> | ||
23 | #include <qtextstream.h> | ||
24 | #include <qfile.h> | ||
25 | #include <qtl.h> | ||
26 | |||
27 | #include <limits.h> | ||
28 | #include <stdio.h> | ||
29 | |||
30 | // for mmap | ||
31 | #include <sys/types.h> | ||
32 | #include <sys/stat.h> | ||
33 | #include <sys/mman.h> | ||
34 | #include <fcntl.h> | ||
35 | #include <errno.h> | ||
36 | #include <unistd.h> | ||
37 | |||
38 | class QDawgPrivate; | ||
39 | class QTrie; | ||
40 | |||
41 | typedef QValueList<QTrie*> TrieClub; | ||
42 | typedef QIntDict<TrieClub> TrieClubDirectory; | ||
43 | |||
44 | class TriePtr { | ||
45 | public: | ||
46 | QChar letter; | ||
47 | QTrie* p; | ||
48 | int operator <(const TriePtr& o) const; | ||
49 | int operator >(const TriePtr& o) const; | ||
50 | int operator <=(const TriePtr& o) const; | ||
51 | }; | ||
52 | |||
53 | class TrieList : public QValueList<TriePtr> { | ||
54 | bool sorted; | ||
55 | public: | ||
56 | TrieList() | ||
57 | { | ||
58 | sorted=TRUE; | ||
59 | } | ||
60 | |||
61 | QTrie* findAdd(QChar c); | ||
62 | bool equal(TrieList& l); | ||
63 | |||
64 | void sort() | ||
65 | { | ||
66 | if ( !sorted ) { | ||
67 | qHeapSort(*this); | ||
68 | sorted = TRUE; | ||
69 | } | ||
70 | } | ||
71 | }; | ||
72 | |||
73 | // A fast but memory-wasting temporary class. The Dawg is the goal. | ||
74 | class QTrie { | ||
75 | public: | ||
76 | QTrie(); | ||
77 | ~QTrie(); | ||
78 | |||
79 | void insertWord(const QString& s, uint index=0); | ||
80 | bool equal(QTrie* o); | ||
81 | void dump(int indent=0); | ||
82 | |||
83 | private: | ||
84 | TrieList children; | ||
85 | bool isword; | ||
86 | |||
87 | friend class QDawgPrivate; | ||
88 | int maxdepth; | ||
89 | int decendants; | ||
90 | int key; | ||
91 | void distributeKeys(TrieClubDirectory& directory); | ||
92 | QTrie* clubLeader(TrieClubDirectory& directory); | ||
93 | int collectKeys(); | ||
94 | friend class TriePtr; | ||
95 | friend class TrieList; | ||
96 | }; | ||
97 | |||
98 | QTrie::QTrie() | ||
99 | { | ||
100 | key = 0; | ||
101 | isword = FALSE; | ||
102 | } | ||
103 | |||
104 | QTrie::~QTrie() | ||
105 | { | ||
106 | // NOTE: we do not delete the children - after conversion to DAWG | ||
107 | // it's too difficult. The QTrie's are deleted via the directory. | ||
108 | } | ||
109 | |||
110 | void QTrie::insertWord(const QString& s, uint index) | ||
111 | { | ||
112 | if ( index == s.length() ) { | ||
113 | isword = TRUE; | ||
114 | } else { | ||
115 | QTrie* t = children.findAdd(s[index]); | ||
116 | t->insertWord(s,index+1); | ||
117 | } | ||
118 | } | ||
119 | |||
120 | bool QTrie::equal(QTrie* o) | ||
121 | { | ||
122 | if ( o == this ) return TRUE; | ||
123 | if ( isword != o->isword ) | ||
124 | return FALSE; | ||
125 | return children.equal(o->children); | ||
126 | } | ||
127 | |||
128 | void QTrie::dump(int indent) | ||
129 | { | ||
130 | for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) { | ||
131 | QTrie* s = (*it).p; | ||
132 | for (int in=0; in<indent; in++) | ||
133 | fputc(' ',stderr); | ||
134 | fprintf(stderr," %c %d %s %p\n",(*it).letter.unicode(), | ||
135 | s->key,s->isword?"word":"",s); | ||
136 | s->dump(indent+2); | ||
137 | } | ||
138 | } | ||
139 | |||
140 | void QTrie::distributeKeys(TrieClubDirectory& directory) | ||
141 | { | ||
142 | maxdepth = INT_MIN; | ||
143 | decendants = children.count(); | ||
144 | key = 0; | ||
145 | for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) { | ||
146 | QTrie* s = (*it).p; | ||
147 | QChar l = (*it).letter; | ||
148 | s->distributeKeys(directory); | ||
149 | key = key*64+l.unicode()+s->key*5; | ||
150 | decendants += s->decendants; | ||
151 | if ( s->maxdepth+1 > maxdepth ) | ||
152 | maxdepth = s->maxdepth+1; | ||
153 | } | ||
154 | if ( decendants ) { | ||
155 | key += decendants + maxdepth*256 + children.count() * 65536; | ||
156 | if ( !key ) key++; // unlikely | ||
157 | } | ||
158 | TrieClub* c = directory[key]; | ||
159 | if ( !c ) directory.insert(key, (c = new TrieClub) ); | ||
160 | c->prepend(this); | ||
161 | } | ||
162 | |||
163 | QTrie* QTrie::clubLeader(TrieClubDirectory& directory) | ||
164 | { | ||
165 | if ( !key ) return directory[0]->first(); | ||
166 | for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) { | ||
167 | QTrie* t= (*it).p->clubLeader(directory); | ||
168 | (*it).p = t; | ||
169 | } | ||
170 | TrieClub *club = directory[key]; | ||
171 | for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) { | ||
172 | QTrie* o = *it; | ||
173 | if ( o->equal(this) ) | ||
174 | return o; | ||
175 | } | ||
176 | return this; | ||
177 | } | ||
178 | |||
179 | int QTrie::collectKeys() | ||
180 | { | ||
181 | int n=0; | ||
182 | if ( key ) key=0,n+=children.count(); | ||
183 | for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) | ||
184 | n += (*it).p->collectKeys(); | ||
185 | return n; | ||
186 | } | ||
187 | |||
188 | int TriePtr::operator <(const TriePtr& o) const | ||
189 | { return letter < o.letter; } | ||
190 | int TriePtr::operator >(const TriePtr& o) const | ||
191 | { return letter > o.letter; } | ||
192 | int TriePtr::operator <=(const TriePtr& o) const | ||
193 | { return letter <= o.letter; } | ||
194 | |||
195 | bool TrieList::equal(TrieList& l) | ||
196 | { | ||
197 | if ( count() != l.count() ) | ||
198 | return FALSE; | ||
199 | sort(); l.sort(); | ||
200 | ConstIterator it2 = begin(); | ||
201 | ConstIterator it = l.begin(); | ||
202 | for( ; it != l.end(); ++it, ++it2 ) | ||
203 | if ( (*it).letter != (*it2).letter || ! (*it).p->equal((*it2).p) ) | ||
204 | return FALSE; | ||
205 | return TRUE; | ||
206 | } | ||
207 | QTrie* TrieList::findAdd(QChar c) | ||
208 | { | ||
209 | for (Iterator it=begin(); it!=end(); ++it) { | ||
210 | if ( (*it).letter == c ) | ||
211 | return (*it).p; | ||
212 | } | ||
213 | TriePtr p; | ||
214 | p.p = new QTrie; | ||
215 | p.letter = c; | ||
216 | prepend(p); | ||
217 | sorted=FALSE; | ||
218 | sort(); | ||
219 | return p.p; | ||
220 | } | ||
221 | |||
222 | static const char* dawg_sig = "QDAWG100"; | ||
223 | |||
224 | class QDawgPrivate { | ||
225 | public: | ||
226 | QDawgPrivate(QIODevice* dev) | ||
227 | { | ||
228 | QDataStream ds(dev); | ||
229 | char sig[8]; | ||
230 | ds.readRawBytes(sig,8); | ||
231 | if ( !strncmp(dawg_sig,sig,8) ) { | ||
232 | uint n; | ||
233 | char* nn; | ||
234 | ds.readBytes(nn,n); | ||
235 | |||
236 | // #### endianness problem ignored. | ||
237 | node = (QDawg::Node*)nn; | ||
238 | nodes = n / sizeof(QDawg::Node); | ||
239 | } else { | ||
240 | node = 0; | ||
241 | } | ||
242 | } | ||
243 | |||
244 | bool ok() const { return node; } | ||
245 | |||
246 | QDawgPrivate(uchar* mem) | ||
247 | { | ||
248 | if ( !strncmp(dawg_sig,(char*)mem,8) ) { | ||
249 | mem += 8; | ||
250 | |||
251 | int n = ((mem[0]*256+mem[1])*256+mem[2])*256+mem[3]; | ||
252 | mem += 4; | ||
253 | |||
254 | // #### endianness problem ignored. | ||
255 | node = (QDawg::Node*)((char*)mem); | ||
256 | nodes = n / sizeof(QDawg::Node); | ||
257 | } | ||
258 | } | ||
259 | |||
260 | QDawgPrivate(QTrie* t) // destroys the QTrie. | ||
261 | { | ||
262 | TrieClubDirectory directory(9973); | ||
263 | t->distributeKeys(directory); | ||
264 | QTrie* l = t->clubLeader(directory); | ||
265 | ASSERT(l==t); | ||
266 | generateArray(t); | ||
267 | |||
268 | TrieClub *club; | ||
269 | for (QIntDictIterator<TrieClub> dit(directory); (club=dit); ++dit) | ||
270 | { | ||
271 | for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) { | ||
272 | delete *it; | ||
273 | } | ||
274 | delete club; | ||
275 | } | ||
276 | } | ||
277 | |||
278 | bool write(QIODevice* dev) | ||
279 | { | ||
280 | QDataStream ds(dev); | ||
281 | ds.writeRawBytes(dawg_sig,8); | ||
282 | // #### endianness problem ignored. | ||
283 | ds.writeBytes((char*)node,sizeof(QDawg::Node)*nodes); | ||
284 | return dev->state() == IO_Ok; | ||
285 | } | ||
286 | |||
287 | void dumpWords(int nid=0, int index=0) | ||
288 | { | ||
289 | static char word[256]; // ick latin1 | ||
290 | int i=0; | ||
291 | do { | ||
292 | QDawg::Node& n = node[nid+i]; | ||
293 | word[index] = n.let; | ||
294 | if ( n.isword ) | ||
295 | fprintf(stderr,"%.*s\n",index+1,word); | ||
296 | if ( n.offset ) dumpWords(n.offset+nid+i,index+1); | ||
297 | } while (!node[nid+i++].islast); | ||
298 | } | ||
299 | |||
300 | void dump(int nid=0, int indent=0) | ||
301 | { | ||
302 | int i=0; | ||
303 | do { | ||
304 | QDawg::Node& n = node[nid+i]; | ||
305 | fprintf(stderr,"%d: ",nid+i); | ||
306 | for (int in=0; in<indent; in++) | ||
307 | fputc(' ',stderr); | ||
308 | fprintf(stderr," %c %d %d %d\n",n.let, | ||
309 | n.isword,n.islast,n.offset); | ||
310 | if ( n.offset ) dump(n.offset+nid+i,indent+2); | ||
311 | } while (!node[nid+i++].islast); | ||
312 | } | ||
313 | |||
314 | int countWords(int nid=0) | ||
315 | { | ||
316 | int t=0; | ||
317 | int i=0; | ||
318 | do { | ||
319 | QDawg::Node& n = node[nid+i]; | ||
320 | if ( n.isword ) | ||
321 | t++; | ||
322 | if ( n.offset ) | ||
323 | t+=countWords(n.offset+nid+i); | ||
324 | } while (!node[nid+i++].islast); | ||
325 | return t; | ||
326 | } | ||
327 | |||
328 | bool contains(const QString& s, int nid=0, int index=0) const | ||
329 | { | ||
330 | int i=0; | ||
331 | do { | ||
332 | QDawg::Node& n = node[nid+i]; | ||
333 | if ( s[index] == QChar((ushort)n.let) ) { | ||
334 | if ( n.isword && index == (int)s.length()-1 ) | ||
335 | return TRUE; | ||
336 | if ( n.offset ) | ||
337 | return contains(s,n.offset+nid+i,index+1); | ||
338 | } | ||
339 | } while (!node[nid+i++].islast); | ||
340 | return FALSE; | ||
341 | } | ||
342 | |||
343 | void appendAllWords(QStringList& list, int nid=0, QString s="") const | ||
344 | { | ||
345 | int i=0; | ||
346 | int next = s.length(); | ||
347 | do { | ||
348 | QDawg::Node& n = node[nid+i]; | ||
349 | s[next] = QChar((ushort)n.let); | ||
350 | if ( n.isword ) | ||
351 | list.append(s); | ||
352 | if ( n.offset ) | ||
353 | appendAllWords(list, n.offset+nid+i, s); | ||
354 | } while (!node[nid+i++].islast); | ||
355 | } | ||
356 | |||
357 | const QDawg::Node* root() { return node; } | ||
358 | |||
359 | private: | ||
360 | void generateArray(QTrie* t) | ||
361 | { | ||
362 | nodes = 0; | ||
363 | int n = t->collectKeys(); | ||
364 | node = new QDawg::Node[n]; | ||
365 | appendToArray(t); | ||
366 | ASSERT(n == nodes); | ||
367 | } | ||
368 | |||
369 | int appendToArray(QTrie* t) | ||
370 | { | ||
371 | if ( !t->key ) { | ||
372 | if ( !t->children.count() ) | ||
373 | return 0; | ||
374 | t->key = nodes; | ||
375 | nodes += t->children.count(); | ||
376 | QDawg::Node* n = &node[t->key-1]; | ||
377 | int here = t->key; | ||
378 | for (TrieList::Iterator it=t->children.begin(); it!=t->children.end(); ++it) { | ||
379 | QTrie* s = (*it).p; | ||
380 | ++n; | ||
381 | n->let = (*it).letter.unicode(); | ||
382 | n->isword = s->isword; | ||
383 | n->islast = 0; | ||
384 | n->offset = appendToArray(s); | ||
385 | if ( n->offset ) { | ||
386 | int t = n->offset-here; | ||
387 | n->offset=t; | ||
388 | if ( n->offset != t ) | ||
389 | qWarning("Overflow: too many words"); | ||
390 | } | ||
391 | here++; | ||
392 | } | ||
393 | n->islast = 1; | ||
394 | } | ||
395 | return t->key; | ||
396 | } | ||
397 | |||
398 | private: | ||
399 | int nodes; | ||
400 | QDawg::Node *node; | ||
401 | }; | ||
402 | |||
403 | |||
404 | QDawg::QDawg() | ||
405 | { | ||
406 | d = 0; | ||
407 | } | ||
408 | |||
409 | QDawg::~QDawg() | ||
410 | { | ||
411 | delete d; | ||
412 | } | ||
413 | |||
414 | bool QDawg::createFromWords(QIODevice* dev) | ||
415 | { | ||
416 | delete d; | ||
417 | |||
418 | QTextStream i(dev); | ||
419 | QTrie* trie = new QTrie; | ||
420 | int n=0; | ||
421 | while (!i.atEnd()) { | ||
422 | trie->insertWord(QString::fromUtf8(i.readLine())); | ||
423 | n++; | ||
424 | } | ||
425 | if ( n ) | ||
426 | d = new QDawgPrivate(trie); | ||
427 | else | ||
428 | d = 0; | ||
429 | return TRUE; | ||
430 | } | ||
431 | |||
432 | void QDawg::createFromWords(const QStringList& list) | ||
433 | { | ||
434 | delete d; | ||
435 | |||
436 | if ( list.count() ) { | ||
437 | QTrie* trie = new QTrie; | ||
438 | for (QStringList::ConstIterator it = list.begin(); it != list.end(); ++it) { | ||
439 | trie->insertWord(*it); | ||
440 | } | ||
441 | d = new QDawgPrivate(trie); | ||
442 | } else { | ||
443 | d = 0; | ||
444 | } | ||
445 | } | ||
446 | |||
447 | QStringList QDawg::allWords() const | ||
448 | { | ||
449 | QStringList result; | ||
450 | if ( d ) d->appendAllWords(result); | ||
451 | return result; | ||
452 | } | ||
453 | |||
454 | |||
455 | bool QDawg::readFile(const QString& filename) | ||
456 | { | ||
457 | delete d; | ||
458 | d = 0; | ||
459 | int f = ::open( QFile::encodeName(filename), O_RDONLY ); | ||
460 | if ( f < 0 ) | ||
461 | return FALSE; | ||
462 | struct stat st; | ||
463 | if ( !fstat( f, &st ) ) { | ||
464 | char * tmp = (char*)mmap( 0, st.st_size, // any address, whole file | ||
465 | PROT_READ, // read-only memory | ||
466 | MAP_FILE | MAP_PRIVATE, // swap-backed map from file | ||
467 | f, 0 ); // from offset 0 of f | ||
468 | if ( tmp && tmp != (char*)MAP_FAILED ) | ||
469 | d = new QDawgPrivate((uchar*)tmp); | ||
470 | } | ||
471 | ::close( f ); | ||
472 | return d; | ||
473 | } | ||
474 | |||
475 | bool QDawg::read(QIODevice* dev) | ||
476 | { | ||
477 | delete d; | ||
478 | d = new QDawgPrivate(dev); | ||
479 | if ( d->ok() ) | ||
480 | return TRUE; | ||
481 | delete d; | ||
482 | d = 0; | ||
483 | return FALSE; | ||
484 | } | ||
485 | |||
486 | bool QDawg::write(QIODevice* dev) const | ||
487 | { | ||
488 | return d ? d->write(dev) : TRUE; | ||
489 | } | ||
490 | |||
491 | int QDawg::countWords() const | ||
492 | { | ||
493 | return d ? d->countWords() : 0; | ||
494 | } | ||
495 | |||
496 | const QDawg::Node* QDawg::root() const | ||
497 | { | ||
498 | return d ? d->root() : 0; | ||
499 | } | ||
500 | |||
501 | bool QDawg::contains(const QString& s) const | ||
502 | { | ||
503 | return d ? d->contains(s) : FALSE; | ||
504 | } | ||
505 | |||
506 | void QDawg::dump() const | ||
507 | { | ||
508 | if ( d ) d->dump(); | ||
509 | } | ||
510 | |||