-rw-r--r-- | library/qdawg.cpp | 2 |
1 files changed, 0 insertions, 2 deletions
diff --git a/library/qdawg.cpp b/library/qdawg.cpp index af5dc82..2ea5734 100644 --- a/library/qdawg.cpp +++ b/library/qdawg.cpp | |||
@@ -1,535 +1,533 @@ | |||
1 | /********************************************************************** | 1 | /********************************************************************** |
2 | ** Copyright (C) 2000-2002 Trolltech AS. All rights reserved. | 2 | ** Copyright (C) 2000-2002 Trolltech AS. All rights reserved. |
3 | ** | 3 | ** |
4 | ** This file is part of the Qtopia Environment. | 4 | ** This file is part of the Qtopia Environment. |
5 | ** | 5 | ** |
6 | ** This file may be distributed and/or modified under the terms of the | 6 | ** This file may be distributed and/or modified under the terms of the |
7 | ** GNU General Public License version 2 as published by the Free Software | 7 | ** GNU General Public License version 2 as published by the Free Software |
8 | ** Foundation and appearing in the file LICENSE.GPL included in the | 8 | ** Foundation and appearing in the file LICENSE.GPL included in the |
9 | ** packaging of this file. | 9 | ** packaging of this file. |
10 | ** | 10 | ** |
11 | ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE | 11 | ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE |
12 | ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. | 12 | ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. |
13 | ** | 13 | ** |
14 | ** See http://www.trolltech.com/gpl/ for GPL licensing information. | 14 | ** See http://www.trolltech.com/gpl/ for GPL licensing information. |
15 | ** | 15 | ** |
16 | ** Contact info@trolltech.com if any conditions of this licensing are | 16 | ** Contact info@trolltech.com if any conditions of this licensing are |
17 | ** not clear to you. | 17 | ** not clear to you. |
18 | ** | 18 | ** |
19 | **********************************************************************/ | 19 | **********************************************************************/ |
20 | #include "qdawg.h" | 20 | #include "qdawg.h" |
21 | #include <qintdict.h> | 21 | #include <qintdict.h> |
22 | #include <qvaluelist.h> | ||
23 | #include <qtextstream.h> | ||
24 | #include <qfile.h> | 22 | #include <qfile.h> |
25 | #include <qtl.h> | 23 | #include <qtl.h> |
26 | 24 | ||
27 | #include <limits.h> | 25 | #include <limits.h> |
28 | #include <stdio.h> | 26 | #include <stdio.h> |
29 | 27 | ||
30 | // for mmap | 28 | // for mmap |
31 | #include <sys/types.h> | 29 | #include <sys/types.h> |
32 | #include <sys/stat.h> | 30 | #include <sys/stat.h> |
33 | #include <sys/mman.h> | 31 | #include <sys/mman.h> |
34 | #include <fcntl.h> | 32 | #include <fcntl.h> |
35 | #include <errno.h> | 33 | #include <errno.h> |
36 | #include <unistd.h> | 34 | #include <unistd.h> |
37 | 35 | ||
38 | class QDawgPrivate; | 36 | class QDawgPrivate; |
39 | class QTrie; | 37 | class QTrie; |
40 | 38 | ||
41 | typedef QValueList<QTrie*> TrieClub; | 39 | typedef QValueList<QTrie*> TrieClub; |
42 | typedef QIntDict<TrieClub> TrieClubDirectory; | 40 | typedef QIntDict<TrieClub> TrieClubDirectory; |
43 | 41 | ||
44 | class TriePtr { | 42 | class TriePtr { |
45 | public: | 43 | public: |
46 | QChar letter; | 44 | QChar letter; |
47 | QTrie* p; | 45 | QTrie* p; |
48 | int operator <(const TriePtr& o) const; | 46 | int operator <(const TriePtr& o) const; |
49 | int operator >(const TriePtr& o) const; | 47 | int operator >(const TriePtr& o) const; |
50 | int operator <=(const TriePtr& o) const; | 48 | int operator <=(const TriePtr& o) const; |
51 | }; | 49 | }; |
52 | 50 | ||
53 | class TrieList : public QValueList<TriePtr> { | 51 | class TrieList : public QValueList<TriePtr> { |
54 | bool sorted; | 52 | bool sorted; |
55 | public: | 53 | public: |
56 | TrieList() | 54 | TrieList() |
57 | { | 55 | { |
58 | sorted=TRUE; | 56 | sorted=TRUE; |
59 | } | 57 | } |
60 | 58 | ||
61 | QTrie* findAdd(QChar c); | 59 | QTrie* findAdd(QChar c); |
62 | bool equal(TrieList& l); | 60 | bool equal(TrieList& l); |
63 | 61 | ||
64 | void sort() | 62 | void sort() |
65 | { | 63 | { |
66 | if ( !sorted ) { | 64 | if ( !sorted ) { |
67 | qHeapSort(*this); | 65 | qHeapSort(*this); |
68 | sorted = TRUE; | 66 | sorted = TRUE; |
69 | } | 67 | } |
70 | } | 68 | } |
71 | }; | 69 | }; |
72 | 70 | ||
73 | // A fast but memory-wasting temporary class. The Dawg is the goal. | 71 | // A fast but memory-wasting temporary class. The Dawg is the goal. |
74 | class QTrie { | 72 | class QTrie { |
75 | public: | 73 | public: |
76 | QTrie(); | 74 | QTrie(); |
77 | ~QTrie(); | 75 | ~QTrie(); |
78 | 76 | ||
79 | void insertWord(const QString& s, uint index=0); | 77 | void insertWord(const QString& s, uint index=0); |
80 | bool equal(QTrie* o); | 78 | bool equal(QTrie* o); |
81 | void dump(int indent=0); | 79 | void dump(int indent=0); |
82 | 80 | ||
83 | private: | 81 | private: |
84 | TrieList children; | 82 | TrieList children; |
85 | bool isword; | 83 | bool isword; |
86 | 84 | ||
87 | friend class QDawgPrivate; | 85 | friend class QDawgPrivate; |
88 | int maxdepth; | 86 | int maxdepth; |
89 | int decendants; | 87 | int decendants; |
90 | int key; | 88 | int key; |
91 | void distributeKeys(TrieClubDirectory& directory); | 89 | void distributeKeys(TrieClubDirectory& directory); |
92 | QTrie* clubLeader(TrieClubDirectory& directory); | 90 | QTrie* clubLeader(TrieClubDirectory& directory); |
93 | int collectKeys(); | 91 | int collectKeys(); |
94 | friend class TriePtr; | 92 | friend class TriePtr; |
95 | friend class TrieList; | 93 | friend class TrieList; |
96 | }; | 94 | }; |
97 | 95 | ||
98 | QTrie::QTrie() | 96 | QTrie::QTrie() |
99 | { | 97 | { |
100 | key = 0; | 98 | key = 0; |
101 | isword = FALSE; | 99 | isword = FALSE; |
102 | } | 100 | } |
103 | 101 | ||
104 | QTrie::~QTrie() | 102 | QTrie::~QTrie() |
105 | { | 103 | { |
106 | // NOTE: we do not delete the children - after conversion to DAWG | 104 | // NOTE: we do not delete the children - after conversion to DAWG |
107 | // it's too difficult. The QTrie's are deleted via the directory. | 105 | // it's too difficult. The QTrie's are deleted via the directory. |
108 | } | 106 | } |
109 | 107 | ||
110 | void QTrie::insertWord(const QString& s, uint index) | 108 | void QTrie::insertWord(const QString& s, uint index) |
111 | { | 109 | { |
112 | if ( index == s.length() ) { | 110 | if ( index == s.length() ) { |
113 | isword = TRUE; | 111 | isword = TRUE; |
114 | } else { | 112 | } else { |
115 | QTrie* t = children.findAdd(s[index]); | 113 | QTrie* t = children.findAdd(s[index]); |
116 | t->insertWord(s,index+1); | 114 | t->insertWord(s,index+1); |
117 | } | 115 | } |
118 | } | 116 | } |
119 | 117 | ||
120 | bool QTrie::equal(QTrie* o) | 118 | bool QTrie::equal(QTrie* o) |
121 | { | 119 | { |
122 | if ( o == this ) return TRUE; | 120 | if ( o == this ) return TRUE; |
123 | if ( isword != o->isword ) | 121 | if ( isword != o->isword ) |
124 | return FALSE; | 122 | return FALSE; |
125 | return children.equal(o->children); | 123 | return children.equal(o->children); |
126 | } | 124 | } |
127 | 125 | ||
128 | void QTrie::dump(int indent) | 126 | void QTrie::dump(int indent) |
129 | { | 127 | { |
130 | for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) { | 128 | for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) { |
131 | QTrie* s = (*it).p; | 129 | QTrie* s = (*it).p; |
132 | for (int in=0; in<indent; in++) | 130 | for (int in=0; in<indent; in++) |
133 | fputc(' ',stderr); | 131 | fputc(' ',stderr); |
134 | fprintf(stderr," %c %d %s %p\n",(*it).letter.unicode(), | 132 | fprintf(stderr," %c %d %s %p\n",(*it).letter.unicode(), |
135 | s->key,s->isword?"word":"",s); | 133 | s->key,s->isword?"word":"",s); |
136 | s->dump(indent+2); | 134 | s->dump(indent+2); |
137 | } | 135 | } |
138 | } | 136 | } |
139 | 137 | ||
140 | void QTrie::distributeKeys(TrieClubDirectory& directory) | 138 | void QTrie::distributeKeys(TrieClubDirectory& directory) |
141 | { | 139 | { |
142 | maxdepth = INT_MIN; | 140 | maxdepth = INT_MIN; |
143 | decendants = children.count(); | 141 | decendants = children.count(); |
144 | key = 0; | 142 | key = 0; |
145 | for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) { | 143 | for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) { |
146 | QTrie* s = (*it).p; | 144 | QTrie* s = (*it).p; |
147 | QChar l = (*it).letter; | 145 | QChar l = (*it).letter; |
148 | s->distributeKeys(directory); | 146 | s->distributeKeys(directory); |
149 | key = key*64+l.unicode()+s->key*5; | 147 | key = key*64+l.unicode()+s->key*5; |
150 | decendants += s->decendants; | 148 | decendants += s->decendants; |
151 | if ( s->maxdepth+1 > maxdepth ) | 149 | if ( s->maxdepth+1 > maxdepth ) |
152 | maxdepth = s->maxdepth+1; | 150 | maxdepth = s->maxdepth+1; |
153 | } | 151 | } |
154 | if ( decendants ) { | 152 | if ( decendants ) { |
155 | key += decendants + maxdepth*256 + children.count() * 65536; | 153 | key += decendants + maxdepth*256 + children.count() * 65536; |
156 | if ( !key ) key++; // unlikely | 154 | if ( !key ) key++; // unlikely |
157 | } | 155 | } |
158 | TrieClub* c = directory[key]; | 156 | TrieClub* c = directory[key]; |
159 | if ( !c ) directory.insert(key, (c = new TrieClub) ); | 157 | if ( !c ) directory.insert(key, (c = new TrieClub) ); |
160 | c->prepend(this); | 158 | c->prepend(this); |
161 | } | 159 | } |
162 | 160 | ||
163 | QTrie* QTrie::clubLeader(TrieClubDirectory& directory) | 161 | QTrie* QTrie::clubLeader(TrieClubDirectory& directory) |
164 | { | 162 | { |
165 | if ( !key ) return directory[0]->first(); | 163 | if ( !key ) return directory[0]->first(); |
166 | for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) { | 164 | for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) { |
167 | QTrie* t= (*it).p->clubLeader(directory); | 165 | QTrie* t= (*it).p->clubLeader(directory); |
168 | (*it).p = t; | 166 | (*it).p = t; |
169 | } | 167 | } |
170 | TrieClub *club = directory[key]; | 168 | TrieClub *club = directory[key]; |
171 | for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) { | 169 | for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) { |
172 | QTrie* o = *it; | 170 | QTrie* o = *it; |
173 | if ( o->equal(this) ) | 171 | if ( o->equal(this) ) |
174 | return o; | 172 | return o; |
175 | } | 173 | } |
176 | return this; | 174 | return this; |
177 | } | 175 | } |
178 | 176 | ||
179 | int QTrie::collectKeys() | 177 | int QTrie::collectKeys() |
180 | { | 178 | { |
181 | int n=0; | 179 | int n=0; |
182 | if ( key ) key=0,n+=children.count(); | 180 | if ( key ) key=0,n+=children.count(); |
183 | for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) | 181 | for (TrieList::Iterator it=children.begin(); it!=children.end(); ++it) |
184 | n += (*it).p->collectKeys(); | 182 | n += (*it).p->collectKeys(); |
185 | return n; | 183 | return n; |
186 | } | 184 | } |
187 | 185 | ||
188 | int TriePtr::operator <(const TriePtr& o) const | 186 | int TriePtr::operator <(const TriePtr& o) const |
189 | { return letter < o.letter; } | 187 | { return letter < o.letter; } |
190 | int TriePtr::operator >(const TriePtr& o) const | 188 | int TriePtr::operator >(const TriePtr& o) const |
191 | { return letter > o.letter; } | 189 | { return letter > o.letter; } |
192 | int TriePtr::operator <=(const TriePtr& o) const | 190 | int TriePtr::operator <=(const TriePtr& o) const |
193 | { return letter <= o.letter; } | 191 | { return letter <= o.letter; } |
194 | 192 | ||
195 | bool TrieList::equal(TrieList& l) | 193 | bool TrieList::equal(TrieList& l) |
196 | { | 194 | { |
197 | if ( count() != l.count() ) | 195 | if ( count() != l.count() ) |
198 | return FALSE; | 196 | return FALSE; |
199 | sort(); l.sort(); | 197 | sort(); l.sort(); |
200 | ConstIterator it2 = begin(); | 198 | ConstIterator it2 = begin(); |
201 | ConstIterator it = l.begin(); | 199 | ConstIterator it = l.begin(); |
202 | for( ; it != l.end(); ++it, ++it2 ) | 200 | for( ; it != l.end(); ++it, ++it2 ) |
203 | if ( (*it).letter != (*it2).letter || ! (*it).p->equal((*it2).p) ) | 201 | if ( (*it).letter != (*it2).letter || ! (*it).p->equal((*it2).p) ) |
204 | return FALSE; | 202 | return FALSE; |
205 | return TRUE; | 203 | return TRUE; |
206 | } | 204 | } |
207 | QTrie* TrieList::findAdd(QChar c) | 205 | QTrie* TrieList::findAdd(QChar c) |
208 | { | 206 | { |
209 | for (Iterator it=begin(); it!=end(); ++it) { | 207 | for (Iterator it=begin(); it!=end(); ++it) { |
210 | if ( (*it).letter == c ) | 208 | if ( (*it).letter == c ) |
211 | return (*it).p; | 209 | return (*it).p; |
212 | } | 210 | } |
213 | TriePtr p; | 211 | TriePtr p; |
214 | p.p = new QTrie; | 212 | p.p = new QTrie; |
215 | p.letter = c; | 213 | p.letter = c; |
216 | prepend(p); | 214 | prepend(p); |
217 | sorted=FALSE; | 215 | sorted=FALSE; |
218 | sort(); | 216 | sort(); |
219 | return p.p; | 217 | return p.p; |
220 | } | 218 | } |
221 | 219 | ||
222 | static const char* dawg_sig = "QDAWG100"; | 220 | static const char* dawg_sig = "QDAWG100"; |
223 | 221 | ||
224 | class QDawgPrivate { | 222 | class QDawgPrivate { |
225 | public: | 223 | public: |
226 | QDawgPrivate(QIODevice* dev) | 224 | QDawgPrivate(QIODevice* dev) |
227 | { | 225 | { |
228 | QDataStream ds(dev); | 226 | QDataStream ds(dev); |
229 | char sig[8]; | 227 | char sig[8]; |
230 | ds.readRawBytes(sig,8); | 228 | ds.readRawBytes(sig,8); |
231 | if ( !strncmp(dawg_sig,sig,8) ) { | 229 | if ( !strncmp(dawg_sig,sig,8) ) { |
232 | uint n; | 230 | uint n; |
233 | char* nn; | 231 | char* nn; |
234 | ds.readBytes(nn,n); | 232 | ds.readBytes(nn,n); |
235 | 233 | ||
236 | // #### endianness problem ignored. | 234 | // #### endianness problem ignored. |
237 | node = (QDawg::Node*)nn; | 235 | node = (QDawg::Node*)nn; |
238 | nodes = n / sizeof(QDawg::Node); | 236 | nodes = n / sizeof(QDawg::Node); |
239 | } else { | 237 | } else { |
240 | node = 0; | 238 | node = 0; |
241 | } | 239 | } |
242 | } | 240 | } |
243 | 241 | ||
244 | bool ok() const { return node; } | 242 | bool ok() const { return node; } |
245 | 243 | ||
246 | QDawgPrivate(uchar* mem) | 244 | QDawgPrivate(uchar* mem) |
247 | { | 245 | { |
248 | if ( !strncmp(dawg_sig,(char*)mem,8) ) { | 246 | if ( !strncmp(dawg_sig,(char*)mem,8) ) { |
249 | mem += 8; | 247 | mem += 8; |
250 | 248 | ||
251 | int n = ((mem[0]*256+mem[1])*256+mem[2])*256+mem[3]; | 249 | int n = ((mem[0]*256+mem[1])*256+mem[2])*256+mem[3]; |
252 | mem += 4; | 250 | mem += 4; |
253 | 251 | ||
254 | // #### endianness problem ignored. | 252 | // #### endianness problem ignored. |
255 | node = (QDawg::Node*)((char*)mem); | 253 | node = (QDawg::Node*)((char*)mem); |
256 | nodes = n / sizeof(QDawg::Node); | 254 | nodes = n / sizeof(QDawg::Node); |
257 | } | 255 | } |
258 | } | 256 | } |
259 | 257 | ||
260 | QDawgPrivate(QTrie* t) // destroys the QTrie. | 258 | QDawgPrivate(QTrie* t) // destroys the QTrie. |
261 | { | 259 | { |
262 | TrieClubDirectory directory(9973); | 260 | TrieClubDirectory directory(9973); |
263 | t->distributeKeys(directory); | 261 | t->distributeKeys(directory); |
264 | QTrie* l = t->clubLeader(directory); | 262 | QTrie* l = t->clubLeader(directory); |
265 | ASSERT(l==t); | 263 | ASSERT(l==t); |
266 | generateArray(t); | 264 | generateArray(t); |
267 | 265 | ||
268 | TrieClub *club; | 266 | TrieClub *club; |
269 | for (QIntDictIterator<TrieClub> dit(directory); (club=dit); ++dit) | 267 | for (QIntDictIterator<TrieClub> dit(directory); (club=dit); ++dit) |
270 | { | 268 | { |
271 | for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) { | 269 | for (TrieClub::Iterator it = club->begin(); it != club->end(); ++it) { |
272 | delete *it; | 270 | delete *it; |
273 | } | 271 | } |
274 | delete club; | 272 | delete club; |
275 | } | 273 | } |
276 | } | 274 | } |
277 | 275 | ||
278 | bool write(QIODevice* dev) | 276 | bool write(QIODevice* dev) |
279 | { | 277 | { |
280 | QDataStream ds(dev); | 278 | QDataStream ds(dev); |
281 | ds.writeRawBytes(dawg_sig,8); | 279 | ds.writeRawBytes(dawg_sig,8); |
282 | // #### endianness problem ignored. | 280 | // #### endianness problem ignored. |
283 | ds.writeBytes((char*)node,sizeof(QDawg::Node)*nodes); | 281 | ds.writeBytes((char*)node,sizeof(QDawg::Node)*nodes); |
284 | return dev->state() == IO_Ok; | 282 | return dev->state() == IO_Ok; |
285 | } | 283 | } |
286 | 284 | ||
287 | void dumpWords(int nid=0, int index=0) | 285 | void dumpWords(int nid=0, int index=0) |
288 | { | 286 | { |
289 | static char word[256]; // ick latin1 | 287 | static char word[256]; // ick latin1 |
290 | int i=0; | 288 | int i=0; |
291 | do { | 289 | do { |
292 | QDawg::Node& n = node[nid+i]; | 290 | QDawg::Node& n = node[nid+i]; |
293 | word[index] = n.let; | 291 | word[index] = n.let; |
294 | if ( n.isword ) | 292 | if ( n.isword ) |
295 | fprintf(stderr,"%.*s\n",index+1,word); | 293 | fprintf(stderr,"%.*s\n",index+1,word); |
296 | if ( n.offset ) dumpWords(n.offset+nid+i,index+1); | 294 | if ( n.offset ) dumpWords(n.offset+nid+i,index+1); |
297 | } while (!node[nid+i++].islast); | 295 | } while (!node[nid+i++].islast); |
298 | } | 296 | } |
299 | 297 | ||
300 | void dump(int nid=0, int indent=0) | 298 | void dump(int nid=0, int indent=0) |
301 | { | 299 | { |
302 | int i=0; | 300 | int i=0; |
303 | do { | 301 | do { |
304 | QDawg::Node& n = node[nid+i]; | 302 | QDawg::Node& n = node[nid+i]; |
305 | fprintf(stderr,"%d: ",nid+i); | 303 | fprintf(stderr,"%d: ",nid+i); |
306 | for (int in=0; in<indent; in++) | 304 | for (int in=0; in<indent; in++) |
307 | fputc(' ',stderr); | 305 | fputc(' ',stderr); |
308 | fprintf(stderr," %c %d %d %d\n",n.let, | 306 | fprintf(stderr," %c %d %d %d\n",n.let, |
309 | n.isword,n.islast,n.offset); | 307 | n.isword,n.islast,n.offset); |
310 | if ( n.offset ) dump(n.offset+nid+i,indent+2); | 308 | if ( n.offset ) dump(n.offset+nid+i,indent+2); |
311 | } while (!node[nid+i++].islast); | 309 | } while (!node[nid+i++].islast); |
312 | } | 310 | } |
313 | 311 | ||
314 | int countWords(int nid=0) | 312 | int countWords(int nid=0) |
315 | { | 313 | { |
316 | int t=0; | 314 | int t=0; |
317 | int i=0; | 315 | int i=0; |
318 | do { | 316 | do { |
319 | QDawg::Node& n = node[nid+i]; | 317 | QDawg::Node& n = node[nid+i]; |
320 | if ( n.isword ) | 318 | if ( n.isword ) |
321 | t++; | 319 | t++; |
322 | if ( n.offset ) | 320 | if ( n.offset ) |
323 | t+=countWords(n.offset+nid+i); | 321 | t+=countWords(n.offset+nid+i); |
324 | } while (!node[nid+i++].islast); | 322 | } while (!node[nid+i++].islast); |
325 | return t; | 323 | return t; |
326 | } | 324 | } |
327 | 325 | ||
328 | bool contains(const QString& s, int nid=0, int index=0) const | 326 | bool contains(const QString& s, int nid=0, int index=0) const |
329 | { | 327 | { |
330 | int i=0; | 328 | int i=0; |
331 | do { | 329 | do { |
332 | QDawg::Node& n = node[nid+i]; | 330 | QDawg::Node& n = node[nid+i]; |
333 | if ( s[index] == QChar((ushort)n.let) ) { | 331 | if ( s[index] == QChar((ushort)n.let) ) { |
334 | if ( n.isword && index == (int)s.length()-1 ) | 332 | if ( n.isword && index == (int)s.length()-1 ) |
335 | return TRUE; | 333 | return TRUE; |
336 | if ( n.offset ) | 334 | if ( n.offset ) |
337 | return contains(s,n.offset+nid+i,index+1); | 335 | return contains(s,n.offset+nid+i,index+1); |
338 | } | 336 | } |
339 | } while (!node[nid+i++].islast); | 337 | } while (!node[nid+i++].islast); |
340 | return FALSE; | 338 | return FALSE; |
341 | } | 339 | } |
342 | 340 | ||
343 | void appendAllWords(QStringList& list, int nid=0, QString s="") const | 341 | void appendAllWords(QStringList& list, int nid=0, QString s="") const |
344 | { | 342 | { |
345 | int i=0; | 343 | int i=0; |
346 | int next = s.length(); | 344 | int next = s.length(); |
347 | do { | 345 | do { |
348 | QDawg::Node& n = node[nid+i]; | 346 | QDawg::Node& n = node[nid+i]; |
349 | s[next] = QChar((ushort)n.let); | 347 | s[next] = QChar((ushort)n.let); |
350 | if ( n.isword ) | 348 | if ( n.isword ) |
351 | list.append(s); | 349 | list.append(s); |
352 | if ( n.offset ) | 350 | if ( n.offset ) |
353 | appendAllWords(list, n.offset+nid+i, s); | 351 | appendAllWords(list, n.offset+nid+i, s); |
354 | } while (!node[nid+i++].islast); | 352 | } while (!node[nid+i++].islast); |
355 | } | 353 | } |
356 | 354 | ||
357 | const QDawg::Node* root() { return node; } | 355 | const QDawg::Node* root() { return node; } |
358 | 356 | ||
359 | private: | 357 | private: |
360 | void generateArray(QTrie* t) | 358 | void generateArray(QTrie* t) |
361 | { | 359 | { |
362 | nodes = 0; | 360 | nodes = 0; |
363 | int n = t->collectKeys(); | 361 | int n = t->collectKeys(); |
364 | node = new QDawg::Node[n]; | 362 | node = new QDawg::Node[n]; |
365 | appendToArray(t); | 363 | appendToArray(t); |
366 | ASSERT(n == nodes); | 364 | ASSERT(n == nodes); |
367 | } | 365 | } |
368 | 366 | ||
369 | int appendToArray(QTrie* t) | 367 | int appendToArray(QTrie* t) |
370 | { | 368 | { |
371 | if ( !t->key ) { | 369 | if ( !t->key ) { |
372 | if ( !t->children.count() ) | 370 | if ( !t->children.count() ) |
373 | return 0; | 371 | return 0; |
374 | t->key = nodes; | 372 | t->key = nodes; |
375 | nodes += t->children.count(); | 373 | nodes += t->children.count(); |
376 | QDawg::Node* n = &node[t->key-1]; | 374 | QDawg::Node* n = &node[t->key-1]; |
377 | int here = t->key; | 375 | int here = t->key; |
378 | for (TrieList::Iterator it=t->children.begin(); it!=t->children.end(); ++it) { | 376 | for (TrieList::Iterator it=t->children.begin(); it!=t->children.end(); ++it) { |
379 | QTrie* s = (*it).p; | 377 | QTrie* s = (*it).p; |
380 | ++n; | 378 | ++n; |
381 | n->let = (*it).letter.unicode(); | 379 | n->let = (*it).letter.unicode(); |
382 | n->isword = s->isword; | 380 | n->isword = s->isword; |
383 | n->islast = 0; | 381 | n->islast = 0; |
384 | n->offset = appendToArray(s); | 382 | n->offset = appendToArray(s); |
385 | if ( n->offset ) { | 383 | if ( n->offset ) { |
386 | int t = n->offset-here; | 384 | int t = n->offset-here; |
387 | n->offset=t; | 385 | n->offset=t; |
388 | if ( n->offset != t ) | 386 | if ( n->offset != t ) |
389 | qWarning("Overflow: too many words"); | 387 | qWarning("Overflow: too many words"); |
390 | } | 388 | } |
391 | here++; | 389 | here++; |
392 | } | 390 | } |
393 | n->islast = 1; | 391 | n->islast = 1; |
394 | } | 392 | } |
395 | return t->key; | 393 | return t->key; |
396 | } | 394 | } |
397 | 395 | ||
398 | private: | 396 | private: |
399 | int nodes; | 397 | int nodes; |
400 | QDawg::Node *node; | 398 | QDawg::Node *node; |
401 | }; | 399 | }; |
402 | 400 | ||
403 | /*! | 401 | /*! |
404 | \class QDawg qdawg.h | 402 | \class QDawg qdawg.h |
405 | \brief The QDawg class provides an implementation of a Directed Acyclic Word Graph. | 403 | \brief The QDawg class provides an implementation of a Directed Acyclic Word Graph. |
406 | 404 | ||
407 | A DAWG provides very fast look-up of words in a word list. | 405 | A DAWG provides very fast look-up of words in a word list. |
408 | 406 | ||
409 | The word list is created using readFile(), read() or | 407 | The word list is created using readFile(), read() or |
410 | createFromWords(). A list of all the DAWG's words is returned by | 408 | createFromWords(). A list of all the DAWG's words is returned by |
411 | allWords(), and the total number of words is returned by | 409 | allWords(), and the total number of words is returned by |
412 | countWords(). Use contains() to see if a particular word is in the | 410 | countWords(). Use contains() to see if a particular word is in the |
413 | DAWG. The root \link qdawg-node.html node\endlink is returned by root(). | 411 | DAWG. The root \link qdawg-node.html node\endlink is returned by root(). |
414 | 412 | ||
415 | A global DAWG is maintained for the current locale. See the | 413 | A global DAWG is maintained for the current locale. See the |
416 | \l Global class for details. | 414 | \l Global class for details. |
417 | 415 | ||
418 | The structure of a DAWG is a graph of \link qdawg-node.html | 416 | The structure of a DAWG is a graph of \link qdawg-node.html |
419 | Nodes\endlink. There are no cycles in the graph (since there are no | 417 | Nodes\endlink. There are no cycles in the graph (since there are no |
420 | inifinitely repeating words). Each \link qdawg-node.html | 418 | inifinitely repeating words). Each \link qdawg-node.html |
421 | Node\endlink is a member of a list of \link qdawg-node.html | 419 | Node\endlink is a member of a list of \link qdawg-node.html |
422 | Nodes\endlink called a child list. Each \link qdawg-node.html | 420 | Nodes\endlink called a child list. Each \link qdawg-node.html |
423 | Node\endlink in the child list has a \e letter, an \e isWord flag, | 421 | Node\endlink in the child list has a \e letter, an \e isWord flag, |
424 | at most one \e jump arc, and at most one arc to the next child in | 422 | at most one \e jump arc, and at most one arc to the next child in |
425 | the list. | 423 | the list. |
426 | 424 | ||
427 | If you traverse the \link qdawg-node.html Nodes\endlink in a DAWG, | 425 | If you traverse the \link qdawg-node.html Nodes\endlink in a DAWG, |
428 | starting from the root(), and you concatenate all the letters from | 426 | starting from the root(), and you concatenate all the letters from |
429 | the single child in each child list that you visit, at every \link | 427 | the single child in each child list that you visit, at every \link |
430 | qdawg-node.html Node\endlink which has the isWord flag set your | 428 | qdawg-node.html Node\endlink which has the isWord flag set your |
431 | concatenation will be a word in the list represented by the DAWG. | 429 | concatenation will be a word in the list represented by the DAWG. |
432 | 430 | ||
433 | For example, the DAWG below represents the word list: | 431 | For example, the DAWG below represents the word list: |
434 | ban, band, can, cane, cans, pan, pane, pans. | 432 | ban, band, can, cane, cans, pan, pane, pans. |
435 | 433 | ||
436 | This structuring not only provides O(1) lookup of words in the word list, | 434 | This structuring not only provides O(1) lookup of words in the word list, |
437 | but also produces a smaller storage file than a plain text file word list. | 435 | but also produces a smaller storage file than a plain text file word list. |
438 | 436 | ||
439 | \img qdawg.png | 437 | \img qdawg.png |
440 | */ | 438 | */ |
441 | 439 | ||
442 | /*! | 440 | /*! |
443 | Constructs a new empty DAWG. | 441 | Constructs a new empty DAWG. |
444 | */ | 442 | */ |
445 | QDawg::QDawg() | 443 | QDawg::QDawg() |
446 | { | 444 | { |
447 | d = 0; | 445 | d = 0; |
448 | } | 446 | } |
449 | 447 | ||
450 | /*! | 448 | /*! |
451 | Deletes the DAWG. | 449 | Deletes the DAWG. |
452 | */ | 450 | */ |
453 | QDawg::~QDawg() | 451 | QDawg::~QDawg() |
454 | { | 452 | { |
455 | delete d; | 453 | delete d; |
456 | } | 454 | } |
457 | 455 | ||
458 | /*! | 456 | /*! |
459 | \overload | 457 | \overload |
460 | Replaces all the DAWG's words with words read from \a dev. | 458 | Replaces all the DAWG's words with words read from \a dev. |
461 | */ | 459 | */ |
462 | bool QDawg::createFromWords(QIODevice* dev) | 460 | bool QDawg::createFromWords(QIODevice* dev) |
463 | { | 461 | { |
464 | delete d; | 462 | delete d; |
465 | 463 | ||
466 | QTextStream i(dev); | 464 | QTextStream i(dev); |
467 | QTrie* trie = new QTrie; | 465 | QTrie* trie = new QTrie; |
468 | int n=0; | 466 | int n=0; |
469 | while (!i.atEnd()) { | 467 | while (!i.atEnd()) { |
470 | trie->insertWord(QString::fromUtf8(i.readLine())); | 468 | trie->insertWord(QString::fromUtf8(i.readLine())); |
471 | n++; | 469 | n++; |
472 | } | 470 | } |
473 | if ( n ) | 471 | if ( n ) |
474 | d = new QDawgPrivate(trie); | 472 | d = new QDawgPrivate(trie); |
475 | else | 473 | else |
476 | d = 0; | 474 | d = 0; |
477 | return TRUE; | 475 | return TRUE; |
478 | } | 476 | } |
479 | 477 | ||
480 | /*! | 478 | /*! |
481 | Replaces all the DAWG's words with the words in the \a list. | 479 | Replaces all the DAWG's words with the words in the \a list. |
482 | */ | 480 | */ |
483 | void QDawg::createFromWords(const QStringList& list) | 481 | void QDawg::createFromWords(const QStringList& list) |
484 | { | 482 | { |
485 | delete d; | 483 | delete d; |
486 | 484 | ||
487 | if ( list.count() ) { | 485 | if ( list.count() ) { |
488 | QTrie* trie = new QTrie; | 486 | QTrie* trie = new QTrie; |
489 | for (QStringList::ConstIterator it = list.begin(); it != list.end(); ++it) { | 487 | for (QStringList::ConstIterator it = list.begin(); it != list.end(); ++it) { |
490 | trie->insertWord(*it); | 488 | trie->insertWord(*it); |
491 | } | 489 | } |
492 | d = new QDawgPrivate(trie); | 490 | d = new QDawgPrivate(trie); |
493 | } else { | 491 | } else { |
494 | d = 0; | 492 | d = 0; |
495 | } | 493 | } |
496 | } | 494 | } |
497 | 495 | ||
498 | /*! | 496 | /*! |
499 | Returns a list of all the words in the DAWG. | 497 | Returns a list of all the words in the DAWG. |
500 | */ | 498 | */ |
501 | QStringList QDawg::allWords() const | 499 | QStringList QDawg::allWords() const |
502 | { | 500 | { |
503 | QStringList result; | 501 | QStringList result; |
504 | if ( d ) d->appendAllWords(result); | 502 | if ( d ) d->appendAllWords(result); |
505 | return result; | 503 | return result; |
506 | } | 504 | } |
507 | 505 | ||
508 | 506 | ||
509 | /*! | 507 | /*! |
510 | Replaces the DAWG with the DAWG in \a filename. | 508 | Replaces the DAWG with the DAWG in \a filename. |
511 | The file is memory-mapped. | 509 | The file is memory-mapped. |
512 | 510 | ||
513 | \sa write() | 511 | \sa write() |
514 | */ | 512 | */ |
515 | bool QDawg::readFile(const QString& filename) | 513 | bool QDawg::readFile(const QString& filename) |
516 | { | 514 | { |
517 | delete d; | 515 | delete d; |
518 | d = 0; | 516 | d = 0; |
519 | int f = ::open( QFile::encodeName(filename), O_RDONLY ); | 517 | int f = ::open( QFile::encodeName(filename), O_RDONLY ); |
520 | if ( f < 0 ) | 518 | if ( f < 0 ) |
521 | return FALSE; | 519 | return FALSE; |
522 | struct stat st; | 520 | struct stat st; |
523 | if ( !fstat( f, &st ) ) { | 521 | if ( !fstat( f, &st ) ) { |
524 | char * tmp = (char*)mmap( 0, st.st_size, // any address, whole file | 522 | char * tmp = (char*)mmap( 0, st.st_size, // any address, whole file |
525 | PROT_READ, // read-only memory | 523 | PROT_READ, // read-only memory |
526 | MAP_FILE | MAP_PRIVATE, // swap-backed map from file | 524 | MAP_FILE | MAP_PRIVATE, // swap-backed map from file |
527 | f, 0 ); // from offset 0 of f | 525 | f, 0 ); // from offset 0 of f |
528 | if ( tmp && tmp != (char*)MAP_FAILED ) | 526 | if ( tmp && tmp != (char*)MAP_FAILED ) |
529 | d = new QDawgPrivate((uchar*)tmp); | 527 | d = new QDawgPrivate((uchar*)tmp); |
530 | } | 528 | } |
531 | ::close( f ); | 529 | ::close( f ); |
532 | return d; | 530 | return d; |
533 | } | 531 | } |
534 | 532 | ||
535 | /*! | 533 | /*! |