summaryrefslogtreecommitdiff
path: root/noncore/apps/tinykate/libkate/qt3back
authorjowenn <jowenn>2002-11-10 21:08:01 (UTC)
committer jowenn <jowenn>2002-11-10 21:08:01 (UTC)
commite97a6da57804aa14907dec327fbae71bff9b383e (patch) (unidiff)
tree15f6ee292dba24bdda72f5c72f6d2224c3516763 /noncore/apps/tinykate/libkate/qt3back
parent7c012ee8cd16d8befacc6f6750711443fac0fd5e (diff)
downloadopie-e97a6da57804aa14907dec327fbae71bff9b383e.zip
opie-e97a6da57804aa14907dec327fbae71bff9b383e.tar.gz
opie-e97a6da57804aa14907dec327fbae71bff9b383e.tar.bz2
import of tiny kate. (saving not possible yet)
Diffstat (limited to 'noncore/apps/tinykate/libkate/qt3back') (more/less context) (ignore whitespace changes)
-rw-r--r--noncore/apps/tinykate/libkate/qt3back/README12
-rw-r--r--noncore/apps/tinykate/libkate/qt3back/qregexp3.cpp3767
-rw-r--r--noncore/apps/tinykate/libkate/qt3back/qregexp3.h111
3 files changed, 3890 insertions, 0 deletions
diff --git a/noncore/apps/tinykate/libkate/qt3back/README b/noncore/apps/tinykate/libkate/qt3back/README
new file mode 100644
index 0000000..e329aee
--- a/dev/null
+++ b/noncore/apps/tinykate/libkate/qt3back/README
@@ -0,0 +1,12 @@
1This is a backport I got from Scott Manson, I just added an additional #ifndef QT_NO_COMPAT in the .cpp file around a match
2function
3
4*************************
5 This is ___NOT___ the original version from Trolltech, so don't blame them if something is wrong.
6*************************
7
8Use it at your own risk, neither Trolltech, Scott Manson nor I take any responsibility, if it damages your system or causes
9unexpected behaviour or causes loss of data
10
11Joseph Wenninger
12<jowenn@kde.org>
diff --git a/noncore/apps/tinykate/libkate/qt3back/qregexp3.cpp b/noncore/apps/tinykate/libkate/qt3back/qregexp3.cpp
new file mode 100644
index 0000000..a2c680f
--- a/dev/null
+++ b/noncore/apps/tinykate/libkate/qt3back/qregexp3.cpp
@@ -0,0 +1,3767 @@
1/****************************************************************************
2** $Id$
3**
4** Implementation of QRegExp class
5**
6** Created : 950126
7**
8** Copyright (C) 1992-2000 Trolltech AS. All rights reserved.
9**
10** This file is part of the tools module of the Qt GUI Toolkit.
11**
12** This file may be distributed under the terms of the Q Public License
13** as defined by Trolltech AS of Norway and appearing in the file
14** LICENSE.QPL included in the packaging of this file.
15**
16** This file may be distributed and/or modified under the terms of the
17** GNU General Public License version 2 as published by the Free Software
18** Foundation and appearing in the file LICENSE.GPL included in the
19** packaging of this file.
20**
21** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
22** licenses may use this file in accordance with the Qt Commercial License
23** Agreement provided with the Software.
24**
25** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
26** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
27**
28** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
29** information about Qt Commercial License Agreements.
30** See http://www.trolltech.com/qpl/ for QPL licensing information.
31** See http://www.trolltech.com/gpl/ for GPL licensing information.
32**
33** Contact info@trolltech.com if any conditions of this licensing are
34** not clear to you.
35**
36**********************************************************************/
37#if QT_VERSION >=300
38#error QRegExp3 is now in QT 3 use QRegExp instead
39#endif
40
41#include "qarray.h"
42#include "qbitarray.h"
43#include "qcache.h"
44#include "qintdict.h"
45#include "qmap.h"
46#if QT_VERSION < 300
47#include "./qregexp3.h"
48#else
49#include "qregexp.h"
50#endif
51#include "qstring.h"
52#include "qtl.h"
53#include "qvector.h"
54
55#include <limits.h>
56
57/*
58 WARNING! Be sure to read qregexp.tex before modifying this file.
59*/
60
61/*!
62 \class QRegExp3 qregexp.h
63
64 \brief The QRegExp class provides pattern matching using regular expressions.
65
66 \ingroup tools
67 \ingroup misc
68 \ingroup shared
69
70
71 Regular expressions, "regexps", provide a way to find patterns
72 within text. This is useful in many contexts, for example:
73
74 <ol>
75 <li>\e Validation. A regexp can be used to check whether a piece of
76 text meets some criteria, e.g. is an integer or contains no
77 whitespace.
78 <li>\e Searching. Regexps provide a much more powerful means of
79 searching text than simple string matching does. For example we can
80 create a regexp which says "find one of the words 'mail', 'letter'
81 or 'correspondence' but not any of the words 'email', 'mailman'
82 'mailer', 'letterbox' etc."
83 <li><em>Search and Replace.</em> A regexp can be used to replace a
84 pattern with a piece of text, for example replace all occurrences of
85 '&' with '\&amp;' except where the '&' is already followed by
86 'amp;'.
87 <li><em>String Splitting.</em> A regexp can be used to identify
88 where a string should be split into its component fields, e.g.
89 splitting tab delimited strings.
90 </ol>
91
92 We present a very brief introduction to regexps, a description of
93 Qt's regexp language, some code examples, and finally the function
94 documentation. QRegExp is modelled on Perl's regexp engine and fully
95 supports Unicode. QRegExp may also be used in the weaker 'wildcard'
96 (globbing) mode which works in a similar way to command shells. A
97 good text on regexps is <i>Mastering Regular Expressions: Powerful
98 Techniques for Perl and Other Tools</i> by Jeffrey E. Friedl, ISBN
99 1565922573.
100
101 Experienced regexp users may prefer to skip the introduction and
102 go directly to the relevant information:
103
104 <ul>
105 <li><a href="#characters-and-abbreviations-for-sets-of-characters">
106 Characters and Abbreviations for Sets of Characters</a>
107 <li><a href="#sets-of-characters">Sets of Characters</a>
108 <li><a href="#quantifiers">Quantifiers</a>
109 <li><a href="#capturing-text">Capturing Text</a>
110 <li><a href="#assertions">Assertions</a>
111 <li><a href="#wildcard-matching">Wildcard Matching (globbing)</a>
112 <li><a href="#perl-users">Notes for Perl Users</a>
113 <li><a href="#code-examples">Code Examples</a>
114 <li><a href="#member-function-documentation">Member Function Documentation</a>
115 </ul>
116
117 <b>Introduction</b>
118
119 Regexps are built up from expressions, quantifiers and assertions.
120 The simplest form of expression is simply a character, e.g. <b>x</b>
121 or <b>5</b>. An expression can also be a set of characters. For
122 example, <b>[ABCD]</b>, will match an <b>A</b> or a <b>B</b> or a
123 <b>C</b> or a <b>D</b>. As a shorthand we could write this as
124 <b>[A-D]</b>. If we want to match any of the captital letters in the
125 English alphabet we can write <b>[A-Z]</b>. A quantifier tells the
126 regexp engine how many occurrences of the expression we want, e.g.
127 <b>x{1,1}</b> means match an <b>x</b> which occurs at least once and
128 at most once. We'll look at assertions and more complex expressions
129 later. Note that regexps cannot be used to check for balanced
130 brackets or tags (unless you know the maximum level of nesting).
131
132
133 We'll start by writing a regexp to match integers in the range 0 to
134 99. We will require at least one digit so we will start with
135 <b>[0-9]{1,1}</b> which means match a digit exactly once. This
136 regexp alone will match integers in the range 0 to 9. To match one
137 or two digits we can increase the maximum number of occurrences so
138 the regexp becomes <b>[0-9]{1,2}</b> meaning match a digit at least
139 once and at most twice. However, this regexp as it stands will not
140 match correctly. This regexp will match one or two digits \e within
141 a string. To ensure that we match against the whole string we must
142 use the anchor assertions. We need <b>^</b> (caret) which when it is
143 the first character in the regexp means that the regexp must match
144 from the beginning of the string. And we also need <b>$</b> (dollar)
145 which when it is the last character in the regexp means that the
146 regexp must match until the end of the string. So now our regexp is
147 <b>^[0-9]{1,2}$</b>. Note that assertions do not match any
148 characters.
149
150 If you've seen regexps elsewhere they may have looked different from
151 the one above. This is because some sets of characters and some
152 quantifiers are so common that they have special symbols to
153 represent them. <b>[0-9]</b> can be replaced with the symbol
154 <b>\d</b>. The quantifier to match exactly one occurrence,
155 <b>{1,1}</b>, can be replaced with the expression itself. This means
156 that <b>x{1,1}</b> is exactly the same as <b>x</b> alone. So our 0
157 to 99 matcher could be written: <b>^\d{1,2}$</b>, although most
158 people would write it <b>^\d\d?$</b>. The <b>?</b> is the same as
159 the quantifier <b>{0,1}</b>, i.e. a minimum of no occurrences a
160 maximum of one occurrence. This is used to make an expression
161 optional. The regexp <b>^\d\d?$</b> means "from the beginning of the
162 string match one digit followed by zero or one digits and then the
163 end of the string".
164
165 Our second example is matching the words 'mail', 'letter' or
166 'correspondence' but without matching 'email', 'mailman', 'mailer',
167 'letterbox' etc. We'll start by just matching 'mail'. In full the
168 regexp is, <b>m{1,1}a{1,1}i{1,1}l{1,1}</b>, but since an expression
169 itself is automatically quantified by <b>{1,1}</b> we can simply
170 write this as <b>mail</b>; an 'm' followed by an 'a' followed by an
171 'i' followed by an 'l'. The symbol '|' (bar) is used for \e
172 alternation, so our regexp now becomes
173 <b>mail|letter|correspondence</b> which means match 'mail' \e or
174 'letter' \e or 'correspondence'. Whilst this regexp will find the
175 words we want it will also find words we don't want such as 'email'.
176 We will start by putting our regexp in parenthesis
177 <b>(mail|letter|correspondence)</b>. Parenthesis have two effects,
178 firstly they group expressions together and secondly they identify
179 parts of the regexp that we wish to <a href="#capturing-text">capture</a>.
180 Our regexp still matches any of the three words but now they are
181 grouped together as a unit. This is useful for building up more
182 complex regexps. It is also useful because it allows us to examine
183 which of the words actually matched. We need to use another
184 assertion, this time <b>\b</b> "word boundary":
185 <b>\b(mail|letter|correspondence)\b</b>. This regexp means "match a
186 word boundary followed by the expression in parenthesis followed by
187 another word boundary". The <b>\b</b> assertion matches at a \e
188 position in the regexp not a \e character in the regexp. A word
189 boundary is any non-word character such as a space a newline or the
190 beginning or end of the string.
191
192 For our third example we want to replace ampersands with the HTML
193 entity '\&amp;'. The regexp to match is simple: <b>\&</b>, i.e.
194 match one ampersand. Unfortunately this will mess up our text if
195 some of the ampersands have already been turned into HTML entities.
196 So what we really want to say is replace an ampersand providing it
197 is not followed by 'amp;'. For this we need the negative lookahead
198 assertion and our regexp becomes: <b>\&(?!amp;)</b>. The negative
199 lookahead assertion is introduced with '(?!' and finishes at the
200 ')'. It means that the text it contains, 'amp;' in our example, must
201 \e not follow the expression that preceeds it.
202
203 Regexps provide a rich language that can be used in a variety of
204 ways. For example suppose we want to count all the occurrences of
205 'Eric' and 'Eirik' in a string. Two valid regexps to match these are
206 <b>\\</b><b>b(Eric|Eirik)</b><b>\\</b><b>b</b> and
207 <b>\\</b><b>bEi?ri[ck]</b><b>\\</b><b>b</b>. We need the word boundary
208 '\b' so we don't get 'Ericsson' etc. The second regexp actually
209 matches more than we want, 'Eric', 'Erik', 'Eiric' and 'Eirik'.
210
211 We will implement some the examples above in the
212 <a href="#code-examples">code examples</a> section.
213
214 <a name="characters-and-abbreviations-for-sets-of-characters">
215 <b>Characters and Abbreviations for Sets of Characters</b></a>
216
217 <ul>
218
219 <li><b>c</b> Any character represents itself unless it has a special regexp
220 meaning. Thus <b>c</b> matches the character \e c.
221
222 <li><b>\\</b><b>c</b> A character that follows a backslash matches the
223 character itself except where mentioned below. For example if you
224 wished to match a literal caret at the beginning of a string you
225 would write <b>\^</b>.
226
227 <li><b>\\</b><b>a</b> This matches the ASCII bell character (BEL, 0x07).
228 <li><b>\\</b><b>f</b> This matches the ASCII form feed character (FF, 0x0C).
229 <li><b>\\</b><b>n</b> This matches the ASCII line feed character (LF, 0x0A), (Unix newline).
230 <li><b>\\</b><b>r</b> This matches the ASCII carriage return character (CR, 0x0D).
231 <li><b>\\</b><b>t</b> This matches the ASCII horizontal tab character (HT, 0x09).
232 <li><b>\\</b><b>v</b> This matches the ASCII vertical tab character (VT, 0x0B).
233 <li><b>\\</b><b>xhhhh</b> This matches the Unicode character corresponding
234 to the hexadecimal number hhhh (between 0x0000 and 0xFFFF). \0ooo
235 (i.e., \zero ooo) matches the ASCII/Latin-1 character corresponding
236 to the octal number ooo (between 0 and 0377).
237 <li><b>. (dot)</b> This matches any character (including newline).
238 <li><b>\\</b><b>d</b> This matches a digit (see QChar::isDigit()).
239 <li><b>\\</b><b>D</b> This matches a non-digit.
240 <li><b>\\</b><b>s</b> This matches a whitespace (see QChar::isSpace()).
241 <li><b>\\</b><b>S</b> This matches a non-whitespace.
242 <li><b>\\</b><b>w</b> This matches a word character (see QChar::isLetterOrNumber()).
243 <li><b>\\</b><b>W</b> This matches a non-word character.
244 <li><b>\\</b><b>n</b> The n<sup>th</sup>
245 <a href="#capturing-text">backreference</a>, e.g. \1, \2, etc.
246 </ul>
247
248 <em>Note that the C++ compiler transforms backslashes in strings so
249 to include a <b>\\</b> in a regexp you will need to enter it twice,
250 i.e. <b>\\</b><b>\\</b>.</em>
251
252 <a name="sets-of-characters"><b>Sets of Characters</b></a>
253
254 Square brackets are used to match any character in the set of
255 characters contained within the square brackets. All the character
256 set abbreviations described above can be used within square
257 brackets. Apart from the character set abbreviations and the
258 following two exceptions no characters have special meanings in
259 square brackets.
260
261 <ul>
262
263 <li><b>^</b> The caret negates the character set if it occurs as the
264 first character, i.e. immediately after the opening square bracket.
265 For example, <b>[abc]</b> matches 'a' or 'b' or 'c', but
266 <b>[^abc]</b> matches anything \e except 'a' or 'b' or 'c'.
267
268 <li><b>-</b> The dash is used to indicate a range of characters, for
269 example <b>[W-Z]</b> matches 'W' or 'X' or 'Y' or 'Z'.
270
271 </ul>
272
273 Using the predefined character set abbreviations is more portable
274 than using character ranges across platforms and languages. For
275 example, <b>[0-9]</b> matches a digit in Western alphabets but
276 <b>\d</b> matches a digit in \e any alphabet.
277
278 Note that in most regexp literature sets of characters are called
279 "character classes".
280
281 <a name="quantifiers"><b>Quantifiers</b></a>
282
283 By default an expression is automatically quantified by
284 <b>{1,1}</b>, i.e. it should occur exactly once. In the following
285 list <b><i>E</i></b> stands for any expression. An expression is a
286 character or an abbreviation for a set of characters or a set of
287 characters in square brackets or any parenthesised expression.
288
289 <ul>
290
291 <li><b><i>E</i>?</b> Matches zero or one occurrence of <i>E</i>.
292 This quantifier means "the previous expression is optional" since it
293 will match whether or not the expression occurs in the string.
294 It is the same as <b><i>E</i>{0,1}</b>. For example <b>dents?</b>
295 will match 'dent' and 'dents'.
296
297 <li><b><i>E</i>+</b> Matches one or more occurrences of <i>E</i>.
298 This is the same as <b><i>E</i>{1,MAXINT}</b>. For example,
299 <b>0+</b> will match '0', '00', '000', etc.
300
301 <li><b><i>E</i>*</b> Matches zero or more occurrences of <i>E</i>.
302 This is the same as <b><i>E</i>{0,MAXINT}</b>. The <b>*</b>
303 quantifier is often used by a mistake. Since it matches \e zero or
304 more occurrences it will match no occurrences at all. For example if
305 we want to match strings that end in whitespace and use the regexp
306 <b>\s*$</b> we would get a match on every string. This is because we
307 have said find zero or more whitespace followed by the end of string,
308 so even strings that don't end in whitespace will match. The regexp
309 we want in this case is <b>\s+$</b> to match strings that have at
310 least one whitespace at the end.
311
312 <li><b><i>E</i>{n}</b> Matches exactly \e n occurrences of the
313 expression. This is the same as repeating the expression \e n times.
314 For example, <b>x{5}</b> is the same as <b>xxxxx</b>. It is also the
315 same as <b><i>E</i>{n,n}</b>, e.g. <b>x{5,5}</b>.
316
317 <li><b><i>E</i>{n,}</b> Matches at least \e n occurrences of the
318 expression. This is the same as <b><i>E</i>{n,MAXINT}</b>.
319
320 <li><b><i>E</i>{,m}</b> Matches at most \e m occurrences of the
321 expression. This is the same as <b><i>E</i>{0,m}</b>.
322
323 <li><b><i>E</i>{n,m}</b> Matches at least \e n occurrences of the
324 expression and at most \e m occurrences of the expression.
325
326 </ul>
327
328 (MAXINT is implementation dependent but will not be smaller than
329 16384.)
330
331 If we wish to apply a quantifier to more than just the preceeding
332 character we can use parenthesis to group characters together in an
333 expression. For example, <b>tag+</b> matches a 't' followed by an
334 'a' followed by at least one 'g', whereas <b>(tag)+</b> matches at
335 least one occurrence of 'tag'.
336
337 Note that quantifiers are "greedy", they will match as much text as
338 they can. For example, <b>0+</b> will match as many zeros as it can
339 from the first zero it finds, e.g. '2.<u>000</u>5'. Quantifiers can
340 be made non-greedy, see setMinimal().
341
342 <a name="capturing-text"><b>Capturing Text</b></a>
343
344 Parenthesis allow us to group elements together so that we can
345 quantify and capture them. For example if we have the expression
346 <b>mail|letter|correspondence</b> that matches a string we know that
347 \e one of the words matched but not which one. Using parenthesis
348 allows us to "capture" whatever is matched within their bounds, so
349 if we used <b>(mail|letter|correspondence)</b> and matched this
350 regexp against the string "I sent you some email" we can use the
351 cap() or capturedTexts() functions to extract the matched
352 characters, in this case 'mail'.
353
354 We can use captured text within the regexp itself. To refer to the
355 captured text we use \e backreferences which are indexed from 1 the
356 same as for cap(). For example we could search for duplicate words
357 in a string using <b>\b(\w+)\W+\1\b</b> which means match a word
358 boundary followed by one or more word characters followed by one or
359 more non-word characters followed by the same text as the first
360 parenthesised expression followed by a word boundary.
361
362 If we want to use parenthesis purely for grouping and not for
363 capturing we use the non-capturing syntax, e.g.
364 <b>(?:green|blue)</b>. Non-capturing parenthesis begin '(?:' and end
365 ')'. In this example we match either 'green' or 'blue' but we do not
366 capture the match so we can only know whether or not we matched but
367 not which color we actually found. Using non-capturing parenthesis
368 is more efficient than using capturing parenthesis since the regexp
369 engine has to do less book-keeping.
370
371 Both capturing and non-capturing parenthesis may be nested.
372
373 <a name="assertions"><b>Assertions</b></a>
374
375 Assertions make some statement about the text at the point where
376 they occur in the regexp but they do not match any characters.
377 In the following list <b><i>E</i></b> stands for any expression.
378
379 <ul>
380 <li><b>^</b> If the caret is the first character in the regexp
381 (apart from opening parenthesis) it signifies the beginning of the
382 string. It has no special meaning elsewhere (except as the first
383 character of a set of characters in square brackets). For example,
384 <b>^#include</b> will only match strings which \e begin with the
385 characters '#include'.
386
387 <li><b>$</b> If the dollar is the last character in the regexp
388 (apart from closing parenthesis) it signifies the end of the string.
389 It has no special meaning elsewhere. For example, <b>\d\s*$</b>
390 will match strings which end with a digit optionally followed by
391 whitespace.
392
393 <li><b>\\</b><b>b</b> A word boundary. For example the regexp
394 <b>\\</b><b>bOK</b>\\</b><b>b</b> means match immediately after a
395 word boundary (e.g. start of string or whitespace) the letter 'O'
396 then the letter 'K' immediately before another word boundary (e.g.
397 end of string or whitespace). But note that the assertion does not
398 actually match any whitespace so if we write
399 <b>(</b><b>\\</b><b>bOK</b>\\</b><b>b)</b> and we have a match it
400 will only contain 'OK' even if the string is "Its <u>OK</u> now".
401
402 <li><b>\\</b><b>B</b> A non-word boundary. This assertion is true
403 wherever <b>\\</b><b>b</b> is false. For example if we searched for
404 <b>\\</b><b>Bon</b>\\</b><b>B</b> in "Left on" the match would fail
405 (space and end of string aren't non-word boundaries), but it would
406 match in "t<u>on</u>ne".
407
408 <li><b>(?=<i>E</i>)</b> Positive lookahead. This assertion is true
409 if the expression matches at this point in the regex. This assertion
410 does not match any characters. For example,
411 <b>^#define\s+(\w+)(?=MAX)</b> will match strings which begin with
412 '#define' followed by at least one whitespace followed by at least
413 one word character followed by 'MAX'. The first set of parenthesis
414 will capture the word character(s) matched. This regexp will not
415 match '#define DEBUG' but will match '#define <u>INT</u>MAX
416 32767'.
417
418 <li><b>(?!<i>E</i>)</b> Negative lookahead. This assertion is true
419 if the expression does not match at this point in the regex. This
420 assertion does not match any characters. For example,
421 <b>^#define\s+(\w+)\s*$</b> will match strings which begin with
422 '#define' followed by at least one whitespace followed by at least
423 one word character optionally followed by whitespace. This regexp
424 will match define's that exist but have no value, i.e. it will not
425 match '#define INTMAX 32767' but it will match '#define <u>DEBUG</u>'.
426
427 </ul>
428
429 <a name="wildcard-matching"><b>Wildcard Matching (globbing)</b></a>
430
431 Most command shells such as \e bash or \e cmd support "file
432 globbing", the ability to identify a group of files by using
433 wildcards. Wildcard matching is much simpler than full regexps and
434 has only four features:
435
436 <ul>
437
438 <li><b>c</b> Any character represents itself apart from those
439 mentioned below. Thus <b>c</b> matches the character \e c.
440
441 <li><b>?</b> This matches any single character. It is the same as
442 <b>.</b> in full regexps.
443
444 <li><b>*</b> This matches zero or more of any characters. It is the
445 same as <b>.*</b> in full regexps.
446
447 <li><b>[...]</b> Sets of characters can be represented in square
448 brackets the same as for full regexps.
449
450 <!-- JASMIN: Are the character classes, \w, etc supported in
451 wildcards? -->
452
453 </ul>
454
455 For example if we are in wildcard mode and have strings which
456 contain filenames we could identify HTML files with <b>*.html</b>.
457 This will match zero or more characters followed by a dot followed
458 by 'h', 't', 'm' and 'l'.
459
460 <a name="perl-users"><b>Notes for Perl Users</b></a>
461
462 Most of the character class abbreviations supported by Perl are
463 supported by QRegExp, see
464 <a href="#characters-and-abbreviations-for-sets-of-characters">
465 characters and abbreviations for sets of characters</a>.
466
467 QRegExp's quantifiers are the same as Perl's greedy quantifiers.
468 Non-greedy matching cannot be applied to individual quantifiers, but
469 can be applied to all the quantifiers in the pattern. For example,
470 to match the Perl regex <b>ro+?m</b> requires:
471 \code
472 QRegExp rx( "ro+m" );
473 rx.setMinimal( TRUE );
474 \endcode
475
476 The equivalent of Perl's <tt>/i</tt> option is
477 setCaseSensitive(FALSE).
478
479 Perl's <tt>/g</tt> option can be emulated using a
480 <a href="#cap_in_a_loop">loop</a>.
481
482 In QRegExp <b>.</b> matches any character, therefore all QRegExp
483 regexps have the equivalent of Perl's <tt>/s</tt> option. QRegExp
484 does not have an equivalent to Perl's <tt>/m</tt> option, but this
485 can be emulated in various ways for example by splitting the input
486 into lines or by looping with a regexp that searches for newlines.
487
488 Because QRegExp is string oriented there are no \A, \Z or \z
489 assertions. The \G assertion is not supported but can be emulated in
490 a loop.
491
492 Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
493 equivalents for $`, $' or $+. $1, $2 etc correspond to
494 cap(1) or capturedTexts()[1], cap(2) or capturedTexts()[2], etc.
495
496 To substitute a pattern use QString::replace().
497
498 Perl's extended <tt>/x</tt> syntax is not supported, nor are regexp
499 comments (?#comment) or directives, e.g. (?i).
500
501 Both zero-width positive and zero-width negative lookahead
502 assertions (?=pattern) and (?!pattern) are supported with the same
503 syntax as Perl. Perl's lookbehind assertions, "independent"
504 subexpressions and conditional expressions are not supported.
505
506 Non-capturing parenthesis are also supported, with the same
507 (?:pattern) syntax.
508
509 See QStringList::split() and QStringList::join() for equivalents to
510 Perl's split and join functions.
511
512 Note: because C++ transforms \\'s they must be written \e twice in
513 code, e.g. <b>\\</b><b>b</b> must be written <b>\\</b><b>\\</b><b>b</b>.
514
515 <a name="code-examples"><b>Code Examples</b></a>
516
517 \code
518 QRegExp rx( "^\\d\\d?$" );// Match integers 0 to 99
519 rx.search( "123" ); // Returns -1 (no match)
520 rx.search( "-6" ); // Returns -1 (no match)
521 rx.search( "6" ); // Returns 0 (matched as position 0)
522 \endcode
523
524 The third string matches '<u>6</u>'. This is a simple validation
525 regexp for integers in the range 0 to 99.
526
527 \code
528 QRegExp rx( "^\\S+$" );// Match strings which have no whitespace
529 rx.search( "Hello world" );// Returns -1 (no match)
530 rx.search( "This_is-OK" );// Returns 0 (matched at position 0)
531 \endcode
532
533 The second string matches '<u>This_is-OK</u>'. We've used the
534 character set abbreviation '\S' (non-whitespace) and the anchors to
535 match strings which contain no whitespace.
536
537 In the following example we match strings containing 'mail' or
538 'letter' or 'correspondence' but only match whole words i.e. not
539 'email'
540
541 \code
542 QRegExp rx( "\\b(mail|letter|correspondence)\\b" );
543 rx.search( "I sent you an email" ); // Returns -1 (no match)
544 rx.search( "Please write the letter" ); // Returns 17 (matched at position 17)
545 \endcode
546
547 The second string matches "Please write the <u>letter</u>". The word
548 'letter' is also captured (because of the parenthesis). We can see
549 what text we've captured like this:
550
551 \code
552 QString captured = rx.cap( 1 ); // captured contains "letter"
553 \endcode
554
555 This will capture the text from the first set of capturing
556 parenthesis (counting capturing left parenthesis from left to
557 right). The parenthesis are counted from 1 since cap( 0 ) is the
558 whole matched regexp (equivalent to '&' in most regexp engines).
559
560 \code
561 QRegExp rx( "&(?!amp;)" ); // Match ampersands but not &amp;
562 QString line1 = "This & that";
563 line1.replace( rx, "&amp;" );
564 // line1 == "This &amp; that"
565 QString line2 = "His &amp; hers & theirs";
566 line2.replace( rx, "&amp;" );
567 // line2 == "His &amp; hers &amp; theirs"
568 \endcode
569
570 Here we've passed the QRegExp to QString's replace() function to
571 replace the matched text with new text.
572
573 \code
574 QString str = "One Eric another Eirik, and an Ericsson. How many Eiriks, Eric?";
575 QRegExp rx( "\\b(Eric|Eirik)\\b" );// Match Eric or Eirik
576 int pos = 0; // Where we are in the string
577 int count = 0; // How many Eric and Eirik's we've counted
578 while ( pos >= 0 ) {
579 pos = rx.search( str, pos );
580 if ( pos >= 0 ) {
581 pos++;// Move along in str
582 count++;// Count our Eric or Eirik
583 }
584 }
585 \endcode
586
587 We've used the search() function to repeatedly match the regexp in
588 the string. Note that instead of moving forward by one character at
589 a time <tt>pos++</tt> we could have written <tt>pos +=
590 rx.matchedLength()</tt> to skip over the already matched string. The
591 count will equal 3, matching 'One <u>Eric</u> another <u>Eirik</u>,
592 and an Ericsson. How many Eiriks, <u>Eric</u>?'; it doesn't match
593 'Ericsson' or 'Eiriks' because they are not bounded by non-word
594 boundaries.
595
596 One common use of regexps is to split lines of delimited data into
597 their component fields.
598
599 \code
600 str = "Trolltech AS\twww.trolltech.com\tNorway";
601 QString company, web, country;
602 rx.setPattern( "^([^\t]+)\t([^\t]+)\t([^\t]+)$" );
603 if ( rx.search( str ) != -1 ) {
604 company = rx.cap( 1 );
605 web= rx.cap( 2 );
606 country = rx.cap( 3 );
607 }
608 \endcode
609
610 In this example our input lines have the format company name, web
611 address and country. Unfortunately the regexp is rather long and not
612 very versatile -- the code will break if we add any more fields. A
613 simpler and better solution is to look for the separator, '\t' in
614 this case, and take the surrounding text. The QStringList split()
615 function can take a separator string or regexp as an argument and
616 split a string accordingly.
617
618 \code
619 QStringList field = QStringList::split( "\t", str );
620 \endcode
621
622 Here field[0] is the company, field[1] the web address and so on.
623
624 To immitate the matching of a shell we can use wildcard mode.
625
626 \code
627 QRegExp rx( "*.html" );// Invalid regexp: * doesn't quantify anything
628 rx.setWildcard( TRUE );// Now its a valid wildcard regexp
629 rx.search( "index.html" );// Returns 0 (matched at position 0)
630 rx.search( "default.htm" );// Returns -1 (no match)
631 rx.search( "readme.txt" );// Returns -1 (no match)
632 \endcode
633
634 Wildcard matching can be convenient because of its simplicity, but
635 any wildcard regex can be defined using full regexps, e.g.
636 <b>.*\.html$</b>. Notice that we can't match both \c .html and \c
637 .htm files with a wildcard unless we use <b>*.htm*</b> which will
638 also match 'test.html.bak'. A full regexp gives us the precision we
639 need, <b>.*\.html?$</b>.
640
641 QRegExp can match case insensitively using setCaseSensitive(), and
642 can use non-greedy matching, see setMinimal(). By default QRegExp
643 uses full regexps but this can be changed with setWildcard().
644 Searching can be forward with search() or backward with searchRev().
645 Captured text can be accessed using capturedTexts() which returns a
646 string list of all captured strings, or using cap() which returns
647 the captured string for the given index. The pos() function takes a
648 match index and returns the position in the string where the match
649 was made (or -1 if there was no match).
650
651 \sa QRegExpValidator QString QStringList
652
653 <a name="member-function-documentation"/>
654*/
655
656static const int NumBadChars = 128;
657#define BadChar( ch ) ( (ch).cell() % NumBadChars )
658
659static const int NoOccurrence = INT_MAX;
660static const int EmptyCapture = INT_MAX;
661static const int InftyLen = INT_MAX;
662static const int InftyRep = 1000;
663static const int EOS = -1;
664
665#ifndef QT_NO_REGEXP_OPTIM
666static int engCount = 0;
667static QArray<int> *noOccurrences = 0;
668static QArray<int> *firstOccurrenceAtZero = 0;
669#endif
670
671/*
672 Merges two QArrays of ints and puts the result into the first one.
673*/
674static void mergeInto( QArray<int> *a, const QArray<int>& b )
675{
676 int asize = a->size();
677 int bsize = b.size();
678 if ( asize == 0 ) {
679 *a = b.copy();
680#ifndef QT_NO_REGEXP_OPTIM
681 } else if ( bsize == 1 && (*a)[asize - 1] < b[0] ) {
682 a->resize( asize + 1 );
683 (*a)[asize] = b[0];
684#endif
685 } else if ( bsize >= 1 ) {
686 int csize = asize + bsize;
687 QArray<int> c( csize );
688 int i = 0, j = 0, k = 0;
689 while ( i < asize ) {
690 if ( j < bsize ) {
691 if ( (*a)[i] == b[j] ) {
692 i++;
693 csize--;
694 } else if ( (*a)[i] < b[j] ) {
695 c[k++] = (*a)[i++];
696 } else {
697 c[k++] = b[j++];
698 }
699 } else {
700 memcpy( c.data() + k, (*a).data() + i,
701 (asize - i) * sizeof(int) );
702 break;
703 }
704 }
705 c.resize( csize );
706 if ( j < bsize )
707 memcpy( c.data() + k, b.data() + j, (bsize - j) * sizeof(int) );
708 *a = c;
709 }
710}
711
712/*
713 Merges two disjoint QMaps of (int, int) pairs and puts the result into the
714 first one.
715*/
716static void mergeInto( QMap<int, int> *a, const QMap<int, int>& b )
717{
718 QMap<int, int>::ConstIterator it;
719 for ( it = b.begin(); it != b.end(); ++it )
720 a->insert( it.key(), *it );
721}
722
723/*
724 Returns the value associated to key k in QMap m of (int, int) pairs, or 0 if
725 no such value is explicitly present.
726*/
727static int at( const QMap<int, int>& m, int k )
728{
729 QMap<int, int>::ConstIterator it = m.find( k );
730 if ( it == m.end() )
731 return 0;
732 else
733 return *it;
734}
735
736#ifndef QT_NO_REGEXP_WILDCARD
737/*
738 Translates a wildcard pattern to an equivalent regular expression pattern
739 (e.g., *.cpp to .*\.cpp).
740*/
741static QString wc2rx( const QString& wc )
742{
743 int wclen = wc.length();
744 QString rx = QString::fromLatin1( "" );
745 int i = 0;
746 while ( i < wclen ) {
747 QChar c = wc[i++];
748 switch ( c.unicode() ) {
749 case '*':
750 rx += QString::fromLatin1( ".*" );
751 break;
752 case '?':
753 rx += QChar( '.' );
754 break;
755 case '$':
756 case '(':
757 case ')':
758 case '+':
759 case '.':
760 case '\\':
761 case '^':
762 case '{':
763 case '|':
764 case '}':
765 rx += QChar( '\\' );
766 rx += c;
767 break;
768 case '[':
769 rx += c;
770 if ( wc[i] == QChar('^') )
771 rx += wc[i++];
772 if ( i < wclen ) {
773 if ( rx[i] == ']' )
774 rx += wc[i++];
775 while ( i < wclen && wc[i] != QChar(']') ) {
776 if ( wc[i] == '\\' )
777 rx += QChar( '\\' );
778 rx += wc[i++];
779 }
780 }
781 break;
782 default:
783 rx += c;
784 }
785 }
786 return rx;
787}
788#endif
789
790/*
791 The class QRegExpEngine encapsulates a modified nondeterministic finite
792 automaton (NFA).
793*/
794class QRegExpEngine : public QShared
795{
796public:
797#ifndef QT_NO_REGEXP_CCLASS
798 /*
799 The class CharClass represents a set of characters, such as can be found
800 in regular expressions (e.g., [a-z] denotes the set {a, b, ..., z}).
801 */
802 class CharClass
803 {
804 public:
805 CharClass();
806 CharClass( const CharClass& cc ) { operator=( cc ); }
807
808 CharClass& operator=( const CharClass& cc );
809
810 void clear();
811 bool negative() const { return n; }
812 void setNegative( bool negative );
813 void addCategories( int cats );
814 void addRange( ushort from, ushort to );
815 void addSingleton( ushort ch ) { addRange( ch, ch ); }
816
817 bool in( QChar ch ) const;
818#ifndef QT_NO_REGEXP_OPTIM
819 const QArray<int>& firstOccurrence() const { return occ1; }
820#endif
821
822#if defined(QT_DEBUG)
823 void dump() const;
824#endif
825
826 private:
827 /*
828 The struct Range represents a range of characters (e.g., [0-9] denotes
829 range 48 to 57).
830 */
831 struct Range
832 {
833 ushort from; // 48
834 ushort to; // 57
835 };
836
837 int c; // character classes
838 QArray<Range> r; // character ranges
839 bool n; // negative?
840#ifndef QT_NO_REGEXP_OPTIM
841 QArray<int> occ1; // first-occurrence array
842#endif
843 };
844#else
845 struct CharClass
846 {
847 int x; // dummy
848
849#ifndef QT_NO_REGEXP_OPTIM
850 const QArray<int>& firstOccurrence() const {
851 return *firstOccurrenceAtZero;
852 }
853#endif
854 };
855#endif
856
857 QRegExpEngine( bool caseSensitive ) { setup( caseSensitive ); }
858 QRegExpEngine( const QString& rx, bool caseSensitive );
859#ifndef QT_NO_REGEXP_OPTIM
860 ~QRegExpEngine();
861#endif
862
863 bool isValid() const { return valid; }
864 bool caseSensitive() const { return cs; }
865 int numCaptures() const { return realncap; }
866 QArray<int> match( const QString& str, int pos, bool minimal,
867 bool oneTest );
868 int matchedLength() const { return mmMatchedLen; }
869
870 int createState( QChar ch );
871 int createState( const CharClass& cc );
872#ifndef QT_NO_REGEXP_BACKREF
873 int createState( int bref );
874#endif
875
876 void addCatTransitions( const QArray<int>& from, const QArray<int>& to );
877#ifndef QT_NO_REGEXP_CAPTURE
878 void addPlusTransitions( const QArray<int>& from, const QArray<int>& to,
879 int atom );
880#endif
881
882#ifndef QT_NO_REGEXP_ANCHOR_ALT
883 int anchorAlternation( int a, int b );
884 int anchorConcatenation( int a, int b );
885#else
886 int anchorAlternation( int a, int b ) { return a & b; }
887 int anchorConcatenation( int a, int b ) { return a | b; }
888#endif
889 void addAnchors( int from, int to, int a );
890
891#ifndef QT_NO_REGEXP_OPTIM
892 void setupGoodStringHeuristic( int earlyStart, int lateStart,
893 const QString& str );
894 void setupBadCharHeuristic( int minLen, const QArray<int>& firstOcc );
895 void heuristicallyChooseHeuristic();
896#endif
897
898#if defined(QT_DEBUG)
899 void dump() const;
900#endif
901
902private:
903 enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
904
905 /*
906 The struct State represents one state in a modified NFA. The input
907 characters matched are stored in the state instead of on the transitions,
908 something possible for an automaton constructed from a regular expression.
909 */
910 struct State
911 {
912#ifndef QT_NO_REGEXP_CAPTURE
913 int atom; // which atom does this state belong to?
914#endif
915 int match; // what does it match? (see CharClassBit and BackRefBit)
916 QArray<int> outs; // out-transitions
917 QMap<int, int> *reenter; // atoms reentered when transiting out
918 QMap<int, int> *anchors; // anchors met when transiting out
919
920#ifndef QT_NO_REGEXP_CAPTURE
921 State( int a, int m )
922 : atom( a ), match( m ), reenter( 0 ), anchors( 0 ) { }
923#else
924 State( int m )
925 : match( m ), reenter( 0 ), anchors( 0 ) { }
926#endif
927 ~State() { delete reenter; delete anchors; }
928 };
929
930#ifndef QT_NO_REGEXP_LOOKAHEAD
931 /*
932 The struct Lookahead represents a lookahead a la Perl (e.g., (?=foo) and
933 (?!bar)).
934 */
935 struct Lookahead
936 {
937 QRegExpEngine *eng; // NFA representing the embedded regular expression
938 bool neg; // negative lookahead?
939
940 Lookahead( QRegExpEngine *eng0, bool neg0 )
941 : eng( eng0 ), neg( neg0 ) { }
942 ~Lookahead() { delete eng; }
943 };
944#endif
945
946#ifndef QT_NO_REGEXP_CAPTURE
947 /*
948 The struct Atom represents one node in the hierarchy of regular expression
949 atoms.
950 */
951 struct Atom
952 {
953 int parent; // index of parent in array of atoms
954 int capture; // index of capture, from 1 to ncap
955 };
956#endif
957
958#ifndef QT_NO_REGEXP_ANCHOR_ALT
959 /*
960 The struct AnchorAlternation represents a pair of anchors with OR
961 semantics.
962 */
963 struct AnchorAlternation
964 {
965 int a; // this anchor...
966 int b; // ...or this one
967 };
968#endif
969
970 enum { InitialState = 0, FinalState = 1 };
971 void setup( bool caseSensitive );
972 int setupState( int match );
973
974 /*
975 Let's hope that 13 lookaheads and 14 back-references are enough.
976 */
977 enum { MaxLookaheads = 13, MaxBackRefs = 14 };
978 enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002,
979 Anchor_Word = 0x00000004, Anchor_NonWord = 0x00000008,
980 Anchor_FirstLookahead = 0x00000010,
981 Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
982 Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
983 Anchor_Alternation = Anchor_BackRef1Empty << MaxBackRefs,
984
985 Anchor_LookaheadMask = ( Anchor_FirstLookahead - 1 ) ^
986 ( (Anchor_FirstLookahead << MaxLookaheads) - 1 ) };
987#ifndef QT_NO_REGEXP_CAPTURE
988 int startAtom( bool capture );
989 void finishAtom( int atom ) { cf = f[atom].parent; }
990#endif
991
992#ifndef QT_NO_REGEXP_LOOKAHEAD
993 int addLookahead( QRegExpEngine *eng, bool negative );
994#endif
995
996#ifndef QT_NO_REGEXP_CAPTURE
997 bool isBetterCapture( const int *begin1, const int *end1, const int *begin2,
998 const int *end2 );
999#endif
1000 bool testAnchor( int i, int a, const int *capBegin );
1001
1002#ifndef QT_NO_REGEXP_OPTIM
1003 bool goodStringMatch();
1004 bool badCharMatch();
1005#else
1006 bool bruteMatch();
1007#endif
1008 bool matchHere();
1009
1010 QVector<State> s; // array of states
1011 int ns; // number of states
1012#ifndef QT_NO_REGEXP_CAPTURE
1013 QArray<Atom> f; // atom hierarchy
1014 int nf; // number of atoms
1015 int cf; // current atom
1016#endif
1017 int realncap; // number of captures, seen from the outside
1018 int ncap; // number of captures, seen from the inside
1019#ifndef QT_NO_REGEXP_CCLASS
1020 QVector<CharClass> cl; // array of character classes
1021#endif
1022#ifndef QT_NO_REGEXP_LOOKAHEAD
1023 QVector<Lookahead> ahead; // array of lookaheads
1024#endif
1025#ifndef QT_NO_REGEXP_ANCHOR_ALT
1026 QArray<AnchorAlternation> aa; // array of (a, b) pairs of anchors
1027#endif
1028#ifndef QT_NO_REGEXP_OPTIM
1029 bool caretAnchored; // does the regexp start with ^?
1030#endif
1031 bool valid; // is the regular expression valid?
1032 bool cs; // case sensitive?
1033#ifndef QT_NO_REGEXP_BACKREF
1034 int nbrefs; // number of back-references
1035#endif
1036
1037#ifndef QT_NO_REGEXP_OPTIM
1038 bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1039
1040 int goodEarlyStart; // the index where goodStr can first occur in a match
1041 int goodLateStart; // the index where goodStr can last occur in a match
1042 QString goodStr; // the string that any match has to contain
1043
1044 int minl; // the minimum length of a match
1045 QArray<int> occ1; // first-occurrence array
1046#endif
1047
1048 /*
1049 The class Box is an abstraction for a regular expression fragment. It can
1050 also be seen as one node in the syntax tree of a regular expression with
1051 synthetized attributes.
1052
1053 It's interface is ugly for performance reasons.
1054 */
1055 class Box
1056 {
1057 public:
1058 Box( QRegExpEngine *engine );
1059 Box( const Box& b ) { operator=( b ); }
1060
1061 Box& operator=( const Box& b );
1062
1063 void clear() { operator=(Box(eng)); }
1064 void set( QChar ch );
1065 void set( const CharClass& cc );
1066#ifndef QT_NO_REGEXP_BACKREF
1067 void set( int bref );
1068#endif
1069
1070 void cat( const Box& b );
1071 void orx( const Box& b );
1072 void plus( int atom );
1073 void opt();
1074 void catAnchor( int a );
1075#ifndef QT_NO_REGEXP_OPTIM
1076 void setupHeuristics();
1077#endif
1078
1079#if defined(QT_DEBUG)
1080 void dump() const;
1081#endif
1082
1083 private:
1084 void addAnchorsToEngine( const Box& to ) const;
1085
1086 QRegExpEngine *eng; // the automaton under construction
1087 QArray<int> ls; // the left states (firstpos)
1088 QArray<int> rs; // the right states (lastpos)
1089 QMap<int, int> lanchors; // the left anchors
1090 QMap<int, int> ranchors; // the right anchors
1091 int skipanchors; // the anchors to match if the box is skipped
1092
1093#ifndef QT_NO_REGEXP_OPTIM
1094 int earlyStart; // the index where str can first occur
1095 int lateStart; // the index where str can last occur
1096 QString str; // a string that has to occur in any match
1097 QString leftStr; // a string occurring at the left of this box
1098 QString rightStr; // a string occurring at the right of this box
1099 int maxl; // the maximum length of this box (possibly InftyLen)
1100#endif
1101
1102 int minl; // the minimum length of this box
1103#ifndef QT_NO_REGEXP_OPTIM
1104 QArray<int> occ1; // first-occurrence array
1105#endif
1106 };
1107 friend class Box;
1108
1109 /*
1110 This is the lexical analyzer for regular expressions.
1111 */
1112 enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen,
1113 Tok_PosLookahead, Tok_NegLookahead, Tok_RightParen, Tok_CharClass,
1114 Tok_Caret, Tok_Quantifier, Tok_Bar, Tok_Word, Tok_NonWord,
1115 Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1116 int getChar();
1117 int getEscape();
1118#ifndef QT_NO_REGEXP_INTERVAL
1119 int getRep( int def );
1120#endif
1121#ifndef QT_NO_REGEXP_LOOKAHEAD
1122 void skipChars( int n );
1123#endif
1124 void startTokenizer( const QChar *rx, int len );
1125 int getToken();
1126
1127 const QChar *yyIn; // a pointer to the input regular expression pattern
1128 int yyPos0; // the position of yyTok in the input pattern
1129 int yyPos; // the position of the next character to read
1130 int yyLen; // the length of yyIn
1131 int yyCh; // the last character read
1132 CharClass *yyCharClass; // attribute for Tok_CharClass tokens
1133 int yyMinRep; // attribute for Tok_Quantifier
1134 int yyMaxRep; // ditto
1135 bool yyError; // syntax error or overflow during parsing?
1136
1137 /*
1138 This is the syntactic analyzer for regular expressions.
1139 */
1140 int parse( const QChar *rx, int len );
1141 void parseAtom( Box *box );
1142 void parseFactor( Box *box );
1143 void parseTerm( Box *box );
1144 void parseExpression( Box *box );
1145
1146 int yyTok; // the last token read
1147 bool yyMayCapture; // set this to FALSE to disable capturing
1148
1149 /*
1150 This is the engine state during matching.
1151 */
1152 const QString *mmStr; // a pointer to the input QString
1153 const QChar *mmIn; // a pointer to the input string data
1154 int mmPos; // the current position in the string
1155 int mmLen; // the length of the input string
1156 bool mmMinimal; // minimal matching?
1157 QArray<int> mmCaptured; // an array of pairs (start, len)
1158 QArray<int> mmCapturedNoMatch; // an array of pairs (-1, -1)
1159 QArray<int> mmBigArray; // big QArray<int> array
1160 int *mmInNextStack; // is state is mmNextStack?
1161 int *mmCurStack; // stack of current states
1162 int *mmNextStack; // stack of next states
1163 int *mmCurCapBegin; // start of current states' captures
1164 int *mmNextCapBegin; // start of next states' captures
1165 int *mmCurCapEnd; // end of current states' captures
1166 int *mmNextCapEnd; // end of next states' captures
1167 int *mmTempCapBegin; // start of temporary captures
1168 int *mmTempCapEnd; // end of temporary captures
1169 int *mmCapBegin; // start of captures for a next state
1170 int *mmCapEnd; // end of captures for a next state
1171 int *mmSlideTab; // bump-along slide table for bad-character heuristic
1172 int mmSlideTabSize; // size of slide table
1173#ifndef QT_NO_REGEXP_BACKREF
1174 QIntDict<int> mmSleeping; // dictionary of back-reference sleepers
1175#endif
1176 int mmMatchedLen; // length of match or of matched string for partial match
1177};
1178
1179QRegExpEngine::QRegExpEngine( const QString& rx, bool caseSensitive )
1180#ifndef QT_NO_REGEXP_BACKREF
1181 : mmSleeping( 101 )
1182#endif
1183{
1184 setup( caseSensitive );
1185 valid = ( parse(rx.unicode(), rx.length()) == (int) rx.length() );
1186}
1187
1188#ifndef QT_NO_REGEXP_OPTIM
1189QRegExpEngine::~QRegExpEngine()
1190{
1191 if ( --engCount == 0 ) {
1192 delete noOccurrences;
1193 noOccurrences = 0;
1194 delete firstOccurrenceAtZero;
1195 firstOccurrenceAtZero = 0;
1196 }
1197}
1198#endif
1199
1200/*
1201 Tries to match in str and returns an array of (begin, length) pairs for
1202 captured text. If there is no match, all pairs are (-1, -1).
1203*/
1204QArray<int> QRegExpEngine::match( const QString& str, int pos, bool minimal,
1205 bool oneTest )
1206{
1207 mmStr = &str;
1208 mmIn = str.unicode();
1209 if ( mmIn == 0 )
1210 mmIn = &QChar::null;
1211 mmPos = pos;
1212 mmLen = str.length();
1213 mmMinimal = minimal;
1214 mmMatchedLen = 0;
1215
1216 bool matched = FALSE;
1217 if ( valid && mmPos >= 0 && mmPos <= mmLen ) {
1218#ifndef QT_NO_REGEXP_OPTIM
1219 if ( mmPos <= mmLen - minl ) {
1220 if ( caretAnchored || oneTest )
1221 matched = matchHere();
1222 else if ( useGoodStringHeuristic )
1223 matched = goodStringMatch();
1224 else
1225 matched = badCharMatch();
1226 }
1227#else
1228 matched = oneTest ? matchHere() : bruteMatch();
1229#endif
1230 }
1231
1232 if ( matched ) {
1233 mmCaptured.detach();
1234 mmCaptured[0] = mmPos;
1235 mmCaptured[1] = mmMatchedLen;
1236 for ( int j = 0; j < realncap; j++ ) {
1237 int len = mmCapEnd[j] - mmCapBegin[j];
1238 mmCaptured[2 + 2 * j] = len > 0 ? mmPos + mmCapBegin[j] : 0;
1239 mmCaptured[2 + 2 * j + 1] = len;
1240 }
1241 return mmCaptured;
1242 } else {
1243 return mmCapturedNoMatch;
1244 }
1245}
1246
1247/*
1248 The three following functions add one state to the automaton and return the
1249 number of the state.
1250*/
1251
1252int QRegExpEngine::createState( QChar ch )
1253{
1254 return setupState( ch.unicode() );
1255}
1256
1257int QRegExpEngine::createState( const CharClass& cc )
1258{
1259#ifndef QT_NO_REGEXP_CCLASS
1260 int n = cl.size();
1261 cl.resize( n + 1 );
1262 cl.insert( n, new CharClass(cc) );
1263 return setupState( CharClassBit | n );
1264#else
1265 Q_UNUSED( cc );
1266 return setupState( CharClassBit );
1267#endif
1268}
1269
1270#ifndef QT_NO_REGEXP_BACKREF
1271int QRegExpEngine::createState( int bref )
1272{
1273 if ( bref > nbrefs ) {
1274 nbrefs = bref;
1275 if ( nbrefs > MaxBackRefs ) {
1276 yyError = TRUE;
1277 return 0;
1278 }
1279 }
1280 return setupState( BackRefBit | bref );
1281}
1282#endif
1283
1284/*
1285 The two following functions add a transition between all pairs of states
1286 (i, j) where i is fond in from, and j is found in to.
1287
1288 Cat-transitions are distinguished from plus-transitions for capturing.
1289*/
1290
1291void QRegExpEngine::addCatTransitions( const QArray<int>& from,
1292 const QArray<int>& to )
1293{
1294 for ( int i = 0; i < (int) from.size(); i++ ) {
1295 State *st = s[from[i]];
1296 mergeInto( &st->outs, to );
1297 }
1298}
1299
1300#ifndef QT_NO_REGEXP_CAPTURE
1301void QRegExpEngine::addPlusTransitions( const QArray<int>& from,
1302 const QArray<int>& to, int atom )
1303{
1304 for ( int i = 0; i < (int) from.size(); i++ ) {
1305 State *st = s[from[i]];
1306 QArray<int> oldOuts = st->outs.copy();
1307 mergeInto( &st->outs, to );
1308 if ( f[atom].capture >= 0 ) {
1309 if ( st->reenter == 0 )
1310 st->reenter = new QMap<int, int>;
1311 for ( int j = 0; j < (int) to.size(); j++ ) {
1312 if ( !st->reenter->contains(to[j]) &&
1313 oldOuts.bsearch(to[j]) < 0 )
1314 st->reenter->insert( to[j], atom );
1315 }
1316 }
1317 }
1318}
1319#endif
1320
1321#ifndef QT_NO_REGEXP_ANCHOR_ALT
1322/*
1323 Returns an anchor that means a OR b.
1324*/
1325int QRegExpEngine::anchorAlternation( int a, int b )
1326{
1327 if ( ((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0 )
1328 return a & b;
1329
1330 int n = aa.size();
1331 aa.resize( n + 1 );
1332 aa[n].a = a;
1333 aa[n].b = b;
1334 return Anchor_Alternation | n;
1335}
1336
1337/*
1338 Returns an anchor that means a AND b.
1339*/
1340int QRegExpEngine::anchorConcatenation( int a, int b )
1341{
1342 if ( ((a | b) & Anchor_Alternation) == 0 )
1343 return a | b;
1344 if ( (b & Anchor_Alternation) != 0 )
1345 qSwap( a, b );
1346 int aprime = anchorConcatenation( aa[a ^ Anchor_Alternation].a, b );
1347 int bprime = anchorConcatenation( aa[a ^ Anchor_Alternation].b, b );
1348 return anchorAlternation( aprime, bprime );
1349}
1350#endif
1351
1352/*
1353 Adds anchor a on a transition caracterised by its from state and its to state.
1354*/
1355void QRegExpEngine::addAnchors( int from, int to, int a )
1356{
1357 State *st = s[from];
1358 if ( st->anchors == 0 )
1359 st->anchors = new QMap<int, int>;
1360 if ( st->anchors->contains(to) )
1361 a = anchorAlternation( (*st->anchors)[to], a );
1362 st->anchors->insert( to, a );
1363}
1364
1365#ifndef QT_NO_REGEXP_OPTIM
1366/*
1367 The two following functions provide the engine with the information needed by
1368 its matching heuristics.
1369*/
1370
1371void QRegExpEngine::setupGoodStringHeuristic( int earlyStart, int lateStart,
1372 const QString& str )
1373{
1374 goodEarlyStart = earlyStart;
1375 goodLateStart = lateStart;
1376 goodStr = cs ? str : str.lower();
1377}
1378
1379void QRegExpEngine::setupBadCharHeuristic( int minLen,
1380 const QArray<int>& firstOcc )
1381{
1382 minl = minLen;
1383 occ1 = cs ? firstOcc : *firstOccurrenceAtZero;
1384}
1385
1386/*
1387 This function chooses between the good-string and the bad-character
1388 heuristics. It computes two scores and chooses the heuristic with the highest
1389 score.
1390
1391 Here are some common-sense constraints on the scores that should be respected
1392 if the formulas are ever modified: (1) If goodStr is empty, the good-string
1393 heuristic scores 0. (2) If the search is case insensitive, the good-string
1394 heuristic should be used, unless it scores 0. (Case insensitivity
1395 turns all entries of occ1 to 0.) (3) If (goodLateStart - goodEarlyStart) is
1396 big, the good-string heuristic should score less.
1397*/
1398void QRegExpEngine::heuristicallyChooseHeuristic()
1399{
1400 int i;
1401
1402 if ( minl == 0 )
1403 return;
1404
1405 /*
1406 Magic formula: The good string has to constitute a good proportion of the
1407 minimum-length string, and appear at a more-or-less known index.
1408 */
1409 int goodStringScore = ( 64 * goodStr.length() / minl ) -
1410 ( goodLateStart - goodEarlyStart );
1411
1412 /*
1413 Less magic formula: We pick a couple of characters at random, and check
1414 whether they are good or bad.
1415 */
1416 int badCharScore = 0;
1417 int step = QMAX( 1, NumBadChars / 32 );
1418 for ( i = 1; i < NumBadChars; i += step ) {
1419 if ( occ1[i] == NoOccurrence )
1420 badCharScore += minl;
1421 else
1422 badCharScore += occ1[i];
1423 }
1424 badCharScore /= minl;
1425
1426 useGoodStringHeuristic = ( goodStringScore > badCharScore );
1427}
1428#endif
1429
1430#if defined(QT_DEBUG)
1431void QRegExpEngine::dump() const
1432{
1433 int i, j;
1434 qDebug( "Case %ssensitive engine", cs ? "" : "in" );
1435 qDebug( " States" );
1436 for ( i = 0; i < ns; i++ ) {
1437 qDebug( " %d%s", i,
1438 i == InitialState ? " (initial)" :
1439 i == FinalState ? " (final)" : "" );
1440#ifndef QT_NO_REGEXP_CAPTURE
1441 qDebug( " in atom %d", s[i]->atom );
1442#endif
1443 int m = s[i]->match;
1444 if ( (m & CharClassBit) != 0 ) {
1445 qDebug( " match character class %d", m ^ CharClassBit );
1446#ifndef QT_NO_REGEXP_CCLASS
1447 cl[m ^ CharClassBit]->dump();
1448#else
1449 qDebug( " negative character class" );
1450#endif
1451 } else if ( (m & BackRefBit) != 0 ) {
1452 qDebug( " match back-reference %d", m ^ BackRefBit );
1453 } else if ( m >= 0x20 && m <= 0x7e ) {
1454 qDebug( " match 0x%.4x (%c)", m, m );
1455 } else {
1456 qDebug( " match 0x%.4x", m );
1457 }
1458 for ( j = 0; j < (int) s[i]->outs.size(); j++ ) {
1459 int next = s[i]->outs[j];
1460 qDebug( " -> %d", next );
1461 if ( s[i]->reenter != 0 && s[i]->reenter->contains(next) )
1462 qDebug( " [reenter %d]", (*s[i]->reenter)[next] );
1463 if ( s[i]->anchors != 0 && at(*s[i]->anchors, next) != 0 )
1464 qDebug( " [anchors 0x%.8x]", (*s[i]->anchors)[next] );
1465 }
1466 }
1467#ifndef QT_NO_REGEXP_CAPTURE
1468 if ( nf > 0 ) {
1469 qDebug( " Atom Parent Capture" );
1470 for ( i = 0; i < nf; i++ )
1471 qDebug( " %6d %6d %6d", i, f[i].parent, f[i].capture );
1472 }
1473#endif
1474#ifndef QT_NO_REGEXP_ANCHOR_ALT
1475 for ( i = 0; i < (int) aa.size(); i++ )
1476 qDebug( " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a,
1477 aa[i].b );
1478#endif
1479}
1480#endif
1481
1482void QRegExpEngine::setup( bool caseSensitive )
1483{
1484#ifndef QT_NO_REGEXP_OPTIM
1485 if ( engCount++ == 0 ) {
1486 noOccurrences = new QArray<int>( NumBadChars );
1487 firstOccurrenceAtZero = new QArray<int>( NumBadChars );
1488 noOccurrences->fill( NoOccurrence );
1489 firstOccurrenceAtZero->fill( 0 );
1490 }
1491#endif
1492 s.setAutoDelete( TRUE );
1493 s.resize( 32 );
1494 ns = 0;
1495#ifndef QT_NO_REGEXP_CAPTURE
1496 f.resize( 32 );
1497 nf = 0;
1498 cf = -1;
1499#endif
1500 realncap = 0;
1501 ncap = 0;
1502#ifndef QT_NO_REGEXP_CCLASS
1503 cl.setAutoDelete( TRUE );
1504#endif
1505#ifndef QT_NO_REGEXP_LOOKAHEAD
1506 ahead.setAutoDelete( TRUE );
1507#endif
1508#ifndef QT_NO_REGEXP_OPTIM
1509 caretAnchored = TRUE;
1510#endif
1511 valid = FALSE;
1512 cs = caseSensitive;
1513#ifndef QT_NO_REGEXP_BACKREF
1514 nbrefs = 0;
1515#endif
1516#ifndef QT_NO_REGEXP_OPTIM
1517 useGoodStringHeuristic = FALSE;
1518 minl = 0;
1519 occ1 = *firstOccurrenceAtZero;
1520#endif
1521 mmCapturedNoMatch.fill( -1, 2 );
1522}
1523
1524int QRegExpEngine::setupState( int match )
1525{
1526 if ( (ns & (ns + 1)) == 0 && ns + 1 >= (int) s.size() )
1527 s.resize( (ns + 1) << 1 );
1528#ifndef QT_NO_REGEXP_CAPTURE
1529 s.insert( ns, new State(cf, match) );
1530#else
1531 s.insert( ns, new State(match) );
1532#endif
1533 return ns++;
1534}
1535
1536#ifndef QT_NO_REGEXP_CAPTURE
1537/*
1538 Functions startAtom() and finishAtom() should be called to delimit atoms.
1539 When a state is created, it is assigned to the current atom. The information
1540 is later used for capturing.
1541*/
1542int QRegExpEngine::startAtom( bool capture )
1543{
1544 if ( (nf & (nf + 1)) == 0 && nf + 1 >= (int) f.size() )
1545 f.resize( (nf + 1) << 1 );
1546 f[nf].parent = cf;
1547 cf = nf++;
1548 f[cf].capture = capture ? ncap++ : -1;
1549 return cf;
1550}
1551#endif
1552
1553#ifndef QT_NO_REGEXP_LOOKAHEAD
1554/*
1555 Creates a lookahead anchor.
1556*/
1557int QRegExpEngine::addLookahead( QRegExpEngine *eng, bool negative )
1558{
1559 int n = ahead.size();
1560 if ( n == MaxLookaheads ) {
1561 yyError = TRUE;
1562 return 0;
1563 }
1564 ahead.resize( n + 1 );
1565 ahead.insert( n, new Lookahead(eng, negative) );
1566 return Anchor_FirstLookahead << n;
1567}
1568#endif
1569
1570#ifndef QT_NO_REGEXP_CAPTURE
1571/*
1572 We want the longest leftmost captures.
1573*/
1574bool QRegExpEngine::isBetterCapture( const int *begin1, const int *end1,
1575 const int *begin2, const int *end2 )
1576{
1577 for ( int i = 0; i < ncap; i++ ) {
1578 int delta = begin2[i] - begin1[i]; // it has to start early...
1579 if ( delta == 0 )
1580 delta = end1[i] - end2[i]; // ...and end late (like a party)
1581
1582 if ( delta != 0 )
1583 return delta > 0;
1584 }
1585 return FALSE;
1586}
1587#endif
1588
1589/*
1590 Returns TRUE if anchor a matches at position mmPos + i in the input string,
1591 otherwise FALSE.
1592*/
1593bool QRegExpEngine::testAnchor( int i, int a, const int *capBegin )
1594{
1595 int j;
1596
1597#ifndef QT_NO_REGEXP_ANCHOR_ALT
1598 if ( (a & Anchor_Alternation) != 0 ) {
1599 return testAnchor( i, aa[a ^ Anchor_Alternation].a, capBegin ) ||
1600 testAnchor( i, aa[a ^ Anchor_Alternation].b, capBegin );
1601 }
1602#endif
1603
1604 if ( (a & Anchor_Caret) != 0 ) {
1605 if ( mmPos + i != 0 )
1606 return FALSE;
1607 }
1608 if ( (a & Anchor_Dollar) != 0 ) {
1609 if ( mmPos + i != mmLen )
1610 return FALSE;
1611 }
1612#ifndef QT_NO_REGEXP_ESCAPE
1613 if ( (a & (Anchor_Word | Anchor_NonWord)) != 0 ) {
1614 bool before = FALSE, after = FALSE;
1615 if ( mmPos + i != 0 )
1616 before = mmIn[mmPos + i - 1].isLetterOrNumber();
1617 if ( mmPos + i != mmLen )
1618 after = mmIn[mmPos + i].isLetterOrNumber();
1619 if ( (a & Anchor_Word) != 0 && (before == after) )
1620 return FALSE;
1621 if ( (a & Anchor_NonWord) != 0 && (before != after) )
1622 return FALSE;
1623 }
1624#endif
1625#ifndef QT_NO_REGEXP_LOOKAHEAD
1626 bool catchx = TRUE;
1627
1628 if ( (a & Anchor_LookaheadMask) != 0 ) {
1629 QConstString cstr = QConstString( (QChar *) mmIn + mmPos + i,
1630 mmLen - mmPos - i );
1631 for ( j = 0; j < (int) ahead.size(); j++ ) {
1632 if ( (a & (Anchor_FirstLookahead << j)) != 0 ) {
1633 catchx = ( ahead[j]->eng->match(cstr.string(), 0, TRUE,
1634 TRUE)[0] == 0 );
1635 if ( catchx == ahead[j]->neg )
1636 return FALSE;
1637 }
1638 }
1639 }
1640#endif
1641#ifndef QT_NO_REGEXP_CAPTURE
1642#ifndef QT_NO_REGEXP_BACKREF
1643 for ( j = 0; j < nbrefs; j++ ) {
1644 if ( (a & (Anchor_BackRef1Empty << j)) != 0 ) {
1645 if ( capBegin[j] != EmptyCapture )
1646 return FALSE;
1647 }
1648 }
1649#endif
1650#endif
1651 return TRUE;
1652}
1653
1654#ifndef QT_NO_REGEXP_OPTIM
1655/*
1656 The three following functions are what Jeffrey Friedl would call transmissions
1657 (or bump-alongs). Using one or the other should make no difference, except
1658 in performance.
1659*/
1660
1661bool QRegExpEngine::goodStringMatch()
1662{
1663 int k = mmPos + goodEarlyStart;
1664
1665 while ( (k = mmStr->find(goodStr, k, cs)) != -1 ) {
1666 int from = k - goodLateStart;
1667 int to = k - goodEarlyStart;
1668 if ( from > mmPos )
1669 mmPos = from;
1670
1671 while ( mmPos <= to ) {
1672 if ( matchHere() )
1673 return TRUE;
1674 mmPos++;
1675 }
1676 k++;
1677 }
1678 return FALSE;
1679}
1680
1681bool QRegExpEngine::badCharMatch()
1682{
1683 int slideHead = 0;
1684 int slideNext = 0;
1685 int i;
1686 int lastPos = mmLen - minl;
1687 memset( mmSlideTab, 0, mmSlideTabSize * sizeof(int) );
1688
1689 /*
1690 Set up the slide table, used for the bad-character heuristic, using
1691 the table of first occurrence of each character.
1692 */
1693 for ( i = 0; i < minl; i++ ) {
1694 int sk = occ1[BadChar(mmIn[mmPos + i])];
1695 if ( sk == NoOccurrence )
1696 sk = i + 1;
1697 if ( sk > 0 ) {
1698 int k = i + 1 - sk;
1699 if ( k < 0 ) {
1700 sk = i + 1;
1701 k = 0;
1702 }
1703 if ( sk > mmSlideTab[k] )
1704 mmSlideTab[k] = sk;
1705 }
1706 }
1707
1708 if ( mmPos > lastPos )
1709 return FALSE;
1710
1711 while ( TRUE ) {
1712 if ( ++slideNext >= mmSlideTabSize )
1713 slideNext = 0;
1714 if ( mmSlideTab[slideHead] > 0 ) {
1715 if ( mmSlideTab[slideHead] - 1 > mmSlideTab[slideNext] )
1716 mmSlideTab[slideNext] = mmSlideTab[slideHead] - 1;
1717 mmSlideTab[slideHead] = 0;
1718 } else {
1719 if ( matchHere() )
1720 return TRUE;
1721 }
1722
1723 if ( mmPos == lastPos )
1724 break;
1725
1726 /*
1727 Update the slide table. This code has much in common with the
1728 initialization code.
1729 */
1730 int sk = occ1[BadChar(mmIn[mmPos + minl])];
1731 if ( sk == NoOccurrence ) {
1732 mmSlideTab[slideNext] = minl;
1733 } else if ( sk > 0 ) {
1734 int k = slideNext + minl - sk;
1735 if ( k >= mmSlideTabSize )
1736 k -= mmSlideTabSize;
1737 if ( sk > mmSlideTab[k] )
1738 mmSlideTab[k] = sk;
1739 }
1740 slideHead = slideNext;
1741 mmPos++;
1742 }
1743 return FALSE;
1744}
1745#else
1746bool QRegExpEngine::bruteMatch()
1747{
1748 while ( mmPos <= mmLen ) {
1749 if ( matchHere() )
1750 return TRUE;
1751 mmPos++;
1752 }
1753 return FALSE;
1754}
1755#endif
1756
1757/*
1758 Here's the core of the engine. It tries to do a match here and now.
1759*/
1760bool QRegExpEngine::matchHere()
1761{
1762 int ncur = 1, nnext = 0;
1763 int i = 0, j, k, m;
1764 bool match = FALSE;
1765
1766 mmMatchedLen = -1;
1767 mmCurStack[0] = InitialState;
1768
1769#ifndef QT_NO_REGEXP_CAPTURE
1770 if ( ncap > 0 ) {
1771 for ( j = 0; j < ncap; j++ ) {
1772 mmCurCapBegin[j] = EmptyCapture;
1773 mmCurCapEnd[j] = EmptyCapture;
1774 }
1775 }
1776#endif
1777
1778#ifndef QT_NO_REGEXP_BACKREF
1779 int *zzZ = 0;
1780
1781 while ( (ncur > 0 || mmSleeping.count() > 0) && i <= mmLen - mmPos &&
1782 !match )
1783#else
1784 while ( ncur > 0 && i <= mmLen - mmPos && !match )
1785#endif
1786 {
1787 int ch = ( i < mmLen - mmPos ) ? mmIn[mmPos + i].unicode() : 0;
1788 for ( j = 0; j < ncur; j++ ) {
1789 int cur = mmCurStack[j];
1790 State *scur = s[cur];
1791 QArray<int>& outs = scur->outs;
1792 for ( k = 0; k < (int) outs.size(); k++ ) {
1793 int next = outs[k];
1794 State *snext = s[next];
1795 bool in = TRUE;
1796#ifndef QT_NO_REGEXP_BACKREF
1797 int needSomeSleep = 0;
1798#endif
1799
1800 /*
1801 First, check if the anchors are anchored properly.
1802 */
1803 if ( scur->anchors != 0 ) {
1804 int a = at( *scur->anchors, next );
1805 if ( a != 0 && !testAnchor(i, a, mmCurCapBegin + j * ncap) )
1806 in = FALSE;
1807 }
1808 /*
1809 If indeed they are, check if the input character is correct
1810 for this transition.
1811 */
1812 if ( in ) {
1813 m = snext->match;
1814 if ( (m & (CharClassBit | BackRefBit)) == 0 ) {
1815 if ( cs )
1816 in = ( m == ch );
1817 else
1818 in = ( QChar(m).lower() == QChar(ch).lower() );
1819 } else if ( next == FinalState ) {
1820 mmMatchedLen = i;
1821 match = mmMinimal;
1822 in = TRUE;
1823 } else if ( (m & CharClassBit) != 0 ) {
1824#ifndef QT_NO_REGEXP_CCLASS
1825 const CharClass *cc = cl[m ^ CharClassBit];
1826 if ( cs )
1827 in = cc->in( ch );
1828 else if ( cc->negative() )
1829 in = cc->in( QChar(ch).lower() ) &&
1830 cc->in( QChar(ch).upper() );
1831 else
1832 in = cc->in( QChar(ch).lower() ) ||
1833 cc->in( QChar(ch).upper() );
1834#endif
1835#ifndef QT_NO_REGEXP_BACKREF
1836 } else { /* ( (m & BackRefBit) != 0 ) */
1837 int bref = m ^ BackRefBit;
1838 int ell = j * ncap + ( bref - 1 );
1839
1840 in = bref <= ncap && mmCurCapBegin[ell] != EmptyCapture;
1841 if ( in ) {
1842 if ( cs )
1843 in = ( mmIn[mmPos + mmCurCapBegin[ell]]
1844 == QChar(ch) );
1845 else
1846 in = ( mmIn[mmPos + mmCurCapBegin[ell]].lower()
1847 == QChar(ch).lower() );
1848 }
1849
1850 if ( in ) {
1851 int delta;
1852 if ( mmCurCapEnd[ell] == EmptyCapture )
1853 delta = i - mmCurCapBegin[ell];
1854 else
1855 delta = mmCurCapEnd[ell] - mmCurCapBegin[ell];
1856
1857 in = ( delta <= mmLen - mmPos );
1858 if ( in && delta > 1 ) {
1859 int n;
1860 if ( cs ) {
1861 for ( n = 1; n < delta; n++ ) {
1862 if ( mmIn[mmPos +
1863 mmCurCapBegin[ell] + n] !=
1864 mmIn[mmPos + i + n] )
1865 break;
1866 }
1867 } else {
1868 for ( n = 1; n < delta; n++ ) {
1869 QChar a = mmIn[mmPos +
1870 mmCurCapBegin[ell] + n];
1871 QChar b = mmIn[mmPos + i + n];
1872 if ( a.lower() != b.lower() )
1873 break;
1874 }
1875 }
1876 in = ( n == delta );
1877 if ( in )
1878 needSomeSleep = delta - 1;
1879 }
1880 }
1881#endif
1882 }
1883 }
1884
1885 /*
1886 All is right. We must now update our data structures.
1887 */
1888 if ( in ) {
1889#ifndef QT_NO_REGEXP_CAPTURE
1890 int *capBegin, *capEnd;
1891#endif
1892 /*
1893 If the next state was not encountered yet, all is fine.
1894 */
1895 if ( (m = mmInNextStack[next]) == -1 ) {
1896 m = nnext++;
1897 mmNextStack[m] = next;
1898 mmInNextStack[next] = m;
1899#ifndef QT_NO_REGEXP_CAPTURE
1900 capBegin = mmNextCapBegin + m * ncap;
1901 capEnd = mmNextCapEnd + m * ncap;
1902
1903 /*
1904 Otherwise, we'll first maintain captures in temporary
1905 arrays, and decide at the end whether it's best to keep
1906 the previous capture zones or the new ones.
1907 */
1908 } else {
1909 capBegin = mmTempCapBegin;
1910 capEnd = mmTempCapEnd;
1911#endif
1912 }
1913
1914#ifndef QT_NO_REGEXP_CAPTURE
1915 /*
1916 Updating the capture zones is much of a task.
1917 */
1918 if ( ncap > 0 ) {
1919 memcpy( capBegin, mmCurCapBegin + j * ncap,
1920 ncap * sizeof(int) );
1921 memcpy( capEnd, mmCurCapEnd + j * ncap,
1922 ncap * sizeof(int) );
1923 int c = scur->atom, n = snext->atom;
1924 int p = -1, q = -1;
1925 int cap;
1926
1927 /*
1928 Lemma 1. For any x in the range [0..nf), we have
1929 f[x].parent < x.
1930
1931 Proof. By looking at startAtom(), it is clear that
1932 cf < nf holds all the time, and thus that
1933 f[nf].parent < nf.
1934 */
1935
1936 /*
1937 If we are reentering an atom, we empty all capture
1938 zones inside it.
1939 */
1940 if ( scur->reenter != 0 &&
1941 (q = at(*scur->reenter, next)) != 0 ) {
1942 QBitArray b;
1943 b.fill( FALSE, nf );
1944 b.setBit( q, TRUE );
1945 for ( int ell = q + 1; ell < nf; ell++ ) {
1946 if ( b.testBit(f[ell].parent) ) {
1947 b.setBit( ell, TRUE );
1948 cap = f[ell].capture;
1949 if ( cap >= 0 ) {
1950 capBegin[cap] = EmptyCapture;
1951 capEnd[cap] = EmptyCapture;
1952 }
1953 }
1954 }
1955 p = f[q].parent;
1956
1957 /*
1958 Otherwise, close the capture zones we are leaving.
1959 We are leaving f[c].capture, f[f[c].parent].capture,
1960 f[f[f[c].parent].parent].capture, ..., until
1961 f[x].capture, with x such that f[x].parent is the
1962 youngest common ancestor for c and n.
1963
1964 We go up along c's and n's ancestry until we find x.
1965 */
1966 } else {
1967 p = c;
1968 q = n;
1969 while ( p != q ) {
1970 if ( p > q ) {
1971 cap = f[p].capture;
1972 if ( cap >= 0 ) {
1973 if ( capBegin[cap] == i ) {
1974 capBegin[cap] = EmptyCapture;
1975 capEnd[cap] = EmptyCapture;
1976 } else {
1977 capEnd[cap] = i;
1978 }
1979 }
1980 p = f[p].parent;
1981 } else {
1982 q = f[q].parent;
1983 }
1984 }
1985 }
1986
1987 /*
1988 In any case, we now open the capture zones we are
1989 entering. We work upwards from n until we reach p
1990 (the parent of the atom we reenter or the youngest
1991 common ancestor).
1992 */
1993 while ( n > p ) {
1994 cap = f[n].capture;
1995 if ( cap >= 0 ) {
1996 capBegin[cap] = i;
1997 capEnd[cap] = EmptyCapture;
1998 }
1999 n = f[n].parent;
2000 }
2001 /*
2002 If the next state was already in mmNextStack, we must
2003 choose carefully which capture zones we want to keep.
2004 */
2005 if ( capBegin == mmTempCapBegin &&
2006 isBetterCapture(capBegin, capEnd,
2007 mmNextCapBegin + m * ncap,
2008 mmNextCapEnd + m * ncap) ) {
2009 memcpy( mmNextCapBegin + m * ncap, capBegin,
2010 ncap * sizeof(int) );
2011 memcpy( mmNextCapEnd + m * ncap, capEnd,
2012 ncap * sizeof(int) );
2013 }
2014 }
2015#ifndef QT_NO_REGEXP_BACKREF
2016 /*
2017 We are done with updating the capture zones. It's now
2018 time to put the next state to sleep, if it needs to, and
2019 to remove it from mmNextStack.
2020 */
2021 if ( needSomeSleep > 0 ) {
2022 zzZ = new int[1 + 2 * ncap];
2023 zzZ[0] = next;
2024 if ( ncap > 0 ) {
2025 memcpy( zzZ + 1, capBegin, ncap * sizeof(int) );
2026 memcpy( zzZ + 1 + ncap, capEnd,
2027 ncap * sizeof(int) );
2028 }
2029 mmInNextStack[mmNextStack[--nnext]] = -1;
2030 mmSleeping.insert( i + needSomeSleep, zzZ );
2031 }
2032#endif
2033#endif
2034 }
2035 }
2036 }
2037#ifndef QT_NO_REGEXP_CAPTURE
2038 /*
2039 If we reached the final state, hurray! Copy the captured zone.
2040 */
2041 if ( ncap > 0 && (m = mmInNextStack[FinalState]) != -1 ) {
2042 memcpy( mmCapBegin, mmNextCapBegin + m * ncap, ncap * sizeof(int) );
2043 memcpy( mmCapEnd, mmNextCapEnd + m * ncap, ncap * sizeof(int) );
2044 }
2045#ifndef QT_NO_REGEXP_BACKREF
2046 /*
2047 It's time to wake up the sleepers.
2048 */
2049 if ( mmSleeping.count() > 0 ) {
2050 while ( (zzZ = mmSleeping.take(i)) != 0 ) {
2051 int next = zzZ[0];
2052 int *capBegin = zzZ + 1;
2053 int *capEnd = zzZ + 1 + ncap;
2054 bool copyOver = TRUE;
2055
2056 if ( (m = mmInNextStack[zzZ[0]]) == -1 ) {
2057 m = nnext++;
2058 mmNextStack[m] = next;
2059 mmInNextStack[next] = m;
2060 } else {
2061 copyOver = isBetterCapture( mmNextCapBegin + m * ncap,
2062 mmNextCapEnd + m * ncap,
2063 capBegin, capEnd );
2064 }
2065 if ( copyOver ) {
2066 memcpy( mmNextCapBegin + m * ncap, capBegin,
2067 ncap * sizeof(int) );
2068 memcpy( mmNextCapEnd + m * ncap, capEnd,
2069 ncap * sizeof(int) );
2070 }
2071 delete[] zzZ;
2072 }
2073 }
2074#endif
2075#endif
2076 for ( j = 0; j < nnext; j++ )
2077 mmInNextStack[mmNextStack[j]] = -1;
2078
2079 qSwap( mmCurStack, mmNextStack );
2080#ifndef QT_NO_REGEXP_CAPTURE
2081 qSwap( mmCurCapBegin, mmNextCapBegin );
2082 qSwap( mmCurCapEnd, mmNextCapEnd );
2083#endif
2084 ncur = nnext;
2085 nnext = 0;
2086 i++;
2087 }
2088
2089#ifndef QT_NO_REGEXP_BACKREF
2090 /*
2091 If minimal matching is enabled, we might have some sleepers left.
2092 */
2093 while ( !mmSleeping.isEmpty() ) {
2094 zzZ = mmSleeping.take( *QIntDictIterator<int>(mmSleeping) );
2095 delete[] zzZ;
2096 }
2097#endif
2098
2099 match = ( mmMatchedLen >= 0 );
2100 if ( !match )
2101 mmMatchedLen = i - 1;
2102 return match;
2103}
2104
2105#ifndef QT_NO_REGEXP_CCLASS
2106
2107QRegExpEngine::CharClass::CharClass()
2108 : c( 0 ), n( FALSE )
2109#ifndef QT_NO_REGEXP_OPTIM
2110 , occ1( *noOccurrences )
2111#endif
2112{
2113}
2114
2115QRegExpEngine::CharClass& QRegExpEngine::CharClass::operator=(
2116 const CharClass& cc )
2117{
2118 c = cc.c;
2119 r = cc.r.copy();
2120 n = cc.n;
2121#ifndef QT_NO_REGEXP_OPTIM
2122 occ1 = cc.occ1;
2123#endif
2124 return *this;
2125}
2126
2127void QRegExpEngine::CharClass::clear()
2128{
2129 c = 0;
2130 r.resize( 0 );
2131 n = FALSE;
2132}
2133
2134void QRegExpEngine::CharClass::setNegative( bool negative )
2135{
2136 n = negative;
2137#ifndef QT_NO_REGEXP_OPTIM
2138 occ1 = *firstOccurrenceAtZero;
2139#endif
2140}
2141
2142void QRegExpEngine::CharClass::addCategories( int cats )
2143{
2144 c |= cats;
2145#ifndef QT_NO_REGEXP_OPTIM
2146 occ1 = *firstOccurrenceAtZero;
2147#endif
2148}
2149
2150void QRegExpEngine::CharClass::addRange( ushort from, ushort to )
2151{
2152 if ( from > to )
2153 qSwap( from, to );
2154 int n = r.size();
2155 r.resize( n + 1 );
2156 r[n].from = from;
2157 r[n].to = to;
2158
2159#ifndef QT_NO_REGEXP_OPTIM
2160 int i;
2161
2162 if ( to - from < NumBadChars ) {
2163 occ1.detach();
2164 if ( from % NumBadChars <= to % NumBadChars ) {
2165 for ( i = from % NumBadChars; i <= to % NumBadChars; i++ )
2166 occ1[i] = 0;
2167 } else {
2168 for ( i = 0; i <= to % NumBadChars; i++ )
2169 occ1[i] = 0;
2170 for ( i = from % NumBadChars; i < NumBadChars; i++ )
2171 occ1[i] = 0;
2172 }
2173 } else {
2174 occ1 = *firstOccurrenceAtZero;
2175 }
2176#endif
2177}
2178
2179bool QRegExpEngine::CharClass::in( QChar ch ) const
2180{
2181#ifndef QT_NO_REGEXP_OPTIM
2182 if ( occ1[BadChar(ch)] == NoOccurrence )
2183 return n;
2184#endif
2185
2186 if ( c != 0 && (c & (1 << (int) ch.category())) != 0 )
2187 return !n;
2188 for ( int i = 0; i < (int) r.size(); i++ ) {
2189 if ( ch.unicode() >= r[i].from && ch.unicode() <= r[i].to )
2190 return !n;
2191 }
2192 return n;
2193}
2194
2195#if defined(QT_DEBUG)
2196void QRegExpEngine::CharClass::dump() const
2197{
2198 int i;
2199 qDebug( " %stive character class", n ? "nega" : "posi" );
2200#ifndef QT_NO_REGEXP_CCLASS
2201 if ( c != 0 )
2202 qDebug( " categories 0x%.8x", c );
2203#endif
2204 for ( i = 0; i < (int) r.size(); i++ )
2205 qDebug( " 0x%.4x through 0x%.4x", r[i].from, r[i].to );
2206}
2207#endif
2208#endif
2209
2210QRegExpEngine::Box::Box( QRegExpEngine *engine )
2211 : eng( engine ), skipanchors( 0 )
2212#ifndef QT_NO_REGEXP_OPTIM
2213 , earlyStart( 0 ), lateStart( 0 ), maxl( 0 ), occ1( *noOccurrences )
2214#endif
2215{
2216 minl = 0;
2217}
2218
2219QRegExpEngine::Box& QRegExpEngine::Box::operator=( const Box& b )
2220{
2221 eng = b.eng;
2222 ls = b.ls;
2223 rs = b.rs;
2224 lanchors = b.lanchors;
2225 ranchors = b.ranchors;
2226 skipanchors = b.skipanchors;
2227#ifndef QT_NO_REGEXP_OPTIM
2228 earlyStart = b.earlyStart;
2229 lateStart = b.lateStart;
2230 str = b.str;
2231 leftStr = b.leftStr;
2232 rightStr = b.rightStr;
2233 maxl = b.maxl;
2234 occ1 = b.occ1;
2235#endif
2236 minl = b.minl;
2237 return *this;
2238}
2239
2240void QRegExpEngine::Box::set( QChar ch )
2241{
2242 ls.resize( 1 );
2243 ls[0] = eng->createState( ch );
2244 rs = ls;
2245 rs.detach();
2246#ifndef QT_NO_REGEXP_OPTIM
2247 str = ch;
2248 leftStr = ch;
2249 rightStr = ch;
2250 maxl = 1;
2251 occ1.detach();
2252 occ1[BadChar(ch)] = 0;
2253#endif
2254 minl = 1;
2255}
2256
2257void QRegExpEngine::Box::set( const CharClass& cc )
2258{
2259 ls.resize( 1 );
2260 ls[0] = eng->createState( cc );
2261 rs = ls;
2262 rs.detach();
2263#ifndef QT_NO_REGEXP_OPTIM
2264 maxl = 1;
2265 occ1 = cc.firstOccurrence();
2266#endif
2267 minl = 1;
2268}
2269
2270#ifndef QT_NO_REGEXP_BACKREF
2271void QRegExpEngine::Box::set( int bref )
2272{
2273 ls.resize( 1 );
2274 ls[0] = eng->createState( bref );
2275 rs = ls;
2276 rs.detach();
2277 if ( bref >= 1 && bref <= MaxBackRefs )
2278 skipanchors = Anchor_BackRef0Empty << bref;
2279#ifndef QT_NO_REGEXP_OPTIM
2280 maxl = InftyLen;
2281#endif
2282 minl = 0;
2283}
2284#endif
2285
2286void QRegExpEngine::Box::cat( const Box& b )
2287{
2288 eng->addCatTransitions( rs, b.ls );
2289 addAnchorsToEngine( b );
2290 if ( minl == 0 ) {
2291 mergeInto( &lanchors, b.lanchors );
2292 if ( skipanchors != 0 ) {
2293 for ( int i = 0; i < (int) b.ls.size(); i++ ) {
2294 int a = eng->anchorConcatenation( at(lanchors, b.ls[i]),
2295 skipanchors );
2296 lanchors.insert( b.ls[i], a );
2297 }
2298 }
2299 mergeInto( &ls, b.ls );
2300 }
2301 if ( b.minl == 0 ) {
2302 mergeInto( &ranchors, b.ranchors );
2303 if ( b.skipanchors != 0 ) {
2304 for ( int i = 0; i < (int) rs.size(); i++ ) {
2305 int a = eng->anchorConcatenation( at(ranchors, rs[i]),
2306 b.skipanchors );
2307 ranchors.insert( rs[i], a );
2308 }
2309 }
2310 mergeInto( &rs, b.rs );
2311 } else {
2312 ranchors = b.ranchors;
2313 rs = b.rs;
2314 }
2315
2316#ifndef QT_NO_REGEXP_OPTIM
2317 if ( maxl != InftyLen ) {
2318 if ( rightStr.length() + b.leftStr.length() >
2319 QMAX(str.length(), b.str.length()) ) {
2320 earlyStart = minl - rightStr.length();
2321 lateStart = maxl - rightStr.length();
2322 str = rightStr + b.leftStr;
2323 } else if ( b.str.length() > str.length() ) {
2324 earlyStart = minl + b.earlyStart;
2325 lateStart = maxl + b.lateStart;
2326 str = b.str;
2327 }
2328 }
2329
2330 if ( (int) leftStr.length() == maxl )
2331 leftStr += b.leftStr;
2332 if ( (int) b.rightStr.length() == b.maxl )
2333 rightStr += b.rightStr;
2334 else
2335 rightStr = b.rightStr;
2336
2337 if ( maxl == InftyLen || b.maxl == InftyLen )
2338 maxl = InftyLen;
2339 else
2340 maxl += b.maxl;
2341
2342 occ1.detach();
2343 for ( int i = 0; i < NumBadChars; i++ ) {
2344 if ( b.occ1[i] != NoOccurrence && minl + b.occ1[i] < occ1[i] )
2345 occ1[i] = minl + b.occ1[i];
2346 }
2347#endif
2348
2349 minl += b.minl;
2350 if ( minl == 0 )
2351 skipanchors = eng->anchorConcatenation( skipanchors, b.skipanchors );
2352 else
2353 skipanchors = 0;
2354}
2355
2356void QRegExpEngine::Box::orx( const Box& b )
2357{
2358 mergeInto( &ls, b.ls );
2359 mergeInto( &lanchors, b.lanchors );
2360 mergeInto( &rs, b.rs );
2361 mergeInto( &ranchors, b.ranchors );
2362 skipanchors = eng->anchorAlternation( skipanchors, b.skipanchors );
2363
2364#ifndef QT_NO_REGEXP_OPTIM
2365 occ1.detach();
2366 for ( int i = 0; i < NumBadChars; i++ ) {
2367 if ( occ1[i] > b.occ1[i] )
2368 occ1[i] = b.occ1[i];
2369 }
2370 earlyStart = 0;
2371 lateStart = 0;
2372 str = QString::null;
2373 leftStr = QString::null;
2374 rightStr = QString::null;
2375 if ( b.maxl > maxl )
2376 maxl = b.maxl;
2377#endif
2378 if ( b.minl < minl )
2379 minl = b.minl;
2380}
2381
2382void QRegExpEngine::Box::plus( int atom )
2383{
2384#ifndef QT_NO_REGEXP_CAPTURE
2385 eng->addPlusTransitions( rs, ls, atom );
2386#else
2387 Q_UNUSED( atom );
2388 eng->addCatTransitions( rs, ls );
2389#endif
2390 addAnchorsToEngine( *this );
2391#ifndef QT_NO_REGEXP_OPTIM
2392 maxl = InftyLen;
2393#endif
2394}
2395
2396void QRegExpEngine::Box::opt()
2397{
2398#ifndef QT_NO_REGEXP_OPTIM
2399 earlyStart = 0;
2400 lateStart = 0;
2401 str = QString::null;
2402 leftStr = QString::null;
2403 rightStr = QString::null;
2404#endif
2405 skipanchors = 0;
2406 minl = 0;
2407}
2408
2409void QRegExpEngine::Box::catAnchor( int a )
2410{
2411 if ( a != 0 ) {
2412 for ( int i = 0; i < (int) rs.size(); i++ ) {
2413 a = eng->anchorConcatenation( at(ranchors, rs[i]), a );
2414 ranchors.insert( rs[i], a );
2415 }
2416 if ( minl == 0 )
2417 skipanchors = eng->anchorConcatenation( skipanchors, a );
2418 }
2419}
2420
2421#ifndef QT_NO_REGEXP_OPTIM
2422void QRegExpEngine::Box::setupHeuristics()
2423{
2424 eng->setupGoodStringHeuristic( earlyStart, lateStart, str );
2425
2426 /*
2427 A regular expression such as 112|1 has occ1['2'] = 2 and minl = 1 at this
2428 point. An entry of occ1 has to be at most minl or infinity for the rest
2429 of the algorithm to go well.
2430
2431 We waited until here before normalizing these cases (instead of doing it
2432 in Box::orx()) because sometimes things improve by themselves; consider
2433 (112|1)34.
2434 */
2435 for ( int i = 0; i < NumBadChars; i++ ) {
2436 if ( occ1[i] != NoOccurrence && occ1[i] >= minl )
2437 occ1[i] = minl;
2438 }
2439 eng->setupBadCharHeuristic( minl, occ1 );
2440
2441 eng->heuristicallyChooseHeuristic();
2442}
2443#endif
2444
2445#if defined(QT_DEBUG)
2446void QRegExpEngine::Box::dump() const
2447{
2448 int i;
2449 qDebug( "Box of at least %d character%s", minl, minl == 1 ? "" : "s" );
2450 qDebug( " Left states:" );
2451 for ( i = 0; i < (int) ls.size(); i++ ) {
2452 if ( at(lanchors, ls[i]) == 0 )
2453 qDebug( " %d", ls[i] );
2454 else
2455 qDebug( " %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]] );
2456 }
2457 qDebug( " Right states:" );
2458 for ( i = 0; i < (int) rs.size(); i++ ) {
2459 if ( at(ranchors, ls[i]) == 0 )
2460 qDebug( " %d", rs[i] );
2461 else
2462 qDebug( " %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]] );
2463 }
2464 qDebug( " Skip anchors: 0x%.8x", skipanchors );
2465}
2466#endif
2467
2468void QRegExpEngine::Box::addAnchorsToEngine( const Box& to ) const
2469{
2470 for ( int i = 0; i < (int) to.ls.size(); i++ ) {
2471 for ( int j = 0; j < (int) rs.size(); j++ ) {
2472 int a = eng->anchorConcatenation( at(ranchors, rs[j]),
2473 at(to.lanchors, to.ls[i]) );
2474 eng->addAnchors( rs[j], to.ls[i], a );
2475 }
2476 }
2477}
2478
2479int QRegExpEngine::getChar()
2480{
2481 return ( yyPos == yyLen ) ? EOS : yyIn[yyPos++].unicode();
2482}
2483
2484int QRegExpEngine::getEscape()
2485{
2486#ifndef QT_NO_REGEXP_ESCAPE
2487 const char tab[] = "afnrtv"; // no b, as \b means word boundary
2488 const char backTab[] = "\a\f\n\r\t\v";
2489 ushort low;
2490 int i;
2491#endif
2492 ushort val;
2493 int prevCh = yyCh;
2494
2495 if ( prevCh == EOS ) {
2496 yyError = TRUE;
2497 return Tok_Char | '\\';
2498 }
2499 yyCh = getChar();
2500#ifndef QT_NO_REGEXP_ESCAPE
2501 if ( (prevCh & ~0xff) == 0 ) {
2502 const char *p = strchr( tab, prevCh );
2503 if ( p != 0 )
2504 return Tok_Char | backTab[p - tab];
2505 }
2506#endif
2507
2508 switch ( prevCh ) {
2509#ifndef QT_NO_REGEXP_ESCAPE
2510 case '0':
2511 val = 0;
2512 for ( i = 0; i < 3; i++ ) {
2513 if ( yyCh >= '0' && yyCh <= '7' )
2514 val = ( val << 3 ) | ( yyCh - '0' );
2515 else
2516 break;
2517 yyCh = getChar();
2518 }
2519 if ( (val & ~0377) != 0 )
2520 yyError = TRUE;
2521 return Tok_Char | val;
2522#endif
2523#ifndef QT_NO_REGEXP_ESCAPE
2524 case 'B':
2525 return Tok_NonWord;
2526#endif
2527#ifndef QT_NO_REGEXP_CCLASS
2528 case 'D':
2529 // see QChar::isDigit()
2530 yyCharClass->addCategories( 0x7fffffef );
2531 return Tok_CharClass;
2532 case 'S':
2533 // see QChar::isSpace()
2534 yyCharClass->addCategories( 0x7ffff87f );
2535 yyCharClass->addRange( 0x0000, 0x0008 );
2536 yyCharClass->addRange( 0x000e, 0x001f );
2537 yyCharClass->addRange( 0x007f, 0x009f );
2538 return Tok_CharClass;
2539 case 'W':
2540 // see QChar::isLetterOrNumber()
2541 yyCharClass->addCategories( 0x7ff07f8f );
2542 return Tok_CharClass;
2543#endif
2544#ifndef QT_NO_REGEXP_ESCAPE
2545 case 'b':
2546 return Tok_Word;
2547#endif
2548#ifndef QT_NO_REGEXP_CCLASS
2549 case 'd':
2550 // see QChar::isDigit()
2551 yyCharClass->addCategories( 0x00000010 );
2552 return Tok_CharClass;
2553 case 's':
2554 // see QChar::isSpace()
2555 yyCharClass->addCategories( 0x00000380 );
2556 yyCharClass->addRange( 0x0009, 0x000d );
2557 return Tok_CharClass;
2558 case 'w':
2559 // see QChar::isLetterOrNumber()
2560 yyCharClass->addCategories( 0x000f8070 );
2561 return Tok_CharClass;
2562#endif
2563#ifndef QT_NO_REGEXP_ESCAPE
2564 case 'x':
2565 val = 0;
2566 for ( i = 0; i < 4; i++ ) {
2567 low = QChar( yyCh ).lower();
2568 if ( low >= '0' && low <= '9' )
2569 val = ( val << 4 ) | ( low - '0' );
2570 else if ( low >= 'a' && low <= 'f' )
2571 val = ( val << 4 ) | ( low - 'a' + 10 );
2572 else
2573 break;
2574 yyCh = getChar();
2575 }
2576 return Tok_Char | val;
2577#endif
2578 default:
2579 if ( prevCh >= '1' && prevCh <= '9' ) {
2580#ifndef QT_NO_REGEXP_BACKREF
2581 val = prevCh - '0';
2582 while ( yyCh >= '0' && yyCh <= '9' ) {
2583 val = ( val *= 10 ) | ( yyCh - '0' );
2584 yyCh = getChar();
2585 }
2586 return Tok_BackRef | val;
2587#else
2588 yyError = TRUE;
2589#endif
2590 }
2591 return Tok_Char | prevCh;
2592 }
2593}
2594
2595#ifndef QT_NO_REGEXP_INTERVAL
2596int QRegExpEngine::getRep( int def )
2597{
2598 if ( yyCh >= '0' && yyCh <= '9' ) {
2599 int rep = 0;
2600 do {
2601 rep = 10 * rep + yyCh - '0';
2602 if ( rep >= InftyRep ) {
2603 yyError = TRUE;
2604 rep = def;
2605 }
2606 yyCh = getChar();
2607 } while ( yyCh >= '0' && yyCh <= '9' );
2608 return rep;
2609 } else {
2610 return def;
2611 }
2612}
2613#endif
2614
2615#ifndef QT_NO_REGEXP_LOOKAHEAD
2616void QRegExpEngine::skipChars( int n )
2617{
2618 if ( n > 0 ) {
2619 yyPos += n - 1;
2620 yyCh = getChar();
2621 }
2622}
2623#endif
2624
2625void QRegExpEngine::startTokenizer( const QChar *rx, int len )
2626{
2627 yyIn = rx;
2628 yyPos0 = 0;
2629 yyPos = 0;
2630 yyLen = len;
2631 yyCh = getChar();
2632 yyCharClass = new CharClass;
2633 yyMinRep = 0;
2634 yyMaxRep = 0;
2635 yyError = FALSE;
2636}
2637
2638int QRegExpEngine::getToken()
2639{
2640#ifndef QT_NO_REGEXP_CCLASS
2641 ushort pendingCh = 0;
2642 bool charPending;
2643 bool rangePending;
2644 int tok;
2645#endif
2646 int prevCh = yyCh;
2647
2648 yyPos0 = yyPos - 1;
2649#ifndef QT_NO_REGEXP_CCLASS
2650 yyCharClass->clear();
2651#endif
2652 yyMinRep = 0;
2653 yyMaxRep = 0;
2654 yyCh = getChar();
2655 switch ( prevCh ) {
2656 case EOS:
2657 yyPos0 = yyPos;
2658 return Tok_Eos;
2659 case '$':
2660 return Tok_Dollar;
2661 case '(':
2662 if ( yyCh == '?' ) {
2663 prevCh = getChar();
2664 yyCh = getChar();
2665 switch ( prevCh ) {
2666#ifndef QT_NO_REGEXP_LOOKAHEAD
2667 case '!':
2668 return Tok_NegLookahead;
2669 case '=':
2670 return Tok_PosLookahead;
2671#endif
2672 case ':':
2673 return Tok_MagicLeftParen;
2674 default:
2675 yyError = TRUE;
2676 return Tok_MagicLeftParen;
2677 }
2678 } else {
2679 return Tok_LeftParen;
2680 }
2681 case ')':
2682 return Tok_RightParen;
2683 case '*':
2684 yyMinRep = 0;
2685 yyMaxRep = InftyRep;
2686 return Tok_Quantifier;
2687 case '+':
2688 yyMinRep = 1;
2689 yyMaxRep = InftyRep;
2690 return Tok_Quantifier;
2691 case '.':
2692#ifndef QT_NO_REGEXP_CCLASS
2693 yyCharClass->setNegative( TRUE );
2694#endif
2695 return Tok_CharClass;
2696 case '?':
2697 yyMinRep = 0;
2698 yyMaxRep = 1;
2699 return Tok_Quantifier;
2700 case '[':
2701#ifndef QT_NO_REGEXP_CCLASS
2702 if ( yyCh == '^' ) {
2703 yyCharClass->setNegative( TRUE );
2704 yyCh = getChar();
2705 }
2706 charPending = FALSE;
2707 rangePending = FALSE;
2708 do {
2709 if ( yyCh == '-' && charPending && !rangePending ) {
2710 rangePending = TRUE;
2711 yyCh = getChar();
2712 } else {
2713 if ( charPending && !rangePending ) {
2714 yyCharClass->addSingleton( pendingCh );
2715 charPending = FALSE;
2716 }
2717 if ( yyCh == '\\' ) {
2718 yyCh = getChar();
2719 tok = getEscape();
2720 if ( tok == Tok_Word )
2721 tok = '\b';
2722 } else {
2723 tok = Tok_Char | yyCh;
2724 yyCh = getChar();
2725 }
2726 if ( tok == Tok_CharClass ) {
2727 if ( rangePending ) {
2728 yyCharClass->addSingleton( '-' );
2729 yyCharClass->addSingleton( pendingCh );
2730 charPending = FALSE;
2731 rangePending = FALSE;
2732 }
2733 } else if ( (tok & Tok_Char) != 0 ) {
2734 if ( rangePending ) {
2735 yyCharClass->addRange( pendingCh, tok ^ Tok_Char );
2736 charPending = FALSE;
2737 rangePending = FALSE;
2738 } else {
2739 pendingCh = tok ^ Tok_Char;
2740 charPending = TRUE;
2741 }
2742 } else {
2743 yyError = TRUE;
2744 }
2745 }
2746 } while ( yyCh != ']' && yyCh != EOS );
2747 if ( rangePending )
2748 yyCharClass->addSingleton( '-' );
2749 if ( charPending )
2750 yyCharClass->addSingleton( pendingCh );
2751 if ( yyCh == EOS )
2752 yyError = TRUE;
2753 else
2754 yyCh = getChar();
2755 return Tok_CharClass;
2756#else
2757 yyError = TRUE;
2758 return Tok_Char | '[';
2759#endif
2760 case '\\':
2761 return getEscape();
2762 case ']':
2763 yyError = TRUE;
2764 return Tok_Char | ']';
2765 case '^':
2766 return Tok_Caret;
2767#ifndef QT_NO_REGEXP_INTERVAL
2768 case '{':
2769 yyMinRep = getRep( 0 );
2770 yyMaxRep = yyMinRep;
2771 if ( yyCh == ',' ) {
2772 yyCh = getChar();
2773 yyMaxRep = getRep( InftyRep );
2774 }
2775 if ( yyMaxRep < yyMinRep )
2776 qSwap( yyMinRep, yyMaxRep );
2777 if ( yyCh != '}' )
2778 yyError = TRUE;
2779 yyCh = getChar();
2780 return Tok_Quantifier;
2781#else
2782 yyError = TRUE;
2783 return Tok_Char | '{';
2784#endif
2785 case '|':
2786 return Tok_Bar;
2787 case '}':
2788 yyError = TRUE;
2789 return Tok_Char | '}';
2790 default:
2791 return Tok_Char | prevCh;
2792 }
2793}
2794
2795int QRegExpEngine::parse( const QChar *pattern, int len )
2796{
2797 valid = TRUE;
2798 startTokenizer( pattern, len );
2799 yyTok = getToken();
2800#ifndef QT_NO_REGEXP_CAPTURE
2801 yyMayCapture = TRUE;
2802#else
2803 yyMayCapture = FALSE;
2804#endif
2805
2806#ifndef QT_NO_REGEXP_CAPTURE
2807 int atom = startAtom( FALSE );
2808#endif
2809 CharClass anything;
2810 Box box( this ); // create InitialState
2811 box.set( anything );
2812 Box rightBox( this ); // create FinalState
2813 rightBox.set( anything );
2814
2815 Box middleBox( this );
2816 parseExpression( &middleBox );
2817#ifndef QT_NO_REGEXP_CAPTURE
2818 finishAtom( atom );
2819#endif
2820#ifndef QT_NO_REGEXP_OPTIM
2821 middleBox.setupHeuristics();
2822#endif
2823 box.cat( middleBox );
2824 box.cat( rightBox );
2825 delete yyCharClass;
2826 yyCharClass = 0;
2827
2828 realncap = ncap;
2829#ifndef QT_NO_REGEXP_BACKREF
2830 if ( nbrefs > ncap )
2831 ncap = nbrefs;
2832#endif
2833
2834 mmCaptured.resize( 2 + 2 * realncap );
2835 mmCapturedNoMatch.fill( -1, 2 + 2 * realncap );
2836
2837 /*
2838 We use one QArray<int> for all the big data used a lot in matchHere() and
2839 friends.
2840 */
2841#ifndef QT_NO_REGEXP_OPTIM
2842 mmSlideTabSize = QMAX( minl + 1, 16 );
2843#else
2844 mmSlideTabSize = 0;
2845#endif
2846 mmBigArray.resize( (3 + 4 * ncap) * ns + 4 * ncap + mmSlideTabSize );
2847
2848 mmInNextStack = mmBigArray.data();
2849 memset( mmInNextStack, -1, ns * sizeof(int) );
2850 mmCurStack = mmInNextStack + ns;
2851 mmNextStack = mmInNextStack + 2 * ns;
2852
2853 mmCurCapBegin = mmInNextStack + 3 * ns;
2854 mmNextCapBegin = mmCurCapBegin + ncap * ns;
2855 mmCurCapEnd = mmCurCapBegin + 2 * ncap * ns;
2856 mmNextCapEnd = mmCurCapBegin + 3 * ncap * ns;
2857
2858 mmTempCapBegin = mmCurCapBegin + 4 * ncap * ns;
2859 mmTempCapEnd = mmTempCapBegin + ncap;
2860 mmCapBegin = mmTempCapBegin + 2 * ncap;
2861 mmCapEnd = mmTempCapBegin + 3 * ncap;
2862
2863 mmSlideTab = mmTempCapBegin + 4 * ncap;
2864
2865 if ( yyError )
2866 return -1;
2867
2868#ifndef QT_NO_REGEXP_OPTIM
2869 State *sinit = s[InitialState];
2870 caretAnchored = ( sinit->anchors != 0 );
2871 if ( caretAnchored ) {
2872 QMap<int, int>& anchors = *sinit->anchors;
2873 QMap<int, int>::ConstIterator a;
2874 for ( a = anchors.begin(); a != anchors.end(); ++a ) {
2875#ifndef QT_NO_REGEXP_ANCHOR_ALT
2876 if ( (*a & Anchor_Alternation) != 0 )
2877 break;
2878#endif
2879 if ( (*a & Anchor_Caret) == 0 ) {
2880 caretAnchored = FALSE;
2881 break;
2882 }
2883 }
2884 }
2885#endif
2886 return yyPos0;
2887}
2888
2889void QRegExpEngine::parseAtom( Box *box )
2890{
2891#ifndef QT_NO_REGEXP_LOOKAHEAD
2892 QRegExpEngine *eng = 0;
2893 bool neg;
2894 int len;
2895#endif
2896
2897 switch ( yyTok ) {
2898 case Tok_Dollar:
2899 box->catAnchor( Anchor_Dollar );
2900 break;
2901 case Tok_Caret:
2902 box->catAnchor( Anchor_Caret );
2903 break;
2904#ifndef QT_NO_REGEXP_LOOKAHEAD
2905 case Tok_PosLookahead:
2906 case Tok_NegLookahead:
2907 neg = ( yyTok == Tok_NegLookahead );
2908 eng = new QRegExpEngine( cs );
2909 len = eng->parse( yyIn + yyPos - 1, yyLen - yyPos + 1 );
2910 if ( len >= 0 )
2911 skipChars( len );
2912 else
2913 yyError = TRUE;
2914 box->catAnchor( addLookahead(eng, neg) );
2915 yyTok = getToken();
2916 if ( yyTok != Tok_RightParen )
2917 yyError = TRUE;
2918 break;
2919#endif
2920#ifndef QT_NO_REGEXP_ESCAPE
2921 case Tok_Word:
2922 box->catAnchor( Anchor_Word );
2923 break;
2924 case Tok_NonWord:
2925 box->catAnchor( Anchor_NonWord );
2926 break;
2927#endif
2928 case Tok_LeftParen:
2929 case Tok_MagicLeftParen:
2930 yyTok = getToken();
2931 parseExpression( box );
2932 if ( yyTok != Tok_RightParen )
2933 yyError = TRUE;
2934 break;
2935 case Tok_CharClass:
2936 box->set( *yyCharClass );
2937 break;
2938 default:
2939 if ( (yyTok & Tok_Char) != 0 )
2940 box->set( QChar(yyTok ^ Tok_Char) );
2941#ifndef QT_NO_REGEXP_BACKREF
2942 else if ( (yyTok & Tok_BackRef) != 0 )
2943 box->set( yyTok ^ Tok_BackRef );
2944#endif
2945 else
2946 yyError = TRUE;
2947 }
2948 yyTok = getToken();
2949}
2950
2951void QRegExpEngine::parseFactor( Box *box )
2952{
2953#ifndef QT_NO_REGEXP_CAPTURE
2954 int atom = startAtom( yyMayCapture && yyTok == Tok_LeftParen );
2955#else
2956 static const int atom = 0;
2957#endif
2958
2959#ifndef QT_NO_REGEXP_INTERVAL
2960#define YYREDO() \
2961 yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
2962 *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
2963
2964 const QChar *in = yyIn;
2965 int pos0 = yyPos0;
2966 int pos = yyPos;
2967 int len = yyLen;
2968 int ch = yyCh;
2969 CharClass charClass;
2970 if ( yyTok == Tok_CharClass )
2971 charClass = *yyCharClass;
2972 int tok = yyTok;
2973 bool mayCapture = yyMayCapture;
2974#endif
2975
2976 parseAtom( box );
2977#ifndef QT_NO_REGEXP_CAPTURE
2978 finishAtom( atom );
2979#endif
2980
2981 if ( yyTok == Tok_Quantifier ) {
2982 if ( yyMaxRep == InftyRep ) {
2983 box->plus( atom );
2984#ifndef QT_NO_REGEXP_INTERVAL
2985 } else if ( yyMaxRep == 0 ) {
2986 box->clear();
2987#endif
2988 }
2989 if ( yyMinRep == 0 )
2990 box->opt();
2991
2992#ifndef QT_NO_REGEXP_INTERVAL
2993 yyMayCapture = FALSE;
2994 int alpha = ( yyMinRep == 0 ) ? 0 : yyMinRep - 1;
2995 int beta = ( yyMaxRep == InftyRep ) ? 0 : yyMaxRep - ( alpha + 1 );
2996
2997 Box rightBox( this );
2998 int i;
2999
3000 for ( i = 0; i < beta; i++ ) {
3001 YYREDO();
3002 Box leftBox( this );
3003 parseAtom( &leftBox );
3004 leftBox.cat( rightBox );
3005 leftBox.opt();
3006 rightBox = leftBox;
3007 }
3008 for ( i = 0; i < alpha; i++ ) {
3009 YYREDO();
3010 Box leftBox( this );
3011 parseAtom( &leftBox );
3012 leftBox.cat( rightBox );
3013 rightBox = leftBox;
3014 }
3015 rightBox.cat( *box );
3016 *box = rightBox;
3017#endif
3018 yyTok = getToken();
3019#ifndef QT_NO_REGEXP_INTERVAL
3020 yyMayCapture = mayCapture;
3021#endif
3022 }
3023#undef YYREDO
3024}
3025
3026void QRegExpEngine::parseTerm( Box *box )
3027{
3028#ifndef QT_NO_REGEXP_OPTIM
3029 if ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar )
3030 parseFactor( box );
3031#endif
3032 while ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) {
3033 Box rightBox( this );
3034 parseFactor( &rightBox );
3035 box->cat( rightBox );
3036 }
3037}
3038
3039void QRegExpEngine::parseExpression( Box *box )
3040{
3041 parseTerm( box );
3042 while ( yyTok == Tok_Bar ) {
3043 Box rightBox( this );
3044 yyTok = getToken();
3045 parseTerm( &rightBox );
3046 box->orx( rightBox );
3047 }
3048}
3049
3050/*
3051 The class QRegExpPrivate contains the private data of a regular expression
3052 other than the automaton. It makes it possible for many QRegExp objects to
3053 use the same QRegExpEngine object with different QRegExpPrivate objects.
3054*/
3055struct QRegExpPrivate
3056{
3057 QString pattern; // regular-expression or wildcard pattern
3058 QString rxpattern; // regular-expression pattern
3059#ifndef QT_NO_REGEXP_WILDCARD
3060 bool wc; // wildcard mode?
3061#endif
3062 bool min; // minimal matching? (instead of maximal)
3063#ifndef QT_NO_REGEXP_CAPTURE
3064 QString t; // last string passed to QRegExp::search() or searchRev()
3065 QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3066#endif
3067 QArray<int> captured; // what QRegExpEngine::search() returned last
3068
3069 QRegExpPrivate() { captured.fill( -1, 2 ); }
3070};
3071
3072#ifndef QT_NO_REGEXP_OPTIM
3073static QCache<QRegExpEngine> *engineCache = 0;
3074#endif
3075
3076static QRegExpEngine *newEngine( const QString& pattern, bool caseSensitive )
3077{
3078#ifndef QT_NO_REGEXP_OPTIM
3079 if ( engineCache != 0 ) {
3080 QRegExpEngine *eng = engineCache->take( pattern );
3081 if ( eng == 0 || eng->caseSensitive() != caseSensitive ) {
3082 delete eng;
3083 } else {
3084 eng->ref();
3085 return eng;
3086 }
3087 }
3088#endif
3089 return new QRegExpEngine( pattern, caseSensitive );
3090}
3091
3092static void derefEngine( QRegExpEngine *eng, const QString& pattern )
3093{
3094 if ( eng != 0 && eng->deref() ) {
3095#ifndef QT_NO_REGEXP_OPTIM
3096 if ( engineCache == 0 ) {
3097 engineCache = new QCache<QRegExpEngine>;
3098 engineCache->setAutoDelete( TRUE );
3099 }
3100 if ( !pattern.isNull() &&
3101 engineCache->insert(pattern, eng, 4 + pattern.length() / 4) )
3102 return;
3103#else
3104 Q_UNUSED( pattern );
3105#endif
3106 delete eng;
3107 }
3108}
3109
3110/*!
3111 Constructs an empty regexp.
3112
3113 \sa isValid()
3114*/
3115QRegExp3::QRegExp3()
3116{
3117 eng = new QRegExpEngine( TRUE );
3118 priv = new QRegExpPrivate;
3119 priv->pattern = QString::null;
3120#ifndef QT_NO_REGEXP_WILDCARD
3121 priv->wc = FALSE;
3122#endif
3123 priv->min = FALSE;
3124 compile( TRUE );
3125}
3126
3127/*!
3128 Constructs a regular expression object for the given \a pattern
3129 string. The pattern must be given using wildcard notation if \a
3130 wildcard is TRUE (default is FALSE). The pattern is case sensitive,
3131 unless \a caseSensitive is FALSE. Matching is greedy (maximal), but
3132 can be changed by calling setMinimal().
3133
3134 \sa setPattern() setCaseSensitive() setWildcard() setMinimal()
3135*/
3136QRegExp3::QRegExp3( const QString& pattern, bool caseSensitive, bool wildcard )
3137{
3138 eng = 0;
3139 priv = new QRegExpPrivate;
3140 priv->pattern = pattern;
3141#ifndef QT_NO_REGEXP_WILDCARD
3142 priv->wc = wildcard;
3143#endif
3144 priv->min = FALSE;
3145 compile( caseSensitive );
3146}
3147
3148/*!
3149 Constructs a regular expression as a copy of \a rx.
3150
3151 \sa operator=()
3152*/
3153QRegExp3::QRegExp3( const QRegExp3& rx )
3154{
3155 eng = 0;
3156 priv = new QRegExpPrivate;
3157 operator=( rx );
3158}
3159
3160/*!
3161 Destroys the regular expression and cleans up its internal data.
3162*/
3163QRegExp3::~QRegExp3()
3164{
3165 derefEngine( eng, priv->rxpattern );
3166 delete priv;
3167}
3168
3169/*!
3170 Copies the regular expression \a rx and returns a reference to the copy.
3171 The case sensitivity, wildcard and minimal matching options are copied as
3172 well.
3173*/
3174QRegExp3& QRegExp3::operator=( const QRegExp3& rx )
3175{
3176 rx.eng->ref();
3177 derefEngine( eng, priv->rxpattern );
3178 eng = rx.eng;
3179 priv->pattern = rx.priv->pattern;
3180 priv->rxpattern = rx.priv->rxpattern;
3181#ifndef QT_NO_REGEXP_WILDCARD
3182 priv->wc = rx.priv->wc;
3183#endif
3184 priv->min = rx.priv->min;
3185#ifndef QT_NO_REGEXP_CAPTURE
3186 priv->t = rx.priv->t;
3187 priv->capturedCache = rx.priv->capturedCache;
3188#endif
3189 priv->captured = rx.priv->captured;
3190 return *this;
3191}
3192
3193/*!
3194 Returns TRUE if this regular expression is equal to \a rx, otherwise
3195 returns FALSE.
3196
3197 Two QRegExp3 objects are equal if they have the same pattern strings
3198 and the same settings for case sensitivity, wildcard and minimal
3199 matching.
3200*/
3201bool QRegExp3::operator==( const QRegExp3& rx ) const
3202{
3203 return priv->pattern == rx.priv->pattern &&
3204 eng->caseSensitive() == rx.eng->caseSensitive() &&
3205#ifndef QT_NO_REGEXP_WILDCARD
3206 priv->wc == rx.priv->wc &&
3207#endif
3208 priv->min == rx.priv->min;
3209}
3210
3211/*! \fn bool QRegExp3::operator!=( const QRegExp& rx ) const
3212
3213 Returns TRUE if this regular expression is not equal to \a rx, otherwise
3214 FALSE.
3215
3216 \sa operator==()
3217*/
3218
3219/*!
3220 Returns TRUE if the pattern string is empty, otherwise FALSE.
3221
3222 If you call match() with an empty pattern on an empty string it will
3223 return TRUE otherwise it returns FALSE since match() operates over the
3224 whole string. If you call search() with an empty pattern on \e any
3225 string it will return the start position (0 by default) since it will
3226 match at the start position, because the empty pattern matches the
3227 'emptiness' at the start of the string, and the length of the match
3228 returned by matchedLength() will be 0.
3229
3230 See QString::isEmpty().
3231*/
3232
3233bool QRegExp3::isEmpty() const
3234{
3235 return priv->pattern.isEmpty();
3236}
3237
3238/*!
3239 Returns TRUE if the regular expression is valid, or FALSE if it's invalid. An
3240 invalid regular expression never matches.
3241
3242 The pattern <b>[a-z</b> is an example of an invalid pattern, since it lacks
3243 a closing square bracket.
3244
3245 Note that the validity of a regexp may also depend on the setting of
3246 the wildcard flag, for example <b>*.html</b> is a valid wildcard
3247 regexp but an invalid full regexp.
3248*/
3249bool QRegExp3::isValid() const
3250{
3251 return eng->isValid();
3252}
3253
3254/*!
3255 Returns the pattern string of the regular expression. The pattern has either
3256 regular expression syntax or wildcard syntax, depending on wildcard().
3257
3258 \sa setPattern()
3259*/
3260QString QRegExp3::pattern() const
3261{
3262 return priv->pattern;
3263}
3264
3265/*!
3266 Sets the pattern string to \a pattern and returns a reference to this regular
3267 expression. The case sensitivity, wildcard and minimal matching options are
3268 not changed.
3269
3270 \sa pattern()
3271*/
3272void QRegExp3::setPattern( const QString& pattern )
3273{
3274 if ( priv->pattern != pattern ) {
3275 priv->pattern = pattern;
3276 compile( caseSensitive() );
3277 }
3278}
3279
3280/*!
3281 Returns TRUE if case sensitivity is enabled, otherwise FALSE. The default is
3282 TRUE.
3283
3284 \sa setCaseSensitive()
3285*/
3286bool QRegExp3::caseSensitive() const
3287{
3288 return eng->caseSensitive();
3289}
3290
3291/*!
3292 Sets case sensitive matching to \a sensitive.
3293
3294 If \a sensitive is TRUE, <b>\\</b><b>.txt$</b> matches
3295 <tt>readme.txt</tt> but not <tt>README.TXT</tt>.
3296
3297 \sa caseSensitive()
3298*/
3299void QRegExp3::setCaseSensitive( bool sensitive )
3300{
3301 if ( sensitive != eng->caseSensitive() )
3302 compile( sensitive );
3303}
3304
3305#ifndef QT_NO_REGEXP_WILDCARD
3306/*!
3307 Returns TRUE if wildcard mode is enabled, otherwise FALSE. The default is
3308 FALSE.
3309
3310 \sa setWildcard()
3311*/
3312bool QRegExp3::wildcard() const
3313{
3314 return priv->wc;
3315}
3316
3317/*! Sets the wildcard mode for the regular expression. The default is FALSE.
3318
3319 Setting \a wildcard to TRUE enables simple shell-like wildcard
3320 matching.
3321 (See <a href="#wildcard-matching">wildcard matching (globbing)</a>.)
3322
3323 For example, <b>r*.txt</b> matches the string <tt>readme.txt</tt> in wildcard
3324 mode, but does not match <tt>readme</tt>.
3325
3326 \sa wildcard()
3327*/
3328void QRegExp3::setWildcard( bool wildcard )
3329{
3330 if ( wildcard != priv->wc ) {
3331 priv->wc = wildcard;
3332 compile( caseSensitive() );
3333 }
3334}
3335#endif
3336
3337/*! Returns TRUE if minimal (non-greedy) matching is enabled, otherwise
3338 returns FALSE.
3339
3340 \sa setMinimal()
3341*/
3342bool QRegExp3::minimal() const
3343{
3344 return priv->min;
3345}
3346
3347/*!
3348 Enables or disables minimal matching. If \a minimal is FALSE, matching is
3349 greedy (maximal) which is the default.
3350
3351 For example, suppose we have the input string "We must be \<b>bold\</b>,
3352 very \<b>bold\</b>!" and the pattern <b>\<b>.*\</b></b>. With
3353 the default greedy (maximal) matching, the match is
3354 "We must be <u>\<b>bold\</b>, very \<b>bold\</b></u>!". But with
3355 minimal (non-greedy) matching the first match is:
3356 "We must be <u>\<b>bold\</b></u>, very \<b>bold\</b>!" and the
3357 second match is
3358 "We must be \<b>bold\</b>, very <u>\<b>bold\</b></u>!".
3359 In practice we might use the pattern <b>\<b>[^\<]+\</b></b>,
3360 although this will still fail for nested tags.
3361
3362 \sa minimal()
3363*/
3364void QRegExp3::setMinimal( bool minimal )
3365{
3366 priv->min = minimal;
3367}
3368
3369/*!
3370 Returns TRUE if \a str is matched exactly by this regular expression
3371 otherwise it returns FALSE. You can determine how much of the string was
3372 matched by calling matchedLength().
3373
3374 For a given regexp string, R, <tt>match("R")</tt> is the equivalent
3375 of <tt>search("^R$")</tt> since match() effectively encloses the
3376 regexp in the start of string and end of string anchors.
3377
3378 For example, if the regular expression is <b>blue</b>, then match()
3379 returns TRUE only for input <tt>blue</tt>. For inputs
3380 <tt>bluebell</tt>, <tt>blutak</tt> and <tt>lightblue</tt>, match()
3381 returns FALSE and matchedLength() will return 4, 3 and 0 respectively.
3382
3383 \sa search() searchRev() QRegExpValidator
3384*/
3385bool QRegExp3::exactMatch( const QString& str )
3386{
3387#ifndef QT_NO_REGEXP_CAPTURE
3388 priv->t = str;
3389 priv->capturedCache.clear();
3390#endif
3391
3392 priv->captured = eng->match( str, 0, priv->min, TRUE );
3393 if ( priv->captured[1] == (int) str.length() ) {
3394 return TRUE;
3395 } else {
3396 priv->captured.detach();
3397 priv->captured[0] = 0;
3398 priv->captured[1] = eng->matchedLength();
3399 return FALSE;
3400 }
3401}
3402
3403/*! \overload
3404
3405 This version does not set matchedLength(), capturedTexts() and friends.
3406*/
3407bool QRegExp3::exactMatch( const QString& str ) const
3408{
3409 return eng->match(str, 0, priv->min, TRUE)[0] == 0 &&
3410 eng->matchedLength() == (int) str.length();
3411}
3412
3413/*! \obsolete
3414
3415 Attempts to match in \a str, starting from position \a index. Returns the
3416 position of the match, or -1 if there was no match.
3417
3418 The length of the match is stored in \a *len, unless \a len is a null pointer.
3419
3420 If \a indexIsStart is TRUE (the default), the position \a index in the string
3421 will match the start of string anchor, <b>^</b>, in the regexp, if present.
3422 Otherwise, position 0 in \a str will match.
3423
3424 Use search() and matchedLength() instead of this function.
3425
3426 If you really need the \a indexIsStart functionality, try this:
3427
3428 \code
3429 QRegExp3 rx( "some pattern" );
3430 int pos = rx.search( str.mid( index ) );
3431 if ( pos != -1 )
3432 pos += index;
3433 int len = rx.matchedLength();
3434 \endcode
3435*/
3436#ifndef QT_NO_COMPAT
3437int QRegExp3::match( const QString& str, int index, int *len,
3438 bool indexIsStart )
3439{
3440 int pos;
3441 if ( indexIsStart ) {
3442 pos = search( str.mid(index) );
3443 if ( pos >= 0 ) {
3444 pos += index;
3445 if ( len != 0 )
3446 *len = matchedLength();
3447 } else {
3448 if ( len != 0 )
3449 *len = 0;
3450 }
3451 } else {
3452 pos = search( str, index );
3453 if ( len != 0 )
3454 *len = matchedLength();
3455 }
3456 return pos;
3457}
3458#endif
3459
3460/*!
3461 Attempts to find a match in \a str from position \a start (0 by default). If
3462 \a start is -1, the search starts at the last character; if -2, at the next to
3463 last character; etc.
3464
3465 Returns the position of the first match, or -1 if there was no match.
3466
3467 You might prefer to use QString::find(), QString::contains() or even
3468 QStringList::grep(). To replace matches use QString::replace().
3469
3470 Example:
3471 \code
3472 QString str = "offsets: 1.23 .50 71.00 6.00";
3473 QRegExp3 rx( "\\d*\\.\\d+" ); // very simple floating point matching
3474 int count = 0;
3475 int pos = 0;
3476 while ( pos >= 0 ) {
3477 pos = rx.search( str, pos );
3478 count++;
3479 }
3480 // pos will be 9, 14, 18 and finally 24; count will end up as 4.
3481 \endcode
3482
3483 \sa searchRev() match() matchedLength() capturedTexts()
3484*/
3485// QChar versions
3486
3487#ifdef QCHAR_SUPPORT
3488const QString makeString(const QChar *str)
3489{
3490// A sentinel value checked in case the QChar *ptr is never null terminated
3491 const uint MAXLENGTH=65535;
3492
3493 const QChar *s=str;
3494 uint i=0;
3495 while(i < MAXLENGTH && *s != QChar::null) { i++;s++ ;}
3496 return QString(str,i);
3497
3498}
3499int QRegExp3::search(const QChar *str,int start)
3500{
3501 return search(makeString(str),start);
3502}
3503int QRegExp3::search(const QChar *str,int start) const
3504{
3505 return search(makeString(str),start);
3506}
3507int QRegExp3::searchRev(const QChar *str,int start)
3508{
3509 return searchRev(makeString(str),start);
3510}
3511int QRegExp3::searchRev(const QChar *str,int start) const
3512{
3513 return searchRev(makeString(str),start);
3514}
3515bool QRegExp3::exactMatch(const QChar *str)
3516{
3517 return exactMatch(makeString(str));
3518}
3519bool QRegExp3::exactMatch(const QChar *str) const
3520{
3521 return exactMatch(makeString(str));
3522}
3523#endif // QCHAR_SUPPORT
3524
3525int QRegExp3::search( const QString& str, int start )
3526{
3527 if ( start < 0 )
3528 start += str.length();
3529#ifndef QT_NO_REGEXP_CAPTURE
3530 priv->t = str;
3531 priv->capturedCache.clear();
3532#endif
3533 priv->captured = eng->match( str, start, priv->min, FALSE );
3534 return priv->captured[0];
3535}
3536
3537/*! \overload
3538
3539 This version does not set matchedLength(), capturedTexts() and friends.
3540*/
3541int QRegExp3::search( const QString& str, int start ) const
3542{
3543 if ( start < 0 )
3544 start += str.length();
3545 return eng->match( str, start, priv->min, FALSE )[0];
3546}
3547
3548/*!
3549 Attempts to find a match backwards in \a str from position \a start. If
3550 \a start is -1 (the default), the search starts at the last character; if -2,
3551 at the next to last character; etc.
3552
3553 Returns the position of the first match, or -1 if there was no match.
3554
3555 You might prefer to use QString::findRev().
3556
3557 \sa search() matchedLength() capturedTexts()
3558*/
3559int QRegExp3::searchRev( const QString& str, int start )
3560{
3561 if ( start < 0 )
3562 start += str.length();
3563#ifndef QT_NO_REGEXP_CAPTURE
3564 priv->t = str;
3565 priv->capturedCache.clear();
3566#endif
3567 if ( start < 0 || start > (int) str.length() ) {
3568 priv->captured.detach();
3569 priv->captured.fill( -1 );
3570 return -1;
3571 }
3572
3573 while ( start >= 0 ) {
3574 priv->captured = eng->match( str, start, priv->min, TRUE );
3575 if ( priv->captured[0] == start )
3576 return start;
3577 start--;
3578 }
3579 return -1;
3580}
3581
3582/*! \overload
3583
3584 This version does not set matchedLength(), capturedText() and friends.
3585*/
3586int QRegExp3::searchRev( const QString& str, int start ) const
3587{
3588 if ( start < 0 )
3589 start += str.length();
3590 if ( start < 0 || start > (int) str.length() )
3591 return -1;
3592
3593 while ( start >= 0 ) {
3594 if ( eng->match(str, start, priv->min, TRUE)[0] == start )
3595 return start;
3596 start--;
3597 }
3598 return -1;
3599}
3600
3601/*!
3602 Returns the length of the last matched string, or -1 if there was no match.
3603
3604 \sa match() search()
3605*/
3606int QRegExp3::matchedLength()
3607{
3608 return priv->captured[1];
3609}
3610
3611#ifndef QT_NO_REGEXP_CAPTURE
3612/*!
3613 Returns a list of the captured text strings.
3614
3615 The first string in the list is the entire matched string. Each
3616 subsequent list element contains a string that matched a
3617 (capturing) subexpression of the regexp.
3618
3619 For example:
3620 \code
3621 QRegExp3 rx( "(\\d+)(\\s*)(cm|inch(es)?)" );
3622 int pos = rx.search( "Length: 36 inches" );
3623 QStringList list = rx.capturedTexts();
3624 // list is now ( "36 inches", "36", " ", "inches", "es" ).
3625 \endcode
3626
3627 The above example also captures elements
3628 that may be present but which we have no interest in. This problem
3629 can be solved by using non-capturing parenthesis:
3630
3631 \code
3632 QRegExp3 rx( "(\\d+)(?:\\s*)(cm|inch(?:es)?)" );
3633 int pos = rx.search( "Length: 36 inches" );
3634 QStringList list = rx.capturedTexts();
3635 // list is now ( "36 inches", "36", "inches" ).
3636 \endcode
3637
3638 Some regexps can match an indeterminate number of times. For example
3639 if the input string is "Offsets: 12 14 99 231 7" and the regexp,
3640 <tt>rx</tt>, is <b>(</b><b>\\</b><b>d+)+</b>, we would hope to get a
3641 list of all the numbers matched. However, after calling
3642 <tt>rx.search(str)</tt>, capturedTexts() will return the list ( "12",
3643 "12" ), i.e. the entire match was "12" and the first subexpression
3644 matched was "12". The correct approach is to use cap() in a
3645 <a href="#cap_in_a_loop">loop</a>.
3646
3647 The order of elements in the string list is as follows. The first
3648 element is the entire matching string. Each subsequent element
3649 corresponds to the next capturing open left parenthesis. Thus
3650 capturedTexts()[1] is the text of the first capturing parenthesis,
3651 capturedTexts()[2] is the text of the second and so on (corresponding
3652 to $1, $2 etc. in some other regexp languages).
3653
3654 \sa cap() pos()
3655*/
3656QStringList QRegExp3::capturedTexts()
3657{
3658 if ( priv->capturedCache.isEmpty() ) {
3659 for ( int i = 0; i < (int) priv->captured.size(); i += 2 ) {
3660 QString m;
3661 if ( priv->captured[i + 1] == 0 )
3662 m = QString::fromLatin1( "" );
3663 else if ( priv->captured[i] >= 0 )
3664 m = priv->t.mid( priv->captured[i],
3665 priv->captured[i + 1] );
3666 priv->capturedCache.append( m );
3667 }
3668 priv->t = QString::null;
3669 }
3670 return priv->capturedCache;
3671}
3672
3673/*! Returns the text captured by the \a nth subexpression. The entire match
3674 has index 0 and the parenthesised subexpressions have indices starting
3675 from 1 (excluding non-capturing parenthesis).
3676
3677 \code
3678 QRegExp3 rxlen( "(\\d+)(?:\\s*)(cm|inch)" );
3679 int pos = rxlen.search( "Length: 189cm" );
3680 if ( pos > -1 ) {
3681 QString value = rxlen.cap( 1 );// "189"
3682 QString unit = rxlen.cap( 2 ); // "cm"
3683 // ...
3684 }
3685 \endcode
3686
3687 <a name="cap_in_a_loop">
3688 Some patterns may lead to a number of matches which cannot be
3689 determined in advance, for example:</a>
3690
3691 \code
3692 QRegExp3 rx( "(\\d+)" );
3693 str = "Offsets: 12 14 99 231 7";
3694 QStringList list;
3695 pos = 0;
3696 while ( pos >= 0 ) {
3697 pos = rx.search( str, pos );
3698 if ( pos > -1 ) {
3699 list += rx.cap( 1 );
3700 pos += rx.matchedLength();
3701 }
3702 }
3703 // list contains: ( "12", "14", "99", "231", "7" ).
3704 \endcode
3705
3706 The order of elements matched by cap() is as follows. The first
3707 element, cap( 0 ), is the entire matching string. Each subsequent
3708 element corresponds to the next capturing open left parenthesis. Thus
3709 cap( 1 ) is the text of the first capturing parenthesis, cap( 2 ) is
3710 the text of the second and so on.
3711
3712 \sa search() pos() capturedTexts()
3713*/
3714QString QRegExp3::cap( int nth )
3715{
3716 if ( nth < 0 || nth >= (int) priv->captured.size() / 2 )
3717 return QString::null;
3718 else
3719 return capturedTexts()[nth];
3720}
3721
3722/*! Returns the position of the \a nth captured text in the
3723 searched string. If \a nth is 0 (the default), pos() returns the
3724 position of the whole match.
3725
3726 Example:
3727 \code
3728 QRegExp3 rx( "/([a-z]+)/([a-z]+)" );
3729 rx.search( "Output /dev/null" );// Returns 7 (position of /dev/null)
3730 rx.pos( 0 ); // Returns 7 (position of /dev/null)
3731 rx.pos( 1 ); // Returns 8 (position of dev)
3732 rx.pos( 2 ); // Returns 12 (position of null)
3733 \endcode
3734
3735 Note that pos() returns -1 for zero-length matches. (For example, if
3736 cap(4) would return an empty string, pos(4) returns -1.) This is due
3737 to an implementation tradeoff.
3738
3739 \sa capturedTexts() cap()
3740*/
3741int QRegExp3::pos( int nth )
3742{
3743 if ( nth < 0 || nth >= (int) priv->captured.size() / 2 )
3744 return -1;
3745 else
3746 return priv->captured[2 * nth];
3747}
3748#endif
3749
3750void QRegExp3::compile( bool caseSensitive )
3751{
3752 derefEngine( eng, priv->rxpattern );
3753#ifndef QT_NO_REGEXP_WILDCARD
3754 if ( priv->wc )
3755 priv->rxpattern = wc2rx( priv->pattern );
3756 else
3757#endif
3758 priv->rxpattern = priv->pattern.isNull() ? QString::fromLatin1( "" )
3759 : priv->pattern;
3760 eng = newEngine( priv->rxpattern, caseSensitive );
3761#ifndef QT_NO_REGEXP_CAPTURE
3762 priv->t = QString::null;
3763 priv->capturedCache.clear();
3764#endif
3765 priv->captured.detach();
3766 priv->captured.fill( -1, 2 + 2 * eng->numCaptures() );
3767}
diff --git a/noncore/apps/tinykate/libkate/qt3back/qregexp3.h b/noncore/apps/tinykate/libkate/qt3back/qregexp3.h
new file mode 100644
index 0000000..5b75131
--- a/dev/null
+++ b/noncore/apps/tinykate/libkate/qt3back/qregexp3.h
@@ -0,0 +1,111 @@
1/****************************************************************************
2** $Id$
3**
4** Definition of QRegExp class
5**
6** Created : 950126
7**
8** Copyright (C) 1992-2000 Trolltech AS. All rights reserved.
9**
10** This file is part of the tools module of the Qt GUI Toolkit.
11**
12** This file may be distributed under the terms of the Q Public License
13** as defined by Trolltech AS of Norway and appearing in the file
14** LICENSE.QPL included in the packaging of this file.
15**
16** This file may be distributed and/or modified under the terms of the
17** GNU General Public License version 2 as published by the Free Software
18** Foundation and appearing in the file LICENSE.GPL included in the
19** packaging of this file.
20**
21** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
22** licenses may use this file in accordance with the Qt Commercial License
23** Agreement provided with the Software.
24**
25** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
26** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
27**
28** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
29** information about Qt Commercial License Agreements.
30** See http://www.trolltech.com/qpl/ for QPL licensing information.
31** See http://www.trolltech.com/gpl/ for GPL licensing information.
32**
33** Contact info@trolltech.com if any conditions of this licensing are
34** not clear to you.
35**
36**********************************************************************/
37
38#ifndef QREGEXP3_H
39#define QREGEXP3_H
40#ifndef QT_H
41#include "qstringlist.h"
42#endif // QT_H
43
44
45#if QT_VERSION >=300
46#include <qregexp.h>
47#else
48class QRegExpEngine;
49struct QRegExpPrivate;
50
51class Q_EXPORT QRegExp3
52{
53public:
54 QRegExp3();
55 QRegExp3( const QString& pattern, bool caseSensitive = TRUE,
56 bool wildcard = FALSE );
57 QRegExp3( const QRegExp3& rx );
58 ~QRegExp3();
59 QRegExp3& operator=( const QRegExp3& rx );
60
61 bool operator==( const QRegExp3& rx ) const;
62 bool operator!=( const QRegExp3& rx ) const { return !operator==( rx ); }
63
64 bool isEmpty() const;
65 bool isValid() const;
66 QString pattern() const;
67 void setPattern( const QString& pattern );
68 bool caseSensitive() const;
69 void setCaseSensitive( bool sensitive );
70#ifndef QT_NO_REGEXP_WILDCARD
71 bool wildcard() const;
72 void setWildcard( bool wildcard );
73#endif
74 bool minimal() const;
75 void setMinimal( bool minimal );
76
77 bool exactMatch( const QString& str );
78 bool exactMatch( const QString& str ) const;
79#ifndef QT_NO_COMPAT
80 int match( const QString& str, int index, int *len = 0,
81 bool indexIsStart = TRUE );
82#endif
83 int search( const QString& str, int start = 0 );
84 int search( const QString& str, int start = 0 ) const;
85// QChar versions
86#ifdef QCHAR_SUPPORT
87 int search(const QChar *str,int start=0);
88 int search(const QChar *str,int start=0) const;
89 int searchRev(const QChar *str,int start=-1);
90 int searchRev(const QChar *str,int start=-1) const ;
91 bool exactMatch(const QChar *str);
92 bool exactMatch(const QChar *str) const;
93 // end QChar versions
94#endif
95 int searchRev( const QString& str, int start = -1 );
96 int searchRev( const QString& str, int start = -1 ) const;
97 int matchedLength();
98#ifndef QT_NO_REGEXP_CAPTURE
99 QStringList capturedTexts();
100 QString cap( int nth = 0 );
101 int pos( int nth = 0 );
102#endif
103
104private:
105 void compile( bool caseSensitive );
106
107 QRegExpEngine *eng;
108 QRegExpPrivate *priv;
109};
110#endif // QT_VERSION >= 300
111#endif // QREGEXP_H