summaryrefslogtreecommitdiff
path: root/qmake/tools/qregexp.cpp
Unidiff
Diffstat (limited to 'qmake/tools/qregexp.cpp') (more/less context) (ignore whitespace changes)
-rw-r--r--qmake/tools/qregexp.cpp3935
1 files changed, 3935 insertions, 0 deletions
diff --git a/qmake/tools/qregexp.cpp b/qmake/tools/qregexp.cpp
new file mode 100644
index 0000000..500efed
--- a/dev/null
+++ b/qmake/tools/qregexp.cpp
@@ -0,0 +1,3935 @@
1/****************************************************************************
2** $Id$
3**
4** Implementation of QRegExp class
5**
6** Created : 950126
7**
8** Copyright (C) 1992-2000 Trolltech AS. All rights reserved.
9**
10** This file is part of the tools module of the Qt GUI Toolkit.
11**
12** This file may be distributed under the terms of the Q Public License
13** as defined by Trolltech AS of Norway and appearing in the file
14** LICENSE.QPL included in the packaging of this file.
15**
16** This file may be distributed and/or modified under the terms of the
17** GNU General Public License version 2 as published by the Free Software
18** Foundation and appearing in the file LICENSE.GPL included in the
19** packaging of this file.
20**
21** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
22** licenses may use this file in accordance with the Qt Commercial License
23** Agreement provided with the Software.
24**
25** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
26** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
27**
28** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
29** information about Qt Commercial License Agreements.
30** See http://www.trolltech.com/qpl/ for QPL licensing information.
31** See http://www.trolltech.com/gpl/ for GPL licensing information.
32**
33** Contact info@trolltech.com if any conditions of this licensing are
34** not clear to you.
35**
36**********************************************************************/
37
38#include "qregexp.h"
39
40#ifndef QT_NO_REGEXP
41
42#include "qmemarray.h"
43#include "qbitarray.h"
44#include "qcache.h"
45#include "qcleanuphandler.h"
46#include "qintdict.h"
47#include "qmap.h"
48#include "qptrvector.h"
49#include "qstring.h"
50#include "qtl.h"
51
52#ifdef QT_THREAD_SUPPORT
53#include "qmutexpool_p.h"
54#endif // QT_THREAD_SUPPORT
55
56#undef QT_TRANSLATE_NOOP
57#define QT_TRANSLATE_NOOP( context, sourceText ) sourceText
58
59#include <limits.h>
60
61// error strings for the regexp parser
62#define RXERR_OK QT_TRANSLATE_NOOP( "QRegExp", "no error occurred" )
63#define RXERR_DISABLED QT_TRANSLATE_NOOP( "QRegExp", "disabled feature used" )
64#define RXERR_CHARCLASS QT_TRANSLATE_NOOP( "QRegExp", "bad char class syntax" )
65#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP( "QRegExp", "bad lookahead syntax" )
66#define RXERR_REPETITION QT_TRANSLATE_NOOP( "QRegExp", "bad repetition syntax" )
67#define RXERR_OCTAL QT_TRANSLATE_NOOP( "QRegExp", "invalid octal value" )
68#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP( "QRegExp", "missing left delim" )
69#define RXERR_END QT_TRANSLATE_NOOP( "QRegExp", "unexpected end" )
70#define RXERR_LIMIT QT_TRANSLATE_NOOP( "QRegExp", "met internal limit" )
71
72/*
73 WARNING! Be sure to read qregexp.tex before modifying this file.
74*/
75
76/*!
77 \class QRegExp qregexp.h
78 \reentrant
79 \brief The QRegExp class provides pattern matching using regular expressions.
80
81 \ingroup tools
82 \ingroup misc
83 \ingroup shared
84 \mainclass
85 \keyword regular expression
86
87 Regular expressions, or "regexps", provide a way to find patterns
88 within text. This is useful in many contexts, for example:
89
90 \table
91 \row \i Validation
92 \i A regexp can be used to check whether a piece of text
93 meets some criteria, e.g. is an integer or contains no
94 whitespace.
95 \row \i Searching
96 \i Regexps provide a much more powerful means of searching
97 text than simple string matching does. For example we can
98 create a regexp which says "find one of the words 'mail',
99 'letter' or 'correspondence' but not any of the words
100 'email', 'mailman' 'mailer', 'letterbox' etc."
101 \row \i Search and Replace
102 \i A regexp can be used to replace a pattern with a piece of
103 text, for example replace all occurrences of '&' with
104 '\&amp;' except where the '&' is already followed by 'amp;'.
105 \row \i String Splitting
106 \i A regexp can be used to identify where a string should be
107 split into its component fields, e.g. splitting tab-delimited
108 strings.
109 \endtable
110
111 We present a very brief introduction to regexps, a description of
112 Qt's regexp language, some code examples, and finally the function
113 documentation itself. QRegExp is modeled on Perl's regexp
114 language, and also fully supports Unicode. QRegExp can also be
115 used in the weaker 'wildcard' (globbing) mode which works in a
116 similar way to command shells. A good text on regexps is \e
117 {Mastering Regular Expressions: Powerful Techniques for Perl and
118 Other Tools} by Jeffrey E. Friedl, ISBN 1565922573.
119
120 Experienced regexp users may prefer to skip the introduction and
121 go directly to the relevant information.
122
123 \tableofcontents
124
125 \section1 Introduction
126
127 Regexps are built up from expressions, quantifiers, and assertions.
128 The simplest form of expression is simply a character, e.g.
129 <b>x</b> or <b>5</b>. An expression can also be a set of
130 characters. For example, <b>[ABCD]</b>, will match an <b>A</b> or
131 a <b>B</b> or a <b>C</b> or a <b>D</b>. As a shorthand we could
132 write this as <b>[A-D]</b>. If we want to match any of the
133 captital letters in the English alphabet we can write
134 <b>[A-Z]</b>. A quantifier tells the regexp engine how many
135 occurrences of the expression we want, e.g. <b>x{1,1}</b> means
136 match an <b>x</b> which occurs at least once and at most once.
137 We'll look at assertions and more complex expressions later.
138
139 Note that in general regexps cannot be used to check for balanced
140 brackets or tags. For example if you want to match an opening html
141 \c <b> and its closing \c </b> you can only use a regexp if you
142 know that these tags are not nested; the html fragment, \c{<b>bold
143 <b>bolder</b></b>} will not match as expected. If you know the
144 maximum level of nesting it is possible to create a regexp that
145 will match correctly, but for an unknown level of nesting, regexps
146 will fail.
147
148 We'll start by writing a regexp to match integers in the range 0
149 to 99. We will require at least one digit so we will start with
150 <b>[0-9]{1,1}</b> which means match a digit exactly once. This
151 regexp alone will match integers in the range 0 to 9. To match one
152 or two digits we can increase the maximum number of occurrences so
153 the regexp becomes <b>[0-9]{1,2}</b> meaning match a digit at
154 least once and at most twice. However, this regexp as it stands
155 will not match correctly. This regexp will match one or two digits
156 \e within a string. To ensure that we match against the whole
157 string we must use the anchor assertions. We need <b>^</b> (caret)
158 which when it is the first character in the regexp means that the
159 regexp must match from the beginning of the string. And we also
160 need <b>$</b> (dollar) which when it is the last character in the
161 regexp means that the regexp must match until the end of the
162 string. So now our regexp is <b>^[0-9]{1,2}$</b>. Note that
163 assertions, such as <b>^</b> and <b>$</b>, do not match any
164 characters.
165
166 If you've seen regexps elsewhere they may have looked different from
167 the ones above. This is because some sets of characters and some
168 quantifiers are so common that they have special symbols to
169 represent them. <b>[0-9]</b> can be replaced with the symbol
170 <b>\d</b>. The quantifier to match exactly one occurrence,
171 <b>{1,1}</b>, can be replaced with the expression itself. This means
172 that <b>x{1,1}</b> is exactly the same as <b>x</b> alone. So our 0
173 to 99 matcher could be written <b>^\d{1,2}$</b>. Another way of
174 writing it would be <b>^\d\d{0,1}$</b>, i.e. from the start of the
175 string match a digit followed by zero or one digits. In practice
176 most people would write it <b>^\d\d?$</b>. The <b>?</b> is a
177 shorthand for the quantifier <b>{0,1}</b>, i.e. a minimum of no
178 occurrences a maximum of one occurrence. This is used to make an
179 expression optional. The regexp <b>^\d\d?$</b> means "from the
180 beginning of the string match one digit followed by zero or one
181 digits and then the end of the string".
182
183 Our second example is matching the words 'mail', 'letter' or
184 'correspondence' but without matching 'email', 'mailman',
185 'mailer', 'letterbox' etc. We'll start by just matching 'mail'. In
186 full the regexp is, <b>m{1,1}a{1,1}i{1,1}l{1,1}</b>, but since
187 each expression itself is automatically quantified by <b>{1,1}</b>
188 we can simply write this as <b>mail</b>; an 'm' followed by an 'a'
189 followed by an 'i' followed by an 'l'. The symbol '|' (bar) is
190 used for \e alternation, so our regexp now becomes
191 <b>mail|letter|correspondence</b> which means match 'mail' \e or
192 'letter' \e or 'correspondence'. Whilst this regexp will find the
193 words we want it will also find words we don't want such as
194 'email'. We will start by putting our regexp in parentheses,
195 <b>(mail|letter|correspondence)</b>. Parentheses have two effects,
196 firstly they group expressions together and secondly they identify
197 parts of the regexp that we wish to \link #capturing-text capture
198 \endlink. Our regexp still matches any of the three words but now
199 they are grouped together as a unit. This is useful for building
200 up more complex regexps. It is also useful because it allows us to
201 examine which of the words actually matched. We need to use
202 another assertion, this time <b>\b</b> "word boundary":
203 <b>\b(mail|letter|correspondence)\b</b>. This regexp means "match
204 a word boundary followed by the expression in parentheses followed
205 by another word boundary". The <b>\b</b> assertion matches at a \e
206 position in the regexp not a \e character in the regexp. A word
207 boundary is any non-word character such as a space a newline or
208 the beginning or end of the string.
209
210 For our third example we want to replace ampersands with the HTML
211 entity '\&amp;'. The regexp to match is simple: <b>\&</b>, i.e.
212 match one ampersand. Unfortunately this will mess up our text if
213 some of the ampersands have already been turned into HTML
214 entities. So what we really want to say is replace an ampersand
215 providing it is not followed by 'amp;'. For this we need the
216 negative lookahead assertion and our regexp becomes:
217 <b>\&(?!amp;)</b>. The negative lookahead assertion is introduced
218 with '(?!' and finishes at the ')'. It means that the text it
219 contains, 'amp;' in our example, must \e not follow the expression
220 that preceeds it.
221
222 Regexps provide a rich language that can be used in a variety of
223 ways. For example suppose we want to count all the occurrences of
224 'Eric' and 'Eirik' in a string. Two valid regexps to match these
225 are <b>\\b(Eric|Eirik)\\b</b> and <b>\\bEi?ri[ck]\\b</b>. We need
226 the word boundary '\b' so we don't get 'Ericsson' etc. The second
227 regexp actually matches more than we want, 'Eric', 'Erik', 'Eiric'
228 and 'Eirik'.
229
230 We will implement some the examples above in the
231 \link #code-examples code examples \endlink section.
232
233 \target characters-and-abbreviations-for-sets-of-characters
234 \section1 Characters and Abbreviations for Sets of Characters
235
236 \table
237 \header \i Element \i Meaning
238 \row \i <b>c</b>
239 \i Any character represents itself unless it has a special
240 regexp meaning. Thus <b>c</b> matches the character \e c.
241 \row \i <b>\\c</b>
242 \i A character that follows a backslash matches the character
243 itself except where mentioned below. For example if you
244 wished to match a literal caret at the beginning of a string
245 you would write <b>\^</b>.
246 \row \i <b>\\a</b>
247 \i This matches the ASCII bell character (BEL, 0x07).
248 \row \i <b>\\f</b>
249 \i This matches the ASCII form feed character (FF, 0x0C).
250 \row \i <b>\\n</b>
251 \i This matches the ASCII line feed character (LF, 0x0A, Unix newline).
252 \row \i <b>\\r</b>
253 \i This matches the ASCII carriage return character (CR, 0x0D).
254 \row \i <b>\\t</b>
255 \i This matches the ASCII horizontal tab character (HT, 0x09).
256 \row \i <b>\\v</b>
257 \i This matches the ASCII vertical tab character (VT, 0x0B).
258 \row \i <b>\\xhhhh</b>
259 \i This matches the Unicode character corresponding to the
260 hexadecimal number hhhh (between 0x0000 and 0xFFFF). \0ooo
261 (i.e., \zero ooo) matches the ASCII/Latin-1 character
262 corresponding to the octal number ooo (between 0 and 0377).
263 \row \i <b>. (dot)</b>
264 \i This matches any character (including newline).
265 \row \i <b>\\d</b>
266 \i This matches a digit (see QChar::isDigit()).
267 \row \i <b>\\D</b>
268 \i This matches a non-digit.
269 \row \i <b>\\s</b>
270 \i This matches a whitespace (see QChar::isSpace()).
271 \row \i <b>\\S</b>
272 \i This matches a non-whitespace.
273 \row \i <b>\\w</b>
274 \i This matches a word character (see QChar::isLetterOrNumber()).
275 \row \i <b>\\W</b>
276 \i This matches a non-word character.
277 \row \i <b>\\n</b>
278 \i The n-th \link #capturing-text backreference \endlink,
279 e.g. \1, \2, etc.
280 \endtable
281
282 \e {Note that the C++ compiler transforms backslashes in strings
283 so to include a <b>\\</b> in a regexp you will need to enter it
284 twice, i.e. <b>\\\\</b>.}
285
286 \target sets-of-characters
287 \section1 Sets of Characters
288
289 Square brackets are used to match any character in the set of
290 characters contained within the square brackets. All the character
291 set abbreviations described above can be used within square
292 brackets. Apart from the character set abbreviations and the
293 following two exceptions no characters have special meanings in
294 square brackets.
295
296 \table
297 \row \i <b>^</b>
298 \i The caret negates the character set if it occurs as the
299 first character, i.e. immediately after the opening square
300 bracket. For example, <b>[abc]</b> matches 'a' or 'b' or 'c',
301 but <b>[^abc]</b> matches anything \e except 'a' or 'b' or
302 'c'.
303 \row \i <b>-</b>
304 \i The dash is used to indicate a range of characters, for
305 example <b>[W-Z]</b> matches 'W' or 'X' or 'Y' or 'Z'.
306 \endtable
307
308 Using the predefined character set abbreviations is more portable
309 than using character ranges across platforms and languages. For
310 example, <b>[0-9]</b> matches a digit in Western alphabets but
311 <b>\d</b> matches a digit in \e any alphabet.
312
313 Note that in most regexp literature sets of characters are called
314 "character classes".
315
316 \target quantifiers
317 \section1 Quantifiers
318
319 By default an expression is automatically quantified by
320 <b>{1,1}</b>, i.e. it should occur exactly once. In the following
321 list <b>\e {E}</b> stands for any expression. An expression is a
322 character or an abbreviation for a set of characters or a set of
323 characters in square brackets or any parenthesised expression.
324
325 \table
326 \row \i <b>\e {E}?</b>
327 \i Matches zero or one occurrence of \e E. This quantifier
328 means "the previous expression is optional" since it will
329 match whether or not the expression occurs in the string. It
330 is the same as <b>\e {E}{0,1}</b>. For example <b>dents?</b>
331 will match 'dent' and 'dents'.
332
333 \row \i <b>\e {E}+</b>
334 \i Matches one or more occurrences of \e E. This is the same
335 as <b>\e {E}{1,MAXINT}</b>. For example, <b>0+</b> will match
336 '0', '00', '000', etc.
337
338 \row \i <b>\e {E}*</b>
339 \i Matches zero or more occurrences of \e E. This is the same
340 as <b>\e {E}{0,MAXINT}</b>. The <b>*</b> quantifier is often
341 used by a mistake. Since it matches \e zero or more
342 occurrences it will match no occurrences at all. For example
343 if we want to match strings that end in whitespace and use
344 the regexp <b>\s*$</b> we would get a match on every string.
345 This is because we have said find zero or more whitespace
346 followed by the end of string, so even strings that don't end
347 in whitespace will match. The regexp we want in this case is
348 <b>\s+$</b> to match strings that have at least one
349 whitespace at the end.
350
351 \row \i <b>\e {E}{n}</b>
352 \i Matches exactly \e n occurrences of the expression. This
353 is the same as repeating the expression \e n times. For
354 example, <b>x{5}</b> is the same as <b>xxxxx</b>. It is also
355 the same as <b>\e {E}{n,n}</b>, e.g. <b>x{5,5}</b>.
356
357 \row \i <b>\e {E}{n,}</b>
358 \i Matches at least \e n occurrences of the expression. This
359 is the same as <b>\e {E}{n,MAXINT}</b>.
360
361 \row \i <b>\e {E}{,m}</b>
362 \i Matches at most \e m occurrences of the expression. This
363 is the same as <b>\e {E}{0,m}</b>.
364
365 \row \i <b>\e {E}{n,m}</b>
366 \i Matches at least \e n occurrences of the expression and at
367 most \e m occurrences of the expression.
368 \endtable
369
370 (MAXINT is implementation dependent but will not be smaller than
371 1024.)
372
373 If we wish to apply a quantifier to more than just the preceding
374 character we can use parentheses to group characters together in
375 an expression. For example, <b>tag+</b> matches a 't' followed by
376 an 'a' followed by at least one 'g', whereas <b>(tag)+</b> matches
377 at least one occurrence of 'tag'.
378
379 Note that quantifiers are "greedy". They will match as much text
380 as they can. For example, <b>0+</b> will match as many zeros as it
381 can from the first zero it finds, e.g. '2.<u>000</u>5'.
382 Quantifiers can be made non-greedy, see setMinimal().
383
384 \target capturing-text
385 \section1 Capturing Text
386
387 Parentheses allow us to group elements together so that we can
388 quantify and capture them. For example if we have the expression
389 <b>mail|letter|correspondence</b> that matches a string we know
390 that \e one of the words matched but not which one. Using
391 parentheses allows us to "capture" whatever is matched within
392 their bounds, so if we used <b>(mail|letter|correspondence)</b>
393 and matched this regexp against the string "I sent you some email"
394 we can use the cap() or capturedTexts() functions to extract the
395 matched characters, in this case 'mail'.
396
397 We can use captured text within the regexp itself. To refer to the
398 captured text we use \e backreferences which are indexed from 1,
399 the same as for cap(). For example we could search for duplicate
400 words in a string using <b>\b(\w+)\W+\1\b</b> which means match a
401 word boundary followed by one or more word characters followed by
402 one or more non-word characters followed by the same text as the
403 first parenthesised expression followed by a word boundary.
404
405 If we want to use parentheses purely for grouping and not for
406 capturing we can use the non-capturing syntax, e.g.
407 <b>(?:green|blue)</b>. Non-capturing parentheses begin '(?:' and
408 end ')'. In this example we match either 'green' or 'blue' but we
409 do not capture the match so we only know whether or not we matched
410 but not which color we actually found. Using non-capturing
411 parentheses is more efficient than using capturing parentheses
412 since the regexp engine has to do less book-keeping.
413
414 Both capturing and non-capturing parentheses may be nested.
415
416 \target assertions
417 \section1 Assertions
418
419 Assertions make some statement about the text at the point where
420 they occur in the regexp but they do not match any characters. In
421 the following list <b>\e {E}</b> stands for any expression.
422
423 \table
424 \row \i <b>^</b>
425 \i The caret signifies the beginning of the string. If you
426 wish to match a literal \c{^} you must escape it by
427 writing \c{\\^}. For example, <b>^#include</b> will only
428 match strings which \e begin with the characters '#include'.
429 (When the caret is the first character of a character set it
430 has a special meaning, see \link #sets-of-characters Sets of
431 Characters \endlink.)
432
433 \row \i <b>$</b>
434 \i The dollar signifies the end of the string. For example
435 <b>\d\s*$</b> will match strings which end with a digit
436 optionally followed by whitespace. If you wish to match a
437 literal \c{$} you must escape it by writing
438 \c{\\$}.
439
440 \row \i <b>\\b</b>
441 \i A word boundary. For example the regexp
442 <b>\\bOK\\b</b> means match immediately after a word
443 boundary (e.g. start of string or whitespace) the letter 'O'
444 then the letter 'K' immediately before another word boundary
445 (e.g. end of string or whitespace). But note that the
446 assertion does not actually match any whitespace so if we
447 write <b>(\\bOK\\b)</b> and we have a match it will only
448 contain 'OK' even if the string is "Its <u>OK</u> now".
449
450 \row \i <b>\\B</b>
451 \i A non-word boundary. This assertion is true wherever
452 <b>\\b</b> is false. For example if we searched for
453 <b>\\Bon\\B</b> in "Left on" the match would fail (space
454 and end of string aren't non-word boundaries), but it would
455 match in "t<u>on</u>ne".
456
457 \row \i <b>(?=\e E)</b>
458 \i Positive lookahead. This assertion is true if the
459 expression matches at this point in the regexp. For example,
460 <b>const(?=\\s+char)</b> matches 'const' whenever it is
461 followed by 'char', as in 'static <u>const</u> char *'.
462 (Compare with <b>const\\s+char</b>, which matches 'static
463 <u>const char</u> *'.)
464
465 \row \i <b>(?!\e E)</b>
466 \i Negative lookahead. This assertion is true if the
467 expression does not match at this point in the regexp. For
468 example, <b>const(?!\\s+char)</b> matches 'const' \e except
469 when it is followed by 'char'.
470 \endtable
471
472 \target wildcard-matching
473 \section1 Wildcard Matching (globbing)
474
475 Most command shells such as \e bash or \e cmd.exe support "file
476 globbing", the ability to identify a group of files by using
477 wildcards. The setWildcard() function is used to switch between
478 regexp and wildcard mode. Wildcard matching is much simpler than
479 full regexps and has only four features:
480
481 \table
482 \row \i <b>c</b>
483 \i Any character represents itself apart from those mentioned
484 below. Thus <b>c</b> matches the character \e c.
485 \row \i <b>?</b>
486 \i This matches any single character. It is the same as
487 <b>.</b> in full regexps.
488 \row \i <b>*</b>
489 \i This matches zero or more of any characters. It is the
490 same as <b>.*</b> in full regexps.
491 \row \i <b>[...]</b>
492 \i Sets of characters can be represented in square brackets,
493 similar to full regexps. Within the character class, like
494 outside, backslash has no special meaning.
495 \endtable
496
497 For example if we are in wildcard mode and have strings which
498 contain filenames we could identify HTML files with <b>*.html</b>.
499 This will match zero or more characters followed by a dot followed
500 by 'h', 't', 'm' and 'l'.
501
502 \target perl-users
503 \section1 Notes for Perl Users
504
505 Most of the character class abbreviations supported by Perl are
506 supported by QRegExp, see \link
507 #characters-and-abbreviations-for-sets-of-characters characters
508 and abbreviations for sets of characters \endlink.
509
510 In QRegExp, apart from within character classes, \c{^} always
511 signifies the start of the string, so carets must always be
512 escaped unless used for that purpose. In Perl the meaning of caret
513 varies automagically depending on where it occurs so escaping it
514 is rarely necessary. The same applies to \c{$} which in
515 QRegExp always signifies the end of the string.
516
517 QRegExp's quantifiers are the same as Perl's greedy quantifiers.
518 Non-greedy matching cannot be applied to individual quantifiers,
519 but can be applied to all the quantifiers in the pattern. For
520 example, to match the Perl regexp <b>ro+?m</b> requires:
521 \code
522 QRegExp rx( "ro+m" );
523 rx.setMinimal( TRUE );
524 \endcode
525
526 The equivalent of Perl's \c{/i} option is
527 setCaseSensitive(FALSE).
528
529 Perl's \c{/g} option can be emulated using a \link
530 #cap_in_a_loop loop \endlink.
531
532 In QRegExp <b>.</b> matches any character, therefore all QRegExp
533 regexps have the equivalent of Perl's \c{/s} option. QRegExp
534 does not have an equivalent to Perl's \c{/m} option, but this
535 can be emulated in various ways for example by splitting the input
536 into lines or by looping with a regexp that searches for newlines.
537
538 Because QRegExp is string oriented there are no \A, \Z or \z
539 assertions. The \G assertion is not supported but can be emulated
540 in a loop.
541
542 Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
543 equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
544 ... correspond to cap(1) or capturedTexts()[1], cap(2) or
545 capturedTexts()[2], etc.
546
547 To substitute a pattern use QString::replace().
548
549 Perl's extended \c{/x} syntax is not supported, nor are
550 regexp comments (?#comment) or directives, e.g. (?i).
551
552 Both zero-width positive and zero-width negative lookahead
553 assertions (?=pattern) and (?!pattern) are supported with the same
554 syntax as Perl. Perl's lookbehind assertions, "independent"
555 subexpressions and conditional expressions are not supported.
556
557 Non-capturing parentheses are also supported, with the same
558 (?:pattern) syntax.
559
560 See QStringList::split() and QStringList::join() for equivalents
561 to Perl's split and join functions.
562
563 Note: because C++ transforms \\'s they must be written \e twice in
564 code, e.g. <b>\\b</b> must be written <b>\\\\b</b>.
565
566 \target code-examples
567 \section1 Code Examples
568
569 \code
570 QRegExp rx( "^\\d\\d?$" ); // match integers 0 to 99
571 rx.search( "123" ); // returns -1 (no match)
572 rx.search( "-6" ); // returns -1 (no match)
573 rx.search( "6" ); // returns 0 (matched as position 0)
574 \endcode
575
576 The third string matches '<u>6</u>'. This is a simple validation
577 regexp for integers in the range 0 to 99.
578
579 \code
580 QRegExp rx( "^\\S+$" ); // match strings without whitespace
581 rx.search( "Hello world" ); // returns -1 (no match)
582 rx.search( "This_is-OK" ); // returns 0 (matched at position 0)
583 \endcode
584
585 The second string matches '<u>This_is-OK</u>'. We've used the
586 character set abbreviation '\S' (non-whitespace) and the anchors
587 to match strings which contain no whitespace.
588
589 In the following example we match strings containing 'mail' or
590 'letter' or 'correspondence' but only match whole words i.e. not
591 'email'
592
593 \code
594 QRegExp rx( "\\b(mail|letter|correspondence)\\b" );
595 rx.search( "I sent you an email" ); // returns -1 (no match)
596 rx.search( "Please write the letter" ); // returns 17
597 \endcode
598
599 The second string matches "Please write the <u>letter</u>". The
600 word 'letter' is also captured (because of the parentheses). We
601 can see what text we've captured like this:
602
603 \code
604 QString captured = rx.cap( 1 ); // captured == "letter"
605 \endcode
606
607 This will capture the text from the first set of capturing
608 parentheses (counting capturing left parentheses from left to
609 right). The parentheses are counted from 1 since cap( 0 ) is the
610 whole matched regexp (equivalent to '&' in most regexp engines).
611
612 \code
613 QRegExp rx( "&(?!amp;)" ); // match ampersands but not &amp;
614 QString line1 = "This & that";
615 line1.replace( rx, "&amp;" );
616 // line1 == "This &amp; that"
617 QString line2 = "His &amp; hers & theirs";
618 line2.replace( rx, "&amp;" );
619 // line2 == "His &amp; hers &amp; theirs"
620 \endcode
621
622 Here we've passed the QRegExp to QString's replace() function to
623 replace the matched text with new text.
624
625 \code
626 QString str = "One Eric another Eirik, and an Ericsson."
627 " How many Eiriks, Eric?";
628 QRegExp rx( "\\b(Eric|Eirik)\\b" ); // match Eric or Eirik
629 int pos = 0; // where we are in the string
630 int count = 0; // how many Eric and Eirik's we've counted
631 while ( pos >= 0 ) {
632 pos = rx.search( str, pos );
633 if ( pos >= 0 ) {
634 pos++; // move along in str
635 count++; // count our Eric or Eirik
636 }
637 }
638 \endcode
639
640 We've used the search() function to repeatedly match the regexp in
641 the string. Note that instead of moving forward by one character
642 at a time \c pos++ we could have written \c {pos +=
643 rx.matchedLength()} to skip over the already matched string. The
644 count will equal 3, matching 'One <u>Eric</u> another
645 <u>Eirik</u>, and an Ericsson. How many Eiriks, <u>Eric</u>?'; it
646 doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
647 by non-word boundaries.
648
649 One common use of regexps is to split lines of delimited data into
650 their component fields.
651
652 \code
653 str = "Trolltech AS\twww.trolltech.com\tNorway";
654 QString company, web, country;
655 rx.setPattern( "^([^\t]+)\t([^\t]+)\t([^\t]+)$" );
656 if ( rx.search( str ) != -1 ) {
657 company = rx.cap( 1 );
658 web = rx.cap( 2 );
659 country = rx.cap( 3 );
660 }
661 \endcode
662
663 In this example our input lines have the format company name, web
664 address and country. Unfortunately the regexp is rather long and
665 not very versatile -- the code will break if we add any more
666 fields. A simpler and better solution is to look for the
667 separator, '\t' in this case, and take the surrounding text. The
668 QStringList split() function can take a separator string or regexp
669 as an argument and split a string accordingly.
670
671 \code
672 QStringList field = QStringList::split( "\t", str );
673 \endcode
674
675 Here field[0] is the company, field[1] the web address and so on.
676
677 To imitate the matching of a shell we can use wildcard mode.
678
679 \code
680 QRegExp rx( "*.html" ); // invalid regexp: * doesn't quantify anything
681 rx.setWildcard( TRUE ); // now it's a valid wildcard regexp
682 rx.search( "index.html" ); // returns 0 (matched at position 0)
683 rx.search( "default.htm" ); // returns -1 (no match)
684 rx.search( "readme.txt" ); // returns -1 (no match)
685 \endcode
686
687 Wildcard matching can be convenient because of its simplicity, but
688 any wildcard regexp can be defined using full regexps, e.g.
689 <b>.*\.html$</b>. Notice that we can't match both \c .html and \c
690 .htm files with a wildcard unless we use <b>*.htm*</b> which will
691 also match 'test.html.bak'. A full regexp gives us the precision
692 we need, <b>.*\\.html?$</b>.
693
694 QRegExp can match case insensitively using setCaseSensitive(), and
695 can use non-greedy matching, see setMinimal(). By default QRegExp
696 uses full regexps but this can be changed with setWildcard().
697 Searching can be forward with search() or backward with
698 searchRev(). Captured text can be accessed using capturedTexts()
699 which returns a string list of all captured strings, or using
700 cap() which returns the captured string for the given index. The
701 pos() function takes a match index and returns the position in the
702 string where the match was made (or -1 if there was no match).
703
704 \sa QRegExpValidator QString QStringList
705
706 \target member-function-documentation
707*/
708
709const int NumBadChars = 64;
710#define BadChar( ch ) ( (ch).unicode() % NumBadChars )
711
712const int NoOccurrence = INT_MAX;
713const int EmptyCapture = INT_MAX;
714const int InftyLen = INT_MAX;
715const int InftyRep = 1025;
716const int EOS = -1;
717
718/*
719 Merges two QMemArrays of ints and puts the result into the first one.
720*/
721static void mergeInto( QMemArray<int> *a, const QMemArray<int>& b )
722{
723 int asize = a->size();
724 int bsize = b.size();
725 if ( asize == 0 ) {
726 *a = b.copy();
727#ifndef QT_NO_REGEXP_OPTIM
728 } else if ( bsize == 1 && (*a)[asize - 1] < b[0] ) {
729 a->resize( asize + 1 );
730 (*a)[asize] = b[0];
731#endif
732 } else if ( bsize >= 1 ) {
733 int csize = asize + bsize;
734 QMemArray<int> c( csize );
735 int i = 0, j = 0, k = 0;
736 while ( i < asize ) {
737 if ( j < bsize ) {
738 if ( (*a)[i] == b[j] ) {
739 i++;
740 csize--;
741 } else if ( (*a)[i] < b[j] ) {
742 c[k++] = (*a)[i++];
743 } else {
744 c[k++] = b[j++];
745 }
746 } else {
747 memcpy( c.data() + k, (*a).data() + i,
748 (asize - i) * sizeof(int) );
749 break;
750 }
751 }
752 c.resize( csize );
753 if ( j < bsize )
754 memcpy( c.data() + k, b.data() + j, (bsize - j) * sizeof(int) );
755 *a = c;
756 }
757}
758
759/*
760 Merges two disjoint QMaps of (int, int) pairs and puts the result into the
761 first one.
762*/
763static void mergeInto( QMap<int, int> *a, const QMap<int, int>& b )
764{
765 QMap<int, int>::ConstIterator it;
766 for ( it = b.begin(); it != b.end(); ++it )
767 a->insert( it.key(), *it );
768}
769
770/*
771 Returns the value associated to key k in QMap m of (int, int) pairs, or 0 if
772 no such value is explicitly present.
773*/
774static int at( const QMap<int, int>& m, int k )
775{
776 QMap<int, int>::ConstIterator it = m.find( k );
777 if ( it == m.end() )
778 return 0;
779 else
780 return *it;
781}
782
783#ifndef QT_NO_REGEXP_WILDCARD
784/*
785 Translates a wildcard pattern to an equivalent regular expression pattern
786 (e.g., *.cpp to .*\.cpp).
787*/
788static QString wc2rx( const QString& wc_str )
789{
790 int wclen = wc_str.length();
791 QString rx = QString::fromLatin1( "" );
792 int i = 0;
793 const QChar *wc = wc_str.unicode();
794 while ( i < wclen ) {
795 QChar c = wc[i++];
796 switch ( c.unicode() ) {
797 case '*':
798 rx += QString::fromLatin1( ".*" );
799 break;
800 case '?':
801 rx += QChar( '.' );
802 break;
803 case '$':
804 case '(':
805 case ')':
806 case '+':
807 case '.':
808 case '\\':
809 case '^':
810 case '{':
811 case '|':
812 case '}':
813 rx += QChar( '\\' );
814 rx += c;
815 break;
816 case '[':
817 rx += c;
818 if ( wc[i] == QChar('^') )
819 rx += wc[i++];
820 if ( i < wclen ) {
821 if ( rx[i] == ']' )
822 rx += wc[i++];
823 while ( i < wclen && wc[i] != QChar(']') ) {
824 if ( wc[i] == '\\' )
825 rx += QChar( '\\' );
826 rx += wc[i++];
827 }
828 }
829 break;
830 default:
831 rx += c;
832 }
833 }
834 return rx;
835}
836#endif
837
838/*
839 The class QRegExpEngine encapsulates a modified nondeterministic
840 finite automaton (NFA).
841*/
842class QRegExpEngine : public QShared
843{
844public:
845#ifndef QT_NO_REGEXP_CCLASS
846 /*
847 The class CharClass represents a set of characters, such as can
848 be found in regular expressions (e.g., [a-z] denotes the set
849 {a, b, ..., z}).
850 */
851 class CharClass
852 {
853 public:
854 CharClass();
855 CharClass( const CharClass& cc ) { operator=( cc ); }
856
857 CharClass& operator=( const CharClass& cc );
858
859 void clear();
860 bool negative() const { return n; }
861 void setNegative( bool negative );
862 void addCategories( int cats );
863 void addRange( ushort from, ushort to );
864 void addSingleton( ushort ch ) { addRange( ch, ch ); }
865
866 bool in( QChar ch ) const;
867#ifndef QT_NO_REGEXP_OPTIM
868 const QMemArray<int>& firstOccurrence() const { return occ1; }
869#endif
870
871#if defined(QT_DEBUG)
872 void dump() const;
873#endif
874
875 private:
876 /*
877 The struct Range represents a range of characters (e.g.,
878 [0-9] denotes range 48 to 57).
879 */
880 struct Range
881 {
882 ushort from; // 48
883 ushort to; // 57
884 };
885
886 int c; // character classes
887 QMemArray<Range> r; // character ranges
888 bool n; // negative?
889#ifndef QT_NO_REGEXP_OPTIM
890 QMemArray<int> occ1; // first-occurrence array
891#endif
892 };
893#else
894 struct CharClass
895 {
896 int dummy;
897
898#ifndef QT_NO_REGEXP_OPTIM
899 CharClass() { occ1.fill( 0, NumBadChars ); }
900
901 const QMemArray<int>& firstOccurrence() const { return occ1; }
902 QMemArray<int> occ1;
903#endif
904 };
905#endif
906
907 QRegExpEngine( bool caseSensitive ) { setup( caseSensitive ); }
908 QRegExpEngine( const QString& rx, bool caseSensitive );
909#ifndef QT_NO_REGEXP_OPTIM
910 ~QRegExpEngine();
911#endif
912
913 bool isValid() const { return valid; }
914 const QString& errorString() const { return yyError; }
915 bool caseSensitive() const { return cs; }
916 int numCaptures() const { return officialncap; }
917 QMemArray<int> match( const QString& str, int pos, bool minimal,
918 bool oneTest, int caretIndex );
919 int matchedLength() const { return mmMatchedLen; }
920
921 int createState( QChar ch );
922 int createState( const CharClass& cc );
923#ifndef QT_NO_REGEXP_BACKREF
924 int createState( int bref );
925#endif
926
927 void addCatTransitions( const QMemArray<int>& from,
928 const QMemArray<int>& to );
929#ifndef QT_NO_REGEXP_CAPTURE
930 void addPlusTransitions( const QMemArray<int>& from,
931 const QMemArray<int>& to, int atom );
932#endif
933
934#ifndef QT_NO_REGEXP_ANCHOR_ALT
935 int anchorAlternation( int a, int b );
936 int anchorConcatenation( int a, int b );
937#else
938 int anchorAlternation( int a, int b ) { return a & b; }
939 int anchorConcatenation( int a, int b ) { return a | b; }
940#endif
941 void addAnchors( int from, int to, int a );
942
943#ifndef QT_NO_REGEXP_OPTIM
944 void setupGoodStringHeuristic( int earlyStart, int lateStart,
945 const QString& str );
946 void setupBadCharHeuristic( int minLen, const QMemArray<int>& firstOcc );
947 void heuristicallyChooseHeuristic();
948#endif
949
950#if defined(QT_DEBUG)
951 void dump() const;
952#endif
953
954private:
955 enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
956
957 /*
958 The struct State represents one state in a modified NFA. The
959 input characters matched are stored in the state instead of on
960 the transitions, something possible for an automaton
961 constructed from a regular expression.
962 */
963 struct State
964 {
965#ifndef QT_NO_REGEXP_CAPTURE
966 int atom; // which atom does this state belong to?
967#endif
968 int match; // what does it match? (see CharClassBit and BackRefBit)
969 QMemArray<int> outs; // out-transitions
970 QMap<int, int> *reenter; // atoms reentered when transiting out
971 QMap<int, int> *anchors; // anchors met when transiting out
972
973#ifndef QT_NO_REGEXP_CAPTURE
974 State( int a, int m )
975 : atom( a ), match( m ), reenter( 0 ), anchors( 0 ) { }
976#else
977 State( int m )
978 : match( m ), reenter( 0 ), anchors( 0 ) { }
979#endif
980 ~State() { delete reenter; delete anchors; }
981 };
982
983#ifndef QT_NO_REGEXP_LOOKAHEAD
984 /*
985 The struct Lookahead represents a lookahead a la Perl (e.g., (?=foo) and
986 (?!bar)).
987 */
988 struct Lookahead
989 {
990 QRegExpEngine *eng; // NFA representing the embedded regular expression
991 bool neg; // negative lookahead?
992
993 Lookahead( QRegExpEngine *eng0, bool neg0 )
994 : eng( eng0 ), neg( neg0 ) { }
995 ~Lookahead() { delete eng; }
996 };
997#endif
998
999#ifndef QT_NO_REGEXP_CAPTURE
1000 /*
1001 The struct Atom represents one node in the hierarchy of regular
1002 expression atoms.
1003 */
1004 struct Atom
1005 {
1006 int parent; // index of parent in array of atoms
1007 int capture; // index of capture, from 1 to ncap
1008 };
1009#endif
1010
1011#ifndef QT_NO_REGEXP_ANCHOR_ALT
1012 /*
1013 The struct AnchorAlternation represents a pair of anchors with
1014 OR semantics.
1015 */
1016 struct AnchorAlternation
1017 {
1018 int a; // this anchor...
1019 int b; // ...or this one
1020 };
1021#endif
1022
1023 enum { InitialState = 0, FinalState = 1 };
1024 void setup( bool caseSensitive );
1025 int setupState( int match );
1026
1027 /*
1028 Let's hope that 13 lookaheads and 14 back-references are
1029 enough.
1030 */
1031 enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1032 enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002,
1033 Anchor_Word = 0x00000004, Anchor_NonWord = 0x00000008,
1034 Anchor_FirstLookahead = 0x00000010,
1035 Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1036 Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1037 Anchor_Alternation = Anchor_BackRef1Empty << MaxBackRefs,
1038
1039 Anchor_LookaheadMask = ( Anchor_FirstLookahead - 1 ) ^
1040 ( (Anchor_FirstLookahead << MaxLookaheads) - 1 ) };
1041#ifndef QT_NO_REGEXP_CAPTURE
1042 int startAtom( bool capture );
1043 void finishAtom( int atom ) { cf = f[atom].parent; }
1044#endif
1045
1046#ifndef QT_NO_REGEXP_LOOKAHEAD
1047 int addLookahead( QRegExpEngine *eng, bool negative );
1048#endif
1049
1050#ifndef QT_NO_REGEXP_CAPTURE
1051 bool isBetterCapture( const int *begin1, const int *end1, const int *begin2,
1052 const int *end2 );
1053#endif
1054 bool testAnchor( int i, int a, const int *capBegin );
1055
1056#ifndef QT_NO_REGEXP_OPTIM
1057 bool goodStringMatch();
1058 bool badCharMatch();
1059#else
1060 bool bruteMatch();
1061#endif
1062 bool matchHere();
1063
1064 QPtrVector<State> s; // array of states
1065 int ns; // number of states
1066#ifndef QT_NO_REGEXP_CAPTURE
1067 QMemArray<Atom> f; // atom hierarchy
1068 int nf; // number of atoms
1069 int cf; // current atom
1070#endif
1071 int officialncap; // number of captures, seen from the outside
1072 int ncap; // number of captures, seen from the inside
1073#ifndef QT_NO_REGEXP_CCLASS
1074 QPtrVector<CharClass> cl; // array of character classes
1075#endif
1076#ifndef QT_NO_REGEXP_LOOKAHEAD
1077 QPtrVector<Lookahead> ahead; // array of lookaheads
1078#endif
1079#ifndef QT_NO_REGEXP_ANCHOR_ALT
1080 QMemArray<AnchorAlternation> aa; // array of (a, b) pairs of anchors
1081#endif
1082#ifndef QT_NO_REGEXP_OPTIM
1083 bool caretAnchored; // does the regexp start with ^?
1084#endif
1085 bool valid; // is the regular expression valid?
1086 bool cs; // case sensitive?
1087#ifndef QT_NO_REGEXP_BACKREF
1088 int nbrefs; // number of back-references
1089#endif
1090
1091#ifndef QT_NO_REGEXP_OPTIM
1092 bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1093
1094 int goodEarlyStart; // the index where goodStr can first occur in a match
1095 int goodLateStart; // the index where goodStr can last occur in a match
1096 QString goodStr; // the string that any match has to contain
1097
1098 int minl; // the minimum length of a match
1099 QMemArray<int> occ1; // first-occurrence array
1100#endif
1101
1102 /*
1103 The class Box is an abstraction for a regular expression
1104 fragment. It can also be seen as one node in the syntax tree of
1105 a regular expression with synthetized attributes.
1106
1107 It's interface is ugly for performance reasons.
1108 */
1109 class Box
1110 {
1111 public:
1112 Box( QRegExpEngine *engine );
1113 Box( const Box& b ) { operator=( b ); }
1114
1115 Box& operator=( const Box& b );
1116
1117 void clear() { operator=(Box(eng)); }
1118 void set( QChar ch );
1119 void set( const CharClass& cc );
1120#ifndef QT_NO_REGEXP_BACKREF
1121 void set( int bref );
1122#endif
1123
1124 void cat( const Box& b );
1125 void orx( const Box& b );
1126 void plus( int atom );
1127 void opt();
1128 void catAnchor( int a );
1129#ifndef QT_NO_REGEXP_OPTIM
1130 void setupHeuristics();
1131#endif
1132
1133#if defined(QT_DEBUG)
1134 void dump() const;
1135#endif
1136
1137 private:
1138 void addAnchorsToEngine( const Box& to ) const;
1139
1140 QRegExpEngine *eng; // the automaton under construction
1141 QMemArray<int> ls; // the left states (firstpos)
1142 QMemArray<int> rs; // the right states (lastpos)
1143 QMap<int, int> lanchors; // the left anchors
1144 QMap<int, int> ranchors; // the right anchors
1145 int skipanchors; // the anchors to match if the box is skipped
1146
1147#ifndef QT_NO_REGEXP_OPTIM
1148 int earlyStart; // the index where str can first occur
1149 int lateStart; // the index where str can last occur
1150 QString str; // a string that has to occur in any match
1151 QString leftStr; // a string occurring at the left of this box
1152 QString rightStr; // a string occurring at the right of this box
1153 int maxl; // the maximum length of this box (possibly InftyLen)
1154#endif
1155
1156 int minl; // the minimum length of this box
1157#ifndef QT_NO_REGEXP_OPTIM
1158 QMemArray<int> occ1; // first-occurrence array
1159#endif
1160 };
1161 friend class Box;
1162
1163 /*
1164 This is the lexical analyzer for regular expressions.
1165 */
1166 enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen,
1167 Tok_PosLookahead, Tok_NegLookahead, Tok_RightParen, Tok_CharClass,
1168 Tok_Caret, Tok_Quantifier, Tok_Bar, Tok_Word, Tok_NonWord,
1169 Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1170 int getChar();
1171 int getEscape();
1172#ifndef QT_NO_REGEXP_INTERVAL
1173 int getRep( int def );
1174#endif
1175#ifndef QT_NO_REGEXP_LOOKAHEAD
1176 void skipChars( int n );
1177#endif
1178 void error( const char *msg );
1179 void startTokenizer( const QChar *rx, int len );
1180 int getToken();
1181
1182 const QChar *yyIn; // a pointer to the input regular expression pattern
1183 int yyPos0; // the position of yyTok in the input pattern
1184 int yyPos; // the position of the next character to read
1185 int yyLen; // the length of yyIn
1186 int yyCh; // the last character read
1187 CharClass *yyCharClass; // attribute for Tok_CharClass tokens
1188 int yyMinRep; // attribute for Tok_Quantifier
1189 int yyMaxRep; // ditto
1190 QString yyError; // syntax error or overflow during parsing?
1191
1192 /*
1193 This is the syntactic analyzer for regular expressions.
1194 */
1195 int parse( const QChar *rx, int len );
1196 void parseAtom( Box *box );
1197 void parseFactor( Box *box );
1198 void parseTerm( Box *box );
1199 void parseExpression( Box *box );
1200
1201 int yyTok; // the last token read
1202 bool yyMayCapture; // set this to FALSE to disable capturing
1203
1204 /*
1205 This is the engine state during matching.
1206 */
1207 const QString *mmStr; // a pointer to the input QString
1208 const QChar *mmIn; // a pointer to the input string data
1209 int mmPos; // the current position in the string
1210 int mmCaretPos;
1211 int mmLen; // the length of the input string
1212 bool mmMinimal; // minimal matching?
1213 QMemArray<int> mmCaptured; // an array of pairs (start, len)
1214 QMemArray<int> mmCapturedNoMatch; // an array of pairs (-1, -1)
1215 QMemArray<int> mmBigArray; // big QMemArray<int> array
1216 int *mmInNextStack; // is state is mmNextStack?
1217 int *mmCurStack; // stack of current states
1218 int *mmNextStack; // stack of next states
1219 int *mmCurCapBegin; // start of current states' captures
1220 int *mmNextCapBegin; // start of next states' captures
1221 int *mmCurCapEnd; // end of current states' captures
1222 int *mmNextCapEnd; // end of next states' captures
1223 int *mmTempCapBegin; // start of temporary captures
1224 int *mmTempCapEnd; // end of temporary captures
1225 int *mmCapBegin; // start of captures for a next state
1226 int *mmCapEnd; // end of captures for a next state
1227 int *mmSlideTab; // bump-along slide table for bad-character heuristic
1228 int mmSlideTabSize; // size of slide table
1229#ifndef QT_NO_REGEXP_BACKREF
1230 QIntDict<int> mmSleeping; // dictionary of back-reference sleepers
1231#endif
1232 int mmMatchLen; // length of match
1233 int mmMatchedLen; // length of partial match
1234};
1235
1236QRegExpEngine::QRegExpEngine( const QString& rx, bool caseSensitive )
1237#ifndef QT_NO_REGEXP_BACKREF
1238 : mmSleeping( 101 )
1239#endif
1240{
1241 setup( caseSensitive );
1242 valid = ( parse(rx.unicode(), rx.length()) == (int) rx.length() );
1243 if ( !valid )
1244 error( RXERR_LEFTDELIM );
1245}
1246
1247#ifndef QT_NO_REGEXP_OPTIM
1248QRegExpEngine::~QRegExpEngine()
1249{
1250}
1251#endif
1252
1253/*
1254 Tries to match in str and returns an array of (begin, length) pairs
1255 for captured text. If there is no match, all pairs are (-1, -1).
1256*/
1257QMemArray<int> QRegExpEngine::match( const QString& str, int pos, bool minimal,
1258 bool oneTest, int caretIndex )
1259{
1260 mmStr = &str;
1261 mmIn = str.unicode();
1262 if ( mmIn == 0 )
1263 mmIn = &QChar::null;
1264 mmPos = pos;
1265 mmCaretPos = caretIndex;
1266 mmLen = str.length();
1267 mmMinimal = minimal;
1268 mmMatchLen = 0;
1269 mmMatchedLen = 0;
1270
1271 bool matched = FALSE;
1272 if ( valid && mmPos >= 0 && mmPos <= mmLen ) {
1273#ifndef QT_NO_REGEXP_OPTIM
1274 if ( oneTest ) {
1275 matched = matchHere();
1276 } else {
1277 if ( mmPos <= mmLen - minl ) {
1278 if ( caretAnchored ) {
1279 matched = matchHere();
1280 } else if ( useGoodStringHeuristic ) {
1281 matched = goodStringMatch();
1282 } else {
1283 matched = badCharMatch();
1284 }
1285 }
1286 }
1287#else
1288 matched = oneTest ? matchHere() : bruteMatch();
1289#endif
1290 }
1291
1292 if ( matched ) {
1293 mmCaptured.detach();
1294 mmCaptured[0] = mmPos;
1295 mmCaptured[1] = mmMatchLen;
1296 for ( int j = 0; j < officialncap; j++ ) {
1297 int len = mmCapEnd[j] - mmCapBegin[j];
1298 mmCaptured[2 + 2 * j] = len > 0 ? mmPos + mmCapBegin[j] : 0;
1299 mmCaptured[2 + 2 * j + 1] = len;
1300 }
1301 return mmCaptured;
1302 } else {
1303 return mmCapturedNoMatch;
1304 }
1305}
1306
1307/*
1308 The three following functions add one state to the automaton and
1309 return the number of the state.
1310*/
1311
1312int QRegExpEngine::createState( QChar ch )
1313{
1314 return setupState( ch.unicode() );
1315}
1316
1317int QRegExpEngine::createState( const CharClass& cc )
1318{
1319#ifndef QT_NO_REGEXP_CCLASS
1320 int n = cl.size();
1321 cl.resize( n + 1 );
1322 cl.insert( n, new CharClass(cc) );
1323 return setupState( CharClassBit | n );
1324#else
1325 Q_UNUSED( cc );
1326 return setupState( CharClassBit );
1327#endif
1328}
1329
1330#ifndef QT_NO_REGEXP_BACKREF
1331int QRegExpEngine::createState( int bref )
1332{
1333 if ( bref > nbrefs ) {
1334 nbrefs = bref;
1335 if ( nbrefs > MaxBackRefs ) {
1336 error( RXERR_LIMIT );
1337 return 0;
1338 }
1339 }
1340 return setupState( BackRefBit | bref );
1341}
1342#endif
1343
1344/*
1345 The two following functions add a transition between all pairs of
1346 states (i, j) where i is fond in from, and j is found in to.
1347
1348 Cat-transitions are distinguished from plus-transitions for
1349 capturing.
1350*/
1351
1352void QRegExpEngine::addCatTransitions( const QMemArray<int>& from,
1353 const QMemArray<int>& to )
1354{
1355 for ( int i = 0; i < (int) from.size(); i++ ) {
1356 State *st = s[from[i]];
1357 mergeInto( &st->outs, to );
1358 }
1359}
1360
1361#ifndef QT_NO_REGEXP_CAPTURE
1362void QRegExpEngine::addPlusTransitions( const QMemArray<int>& from,
1363 const QMemArray<int>& to, int atom )
1364{
1365 for ( int i = 0; i < (int) from.size(); i++ ) {
1366 State *st = s[from[i]];
1367 QMemArray<int> oldOuts = st->outs.copy();
1368 mergeInto( &st->outs, to );
1369 if ( f[atom].capture >= 0 ) {
1370 if ( st->reenter == 0 )
1371 st->reenter = new QMap<int, int>;
1372 for ( int j = 0; j < (int) to.size(); j++ ) {
1373 if ( !st->reenter->contains(to[j]) &&
1374 oldOuts.bsearch(to[j]) < 0 )
1375 st->reenter->insert( to[j], atom );
1376 }
1377 }
1378 }
1379}
1380#endif
1381
1382#ifndef QT_NO_REGEXP_ANCHOR_ALT
1383/*
1384 Returns an anchor that means a OR b.
1385*/
1386int QRegExpEngine::anchorAlternation( int a, int b )
1387{
1388 if ( ((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0 )
1389 return a & b;
1390
1391 int n = aa.size();
1392#ifndef QT_NO_REGEXP_OPTIM
1393 if ( n > 0 && aa[n - 1].a == a && aa[n - 1].b == b )
1394 return Anchor_Alternation | ( n - 1 );
1395#endif
1396
1397 aa.resize( n + 1 );
1398 aa[n].a = a;
1399 aa[n].b = b;
1400 return Anchor_Alternation | n;
1401}
1402
1403/*
1404 Returns an anchor that means a AND b.
1405*/
1406int QRegExpEngine::anchorConcatenation( int a, int b )
1407{
1408 if ( ((a | b) & Anchor_Alternation) == 0 )
1409 return a | b;
1410 if ( (b & Anchor_Alternation) != 0 )
1411 qSwap( a, b );
1412
1413 int aprime = anchorConcatenation( aa[a ^ Anchor_Alternation].a, b );
1414 int bprime = anchorConcatenation( aa[a ^ Anchor_Alternation].b, b );
1415 return anchorAlternation( aprime, bprime );
1416}
1417#endif
1418
1419/*
1420 Adds anchor a on a transition caracterised by its from state and
1421 its to state.
1422*/
1423void QRegExpEngine::addAnchors( int from, int to, int a )
1424{
1425 State *st = s[from];
1426 if ( st->anchors == 0 )
1427 st->anchors = new QMap<int, int>;
1428 if ( st->anchors->contains(to) )
1429 a = anchorAlternation( (*st->anchors)[to], a );
1430 st->anchors->insert( to, a );
1431}
1432
1433#ifndef QT_NO_REGEXP_OPTIM
1434/*
1435 The two following functions provide the engine with the information
1436 needed by its matching heuristics.
1437*/
1438
1439void QRegExpEngine::setupGoodStringHeuristic( int earlyStart, int lateStart,
1440 const QString& str )
1441{
1442 goodEarlyStart = earlyStart;
1443 goodLateStart = lateStart;
1444 goodStr = cs ? str : str.lower();
1445}
1446
1447void QRegExpEngine::setupBadCharHeuristic( int minLen,
1448 const QMemArray<int>& firstOcc )
1449{
1450 minl = minLen;
1451 if ( cs ) {
1452 occ1 = firstOcc;
1453 } else {
1454 occ1.fill( 0, NumBadChars );
1455 }
1456}
1457
1458/*
1459 This function chooses between the good-string and the bad-character
1460 heuristics. It computes two scores and chooses the heuristic with
1461 the highest score.
1462
1463 Here are some common-sense constraints on the scores that should be
1464 respected if the formulas are ever modified: (1) If goodStr is
1465 empty, the good-string heuristic scores 0. (2) If the search is
1466 case insensitive, the good-string heuristic should be used, unless
1467 it scores 0. (Case insensitivity turns all entries of occ1 to 0.)
1468 (3) If (goodLateStart - goodEarlyStart) is big, the good-string
1469 heuristic should score less.
1470*/
1471void QRegExpEngine::heuristicallyChooseHeuristic()
1472{
1473 int i;
1474
1475 if ( minl == 0 )
1476 return;
1477
1478 /*
1479 Magic formula: The good string has to constitute a good
1480 proportion of the minimum-length string, and appear at a
1481 more-or-less known index.
1482 */
1483 int goodStringScore = ( 64 * goodStr.length() / minl ) -
1484 ( goodLateStart - goodEarlyStart );
1485
1486 /*
1487 Less magic formula: We pick a couple of characters at random,
1488 and check whether they are good or bad.
1489 */
1490 int badCharScore = 0;
1491 int step = QMAX( 1, NumBadChars / 32 );
1492 for ( i = 1; i < NumBadChars; i += step ) {
1493 if ( occ1[i] == NoOccurrence )
1494 badCharScore += minl;
1495 else
1496 badCharScore += occ1[i];
1497 }
1498 badCharScore /= minl;
1499
1500 useGoodStringHeuristic = ( goodStringScore > badCharScore );
1501}
1502#endif
1503
1504#if defined(QT_DEBUG)
1505void QRegExpEngine::dump() const
1506{
1507 int i, j;
1508 qDebug( "Case %ssensitive engine", cs ? "" : "in" );
1509 qDebug( " States" );
1510 for ( i = 0; i < ns; i++ ) {
1511 qDebug( " %d%s", i,
1512 i == InitialState ? " (initial)" :
1513 i == FinalState ? " (final)" : "" );
1514#ifndef QT_NO_REGEXP_CAPTURE
1515 qDebug( " in atom %d", s[i]->atom );
1516#endif
1517 int m = s[i]->match;
1518 if ( (m & CharClassBit) != 0 ) {
1519 qDebug( " match character class %d", m ^ CharClassBit );
1520#ifndef QT_NO_REGEXP_CCLASS
1521 cl[m ^ CharClassBit]->dump();
1522#else
1523 qDebug( " negative character class" );
1524#endif
1525 } else if ( (m & BackRefBit) != 0 ) {
1526 qDebug( " match back-reference %d", m ^ BackRefBit );
1527 } else if ( m >= 0x20 && m <= 0x7e ) {
1528 qDebug( " match 0x%.4x (%c)", m, m );
1529 } else {
1530 qDebug( " match 0x%.4x", m );
1531 }
1532 for ( j = 0; j < (int) s[i]->outs.size(); j++ ) {
1533 int next = s[i]->outs[j];
1534 qDebug( " -> %d", next );
1535 if ( s[i]->reenter != 0 && s[i]->reenter->contains(next) )
1536 qDebug( " [reenter %d]", (*s[i]->reenter)[next] );
1537 if ( s[i]->anchors != 0 && at(*s[i]->anchors, next) != 0 )
1538 qDebug( " [anchors 0x%.8x]", (*s[i]->anchors)[next] );
1539 }
1540 }
1541#ifndef QT_NO_REGEXP_CAPTURE
1542 if ( nf > 0 ) {
1543 qDebug( " Atom Parent Capture" );
1544 for ( i = 0; i < nf; i++ )
1545 qDebug( " %6d %6d %6d", i, f[i].parent, f[i].capture );
1546 }
1547#endif
1548#ifndef QT_NO_REGEXP_ANCHOR_ALT
1549 for ( i = 0; i < (int) aa.size(); i++ )
1550 qDebug( " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a,
1551 aa[i].b );
1552#endif
1553}
1554#endif
1555
1556void QRegExpEngine::setup( bool caseSensitive )
1557{
1558 s.setAutoDelete( TRUE );
1559 s.resize( 32 );
1560 ns = 0;
1561#ifndef QT_NO_REGEXP_CAPTURE
1562 f.resize( 32 );
1563 nf = 0;
1564 cf = -1;
1565#endif
1566 officialncap = 0;
1567 ncap = 0;
1568#ifndef QT_NO_REGEXP_CCLASS
1569 cl.setAutoDelete( TRUE );
1570#endif
1571#ifndef QT_NO_REGEXP_LOOKAHEAD
1572 ahead.setAutoDelete( TRUE );
1573#endif
1574#ifndef QT_NO_REGEXP_OPTIM
1575 caretAnchored = TRUE;
1576#endif
1577 valid = FALSE;
1578 cs = caseSensitive;
1579#ifndef QT_NO_REGEXP_BACKREF
1580 nbrefs = 0;
1581#endif
1582#ifndef QT_NO_REGEXP_OPTIM
1583 useGoodStringHeuristic = FALSE;
1584 minl = 0;
1585 occ1.fill( 0, NumBadChars );
1586#endif
1587 mmCapturedNoMatch.fill( -1, 2 );
1588}
1589
1590int QRegExpEngine::setupState( int match )
1591{
1592 if ( (ns & (ns + 1)) == 0 && ns + 1 >= (int) s.size() )
1593 s.resize( (ns + 1) << 1 );
1594#ifndef QT_NO_REGEXP_CAPTURE
1595 s.insert( ns, new State(cf, match) );
1596#else
1597 s.insert( ns, new State(match) );
1598#endif
1599 return ns++;
1600}
1601
1602#ifndef QT_NO_REGEXP_CAPTURE
1603/*
1604 Functions startAtom() and finishAtom() should be called to delimit
1605 atoms. When a state is created, it is assigned to the current atom.
1606 The information is later used for capturing.
1607*/
1608int QRegExpEngine::startAtom( bool capture )
1609{
1610 if ( (nf & (nf + 1)) == 0 && nf + 1 >= (int) f.size() )
1611 f.resize( (nf + 1) << 1 );
1612 f[nf].parent = cf;
1613 cf = nf++;
1614 f[cf].capture = capture ? ncap++ : -1;
1615 return cf;
1616}
1617#endif
1618
1619#ifndef QT_NO_REGEXP_LOOKAHEAD
1620/*
1621 Creates a lookahead anchor.
1622*/
1623int QRegExpEngine::addLookahead( QRegExpEngine *eng, bool negative )
1624{
1625 int n = ahead.size();
1626 if ( n == MaxLookaheads ) {
1627 error( RXERR_LIMIT );
1628 return 0;
1629 }
1630 ahead.resize( n + 1 );
1631 ahead.insert( n, new Lookahead(eng, negative) );
1632 return Anchor_FirstLookahead << n;
1633}
1634#endif
1635
1636#ifndef QT_NO_REGEXP_CAPTURE
1637/*
1638 We want the longest leftmost captures.
1639*/
1640bool QRegExpEngine::isBetterCapture( const int *begin1, const int *end1,
1641 const int *begin2, const int *end2 )
1642{
1643 for ( int i = 0; i < ncap; i++ ) {
1644 int delta = begin2[i] - begin1[i]; // it has to start early...
1645 if ( delta == 0 )
1646 delta = end1[i] - end2[i]; // ...and end late (like a party)
1647
1648 if ( delta != 0 )
1649 return delta > 0;
1650 }
1651 return FALSE;
1652}
1653#endif
1654
1655/*
1656 Returns TRUE if anchor a matches at position mmPos + i in the input
1657 string, otherwise FALSE.
1658*/
1659bool QRegExpEngine::testAnchor( int i, int a, const int *capBegin )
1660{
1661 int j;
1662
1663#ifndef QT_NO_REGEXP_ANCHOR_ALT
1664 if ( (a & Anchor_Alternation) != 0 ) {
1665 return testAnchor( i, aa[a ^ Anchor_Alternation].a, capBegin ) ||
1666 testAnchor( i, aa[a ^ Anchor_Alternation].b, capBegin );
1667 }
1668#endif
1669
1670 if ( (a & Anchor_Caret) != 0 ) {
1671 if ( mmPos + i != mmCaretPos )
1672 return FALSE;
1673 }
1674 if ( (a & Anchor_Dollar) != 0 ) {
1675 if ( mmPos + i != mmLen )
1676 return FALSE;
1677 }
1678#ifndef QT_NO_REGEXP_ESCAPE
1679 if ( (a & (Anchor_Word | Anchor_NonWord)) != 0 ) {
1680 bool before = FALSE;
1681 bool after = FALSE;
1682 if ( mmPos + i != 0 )
1683 before = mmIn[mmPos + i - 1].isLetterOrNumber();
1684 if ( mmPos + i != mmLen )
1685 after = mmIn[mmPos + i].isLetterOrNumber();
1686 if ( (a & Anchor_Word) != 0 && (before == after) )
1687 return FALSE;
1688 if ( (a & Anchor_NonWord) != 0 && (before != after) )
1689 return FALSE;
1690 }
1691#endif
1692#ifndef QT_NO_REGEXP_LOOKAHEAD
1693 bool catchx = TRUE;
1694
1695 if ( (a & Anchor_LookaheadMask) != 0 ) {
1696 QConstString cstr = QConstString( (QChar *) mmIn + mmPos + i,
1697 mmLen - mmPos - i );
1698 for ( j = 0; j < (int) ahead.size(); j++ ) {
1699 if ( (a & (Anchor_FirstLookahead << j)) != 0 ) {
1700 catchx = ahead[j]->eng->match( cstr.string(), 0, TRUE, TRUE,
1701 mmCaretPos - mmPos - i )[0] == 0;
1702 if ( catchx == ahead[j]->neg )
1703 return FALSE;
1704 }
1705 }
1706 }
1707#endif
1708#ifndef QT_NO_REGEXP_CAPTURE
1709#ifndef QT_NO_REGEXP_BACKREF
1710 for ( j = 0; j < nbrefs; j++ ) {
1711 if ( (a & (Anchor_BackRef1Empty << j)) != 0 ) {
1712 if ( capBegin[j] != EmptyCapture )
1713 return FALSE;
1714 }
1715 }
1716#endif
1717#endif
1718 return TRUE;
1719}
1720
1721#ifndef QT_NO_REGEXP_OPTIM
1722/*
1723 The three following functions are what Jeffrey Friedl would call
1724 transmissions (or bump-alongs). Using one or the other should make
1725 no difference except in performance.
1726*/
1727
1728bool QRegExpEngine::goodStringMatch()
1729{
1730 int k = mmPos + goodEarlyStart;
1731
1732 while ( (k = mmStr->find(goodStr, k, cs)) != -1 ) {
1733 int from = k - goodLateStart;
1734 int to = k - goodEarlyStart;
1735 if ( from > mmPos )
1736 mmPos = from;
1737
1738 while ( mmPos <= to ) {
1739 if ( matchHere() )
1740 return TRUE;
1741 mmPos++;
1742 }
1743 k++;
1744 }
1745 return FALSE;
1746}
1747
1748bool QRegExpEngine::badCharMatch()
1749{
1750 int slideHead = 0;
1751 int slideNext = 0;
1752 int i;
1753 int lastPos = mmLen - minl;
1754 memset( mmSlideTab, 0, mmSlideTabSize * sizeof(int) );
1755
1756 /*
1757 Set up the slide table, used for the bad-character heuristic,
1758 using the table of first occurrence of each character.
1759 */
1760 for ( i = 0; i < minl; i++ ) {
1761 int sk = occ1[BadChar(mmIn[mmPos + i])];
1762 if ( sk == NoOccurrence )
1763 sk = i + 1;
1764 if ( sk > 0 ) {
1765 int k = i + 1 - sk;
1766 if ( k < 0 ) {
1767 sk = i + 1;
1768 k = 0;
1769 }
1770 if ( sk > mmSlideTab[k] )
1771 mmSlideTab[k] = sk;
1772 }
1773 }
1774
1775 if ( mmPos > lastPos )
1776 return FALSE;
1777
1778 for ( ;; ) {
1779 if ( ++slideNext >= mmSlideTabSize )
1780 slideNext = 0;
1781 if ( mmSlideTab[slideHead] > 0 ) {
1782 if ( mmSlideTab[slideHead] - 1 > mmSlideTab[slideNext] )
1783 mmSlideTab[slideNext] = mmSlideTab[slideHead] - 1;
1784 mmSlideTab[slideHead] = 0;
1785 } else {
1786 if ( matchHere() )
1787 return TRUE;
1788 }
1789
1790 if ( mmPos == lastPos )
1791 break;
1792
1793 /*
1794 Update the slide table. This code has much in common with
1795 the initialization code.
1796 */
1797 int sk = occ1[BadChar(mmIn[mmPos + minl])];
1798 if ( sk == NoOccurrence ) {
1799 mmSlideTab[slideNext] = minl;
1800 } else if ( sk > 0 ) {
1801 int k = slideNext + minl - sk;
1802 if ( k >= mmSlideTabSize )
1803 k -= mmSlideTabSize;
1804 if ( sk > mmSlideTab[k] )
1805 mmSlideTab[k] = sk;
1806 }
1807 slideHead = slideNext;
1808 mmPos++;
1809 }
1810 return FALSE;
1811}
1812#else
1813bool QRegExpEngine::bruteMatch()
1814{
1815 while ( mmPos <= mmLen ) {
1816 if ( matchHere() )
1817 return TRUE;
1818 mmPos++;
1819 }
1820 return FALSE;
1821}
1822#endif
1823
1824/*
1825 Here's the core of the engine. It tries to do a match here and now.
1826*/
1827bool QRegExpEngine::matchHere()
1828{
1829 int ncur = 1, nnext = 0;
1830 int i = 0, j, k, m;
1831 bool stop = FALSE;
1832
1833 mmMatchLen = -1;
1834 mmMatchedLen = -1;
1835 mmCurStack[0] = InitialState;
1836
1837#ifndef QT_NO_REGEXP_CAPTURE
1838 if ( ncap > 0 ) {
1839 for ( j = 0; j < ncap; j++ ) {
1840 mmCurCapBegin[j] = EmptyCapture;
1841 mmCurCapEnd[j] = EmptyCapture;
1842 }
1843 }
1844#endif
1845
1846#ifndef QT_NO_REGEXP_BACKREF
1847 int *zzZ = 0;
1848
1849 while ( (ncur > 0 || !mmSleeping.isEmpty()) && i <= mmLen - mmPos &&
1850 !stop )
1851#else
1852 while ( ncur > 0 && i <= mmLen - mmPos && !stop )
1853#endif
1854 {
1855 int ch = ( i < mmLen - mmPos ) ? mmIn[mmPos + i].unicode() : 0;
1856 for ( j = 0; j < ncur; j++ ) {
1857 int cur = mmCurStack[j];
1858 State *scur = s[cur];
1859 QMemArray<int>& outs = scur->outs;
1860 for ( k = 0; k < (int) outs.size(); k++ ) {
1861 int next = outs[k];
1862 State *snext = s[next];
1863 bool in = TRUE;
1864#ifndef QT_NO_REGEXP_BACKREF
1865 int needSomeSleep = 0;
1866#endif
1867
1868 /*
1869 First, check if the anchors are anchored properly.
1870 */
1871 if ( scur->anchors != 0 ) {
1872 int a = at( *scur->anchors, next );
1873 if ( a != 0 && !testAnchor(i, a, mmCurCapBegin + j * ncap) )
1874 in = FALSE;
1875 }
1876 /*
1877 If indeed they are, check if the input character is
1878 correct for this transition.
1879 */
1880 if ( in ) {
1881 m = snext->match;
1882 if ( (m & (CharClassBit | BackRefBit)) == 0 ) {
1883 if ( cs )
1884 in = ( m == ch );
1885 else
1886 in = ( QChar(m).lower() == QChar(ch).lower() );
1887 } else if ( next == FinalState ) {
1888 mmMatchLen = i;
1889 stop = mmMinimal;
1890 in = TRUE;
1891 } else if ( (m & CharClassBit) != 0 ) {
1892#ifndef QT_NO_REGEXP_CCLASS
1893 const CharClass *cc = cl[m ^ CharClassBit];
1894 if ( cs )
1895 in = cc->in( ch );
1896 else if ( cc->negative() )
1897 in = cc->in( QChar(ch).lower() ) &&
1898 cc->in( QChar(ch).upper() );
1899 else
1900 in = cc->in( QChar(ch).lower() ) ||
1901 cc->in( QChar(ch).upper() );
1902#endif
1903#ifndef QT_NO_REGEXP_BACKREF
1904 } else { /* ( (m & BackRefBit) != 0 ) */
1905 int bref = m ^ BackRefBit;
1906 int ell = j * ncap + ( bref - 1 );
1907
1908 in = bref <= ncap && mmCurCapBegin[ell] != EmptyCapture;
1909 if ( in ) {
1910 if ( cs )
1911 in = ( mmIn[mmPos + mmCurCapBegin[ell]]
1912 == QChar(ch) );
1913 else
1914 in = ( mmIn[mmPos + mmCurCapBegin[ell]].lower()
1915 == QChar(ch).lower() );
1916 }
1917
1918 if ( in ) {
1919 int delta;
1920 if ( mmCurCapEnd[ell] == EmptyCapture )
1921 delta = i - mmCurCapBegin[ell];
1922 else
1923 delta = mmCurCapEnd[ell] - mmCurCapBegin[ell];
1924
1925 in = ( delta <= mmLen - (mmPos + i) );
1926 if ( in && delta > 1 ) {
1927 int n = 1;
1928 if ( cs ) {
1929 while ( n < delta ) {
1930 if ( mmIn[mmPos +
1931 mmCurCapBegin[ell] + n] !=
1932 mmIn[mmPos + i + n] )
1933 break;
1934 n++;
1935 }
1936 } else {
1937 while ( n < delta ) {
1938 QChar a = mmIn[mmPos +
1939 mmCurCapBegin[ell] + n];
1940 QChar b = mmIn[mmPos + i + n];
1941 if ( a.lower() != b.lower() )
1942 break;
1943 n++;
1944 }
1945 }
1946 in = ( n == delta );
1947 if ( in )
1948 needSomeSleep = delta - 1;
1949 }
1950 }
1951#endif
1952 }
1953 }
1954
1955 /*
1956 We must now update our data structures.
1957 */
1958 if ( in ) {
1959#ifndef QT_NO_REGEXP_CAPTURE
1960 int *capBegin, *capEnd;
1961#endif
1962 /*
1963 If the next state was not encountered yet, all
1964 is fine.
1965 */
1966 if ( (m = mmInNextStack[next]) == -1 ) {
1967 m = nnext++;
1968 mmNextStack[m] = next;
1969 mmInNextStack[next] = m;
1970#ifndef QT_NO_REGEXP_CAPTURE
1971 capBegin = mmNextCapBegin + m * ncap;
1972 capEnd = mmNextCapEnd + m * ncap;
1973
1974 /*
1975 Otherwise, we'll first maintain captures in
1976 temporary arrays, and decide at the end whether
1977 it's best to keep the previous capture zones or
1978 the new ones.
1979 */
1980 } else {
1981 capBegin = mmTempCapBegin;
1982 capEnd = mmTempCapEnd;
1983#endif
1984 }
1985
1986#ifndef QT_NO_REGEXP_CAPTURE
1987 /*
1988 Updating the capture zones is much of a task.
1989 */
1990 if ( ncap > 0 ) {
1991 memcpy( capBegin, mmCurCapBegin + j * ncap,
1992 ncap * sizeof(int) );
1993 memcpy( capEnd, mmCurCapEnd + j * ncap,
1994 ncap * sizeof(int) );
1995 int c = scur->atom, n = snext->atom;
1996 int p = -1, q = -1;
1997 int cap;
1998
1999 /*
2000 Lemma 1. For any x in the range [0..nf), we
2001 have f[x].parent < x.
2002
2003 Proof. By looking at startAtom(), it is
2004 clear that cf < nf holds all the time, and
2005 thus that f[nf].parent < nf.
2006 */
2007
2008 /*
2009 If we are reentering an atom, we empty all
2010 capture zones inside it.
2011 */
2012 if ( scur->reenter != 0 &&
2013 (q = at(*scur->reenter, next)) != 0 ) {
2014 QBitArray b;
2015 b.fill( FALSE, nf );
2016 b.setBit( q, TRUE );
2017 for ( int ell = q + 1; ell < nf; ell++ ) {
2018 if ( b.testBit(f[ell].parent) ) {
2019 b.setBit( ell, TRUE );
2020 cap = f[ell].capture;
2021 if ( cap >= 0 ) {
2022 capBegin[cap] = EmptyCapture;
2023 capEnd[cap] = EmptyCapture;
2024 }
2025 }
2026 }
2027 p = f[q].parent;
2028
2029 /*
2030 Otherwise, close the capture zones we are
2031 leaving. We are leaving f[c].capture,
2032 f[f[c].parent].capture,
2033 f[f[f[c].parent].parent].capture, ...,
2034 until f[x].capture, with x such that
2035 f[x].parent is the youngest common ancestor
2036 for c and n.
2037
2038 We go up along c's and n's ancestry until
2039 we find x.
2040 */
2041 } else {
2042 p = c;
2043 q = n;
2044 while ( p != q ) {
2045 if ( p > q ) {
2046 cap = f[p].capture;
2047 if ( cap >= 0 ) {
2048 if ( capBegin[cap] == i ) {
2049 capBegin[cap] = EmptyCapture;
2050 capEnd[cap] = EmptyCapture;
2051 } else {
2052 capEnd[cap] = i;
2053 }
2054 }
2055 p = f[p].parent;
2056 } else {
2057 q = f[q].parent;
2058 }
2059 }
2060 }
2061
2062 /*
2063 In any case, we now open the capture zones
2064 we are entering. We work upwards from n
2065 until we reach p (the parent of the atom we
2066 reenter or the youngest common ancestor).
2067 */
2068 while ( n > p ) {
2069 cap = f[n].capture;
2070 if ( cap >= 0 ) {
2071 capBegin[cap] = i;
2072 capEnd[cap] = EmptyCapture;
2073 }
2074 n = f[n].parent;
2075 }
2076 /*
2077 If the next state was already in
2078 mmNextStack, we must choose carefully which
2079 capture zones we want to keep.
2080 */
2081 if ( capBegin == mmTempCapBegin &&
2082 isBetterCapture(capBegin, capEnd,
2083 mmNextCapBegin + m * ncap,
2084 mmNextCapEnd + m * ncap) ) {
2085 memcpy( mmNextCapBegin + m * ncap, capBegin,
2086 ncap * sizeof(int) );
2087 memcpy( mmNextCapEnd + m * ncap, capEnd,
2088 ncap * sizeof(int) );
2089 }
2090 }
2091#ifndef QT_NO_REGEXP_BACKREF
2092 /*
2093 We are done with updating the capture zones.
2094 It's now time to put the next state to sleep,
2095 if it needs to, and to remove it from
2096 mmNextStack.
2097 */
2098 if ( needSomeSleep > 0 ) {
2099 zzZ = new int[1 + 2 * ncap];
2100 zzZ[0] = next;
2101 if ( ncap > 0 ) {
2102 memcpy( zzZ + 1, capBegin, ncap * sizeof(int) );
2103 memcpy( zzZ + 1 + ncap, capEnd,
2104 ncap * sizeof(int) );
2105 }
2106 mmInNextStack[mmNextStack[--nnext]] = -1;
2107 mmSleeping.insert( i + needSomeSleep, zzZ );
2108 }
2109#endif
2110#endif
2111 }
2112 }
2113 }
2114#ifndef QT_NO_REGEXP_CAPTURE
2115 /*
2116 If we reached the final state, hurray! Copy the captured
2117 zone.
2118 */
2119 if ( ncap > 0 && (m = mmInNextStack[FinalState]) != -1 ) {
2120 memcpy( mmCapBegin, mmNextCapBegin + m * ncap, ncap * sizeof(int) );
2121 memcpy( mmCapEnd, mmNextCapEnd + m * ncap, ncap * sizeof(int) );
2122 }
2123#ifndef QT_NO_REGEXP_BACKREF
2124 /*
2125 It's time to wake up the sleepers.
2126 */
2127 if ( !mmSleeping.isEmpty() ) {
2128 while ( (zzZ = mmSleeping.take(i)) != 0 ) {
2129 int next = zzZ[0];
2130 int *capBegin = zzZ + 1;
2131 int *capEnd = zzZ + 1 + ncap;
2132 bool copyOver = TRUE;
2133
2134 if ( (m = mmInNextStack[zzZ[0]]) == -1 ) {
2135 m = nnext++;
2136 mmNextStack[m] = next;
2137 mmInNextStack[next] = m;
2138 } else {
2139 copyOver = isBetterCapture( mmNextCapBegin + m * ncap,
2140 mmNextCapEnd + m * ncap,
2141 capBegin, capEnd );
2142 }
2143 if ( copyOver ) {
2144 memcpy( mmNextCapBegin + m * ncap, capBegin,
2145 ncap * sizeof(int) );
2146 memcpy( mmNextCapEnd + m * ncap, capEnd,
2147 ncap * sizeof(int) );
2148 }
2149 delete[] zzZ;
2150 }
2151 }
2152#endif
2153#endif
2154 for ( j = 0; j < nnext; j++ )
2155 mmInNextStack[mmNextStack[j]] = -1;
2156
2157 // avoid needless iteration that confuses mmMatchedLen
2158 if ( nnext == 1 && mmNextStack[0] == FinalState
2159#ifndef QT_NO_REGEXP_BACKREF
2160 && mmSleeping.isEmpty()
2161#endif
2162 )
2163 stop = TRUE;
2164
2165 qSwap( mmCurStack, mmNextStack );
2166#ifndef QT_NO_REGEXP_CAPTURE
2167 qSwap( mmCurCapBegin, mmNextCapBegin );
2168 qSwap( mmCurCapEnd, mmNextCapEnd );
2169#endif
2170 ncur = nnext;
2171 nnext = 0;
2172 i++;
2173 }
2174
2175#ifndef QT_NO_REGEXP_BACKREF
2176 /*
2177 If minimal matching is enabled, we might have some sleepers
2178 left.
2179 */
2180 while ( !mmSleeping.isEmpty() ) {
2181 zzZ = mmSleeping.take( *QIntDictIterator<int>(mmSleeping) );
2182 delete[] zzZ;
2183 }
2184#endif
2185
2186 mmMatchedLen = i - 1;
2187 return ( mmMatchLen >= 0 );
2188}
2189
2190#ifndef QT_NO_REGEXP_CCLASS
2191
2192QRegExpEngine::CharClass::CharClass()
2193 : c( 0 ), n( FALSE )
2194{
2195#ifndef QT_NO_REGEXP_OPTIM
2196 occ1.fill( NoOccurrence, NumBadChars );
2197#endif
2198}
2199
2200QRegExpEngine::CharClass& QRegExpEngine::CharClass::operator=(
2201 const CharClass& cc )
2202{
2203 c = cc.c;
2204 r = cc.r.copy();
2205 n = cc.n;
2206#ifndef QT_NO_REGEXP_OPTIM
2207 occ1 = cc.occ1;
2208#endif
2209 return *this;
2210}
2211
2212void QRegExpEngine::CharClass::clear()
2213{
2214 c = 0;
2215 r.resize( 0 );
2216 n = FALSE;
2217}
2218
2219void QRegExpEngine::CharClass::setNegative( bool negative )
2220{
2221 n = negative;
2222#ifndef QT_NO_REGEXP_OPTIM
2223 occ1.fill( 0, NumBadChars );
2224#endif
2225}
2226
2227void QRegExpEngine::CharClass::addCategories( int cats )
2228{
2229 c |= cats;
2230#ifndef QT_NO_REGEXP_OPTIM
2231 occ1.fill( 0, NumBadChars );
2232#endif
2233}
2234
2235void QRegExpEngine::CharClass::addRange( ushort from, ushort to )
2236{
2237 if ( from > to )
2238 qSwap( from, to );
2239 int m = r.size();
2240 r.resize( m + 1 );
2241 r[m].from = from;
2242 r[m].to = to;
2243
2244#ifndef QT_NO_REGEXP_OPTIM
2245 int i;
2246
2247 if ( to - from < NumBadChars ) {
2248 occ1.detach();
2249 if ( from % NumBadChars <= to % NumBadChars ) {
2250 for ( i = from % NumBadChars; i <= to % NumBadChars; i++ )
2251 occ1[i] = 0;
2252 } else {
2253 for ( i = 0; i <= to % NumBadChars; i++ )
2254 occ1[i] = 0;
2255 for ( i = from % NumBadChars; i < NumBadChars; i++ )
2256 occ1[i] = 0;
2257 }
2258 } else {
2259 occ1.fill( 0, NumBadChars );
2260 }
2261#endif
2262}
2263
2264bool QRegExpEngine::CharClass::in( QChar ch ) const
2265{
2266#ifndef QT_NO_REGEXP_OPTIM
2267 if ( occ1[BadChar(ch)] == NoOccurrence )
2268 return n;
2269#endif
2270
2271 if ( c != 0 && (c & (1 << (int) ch.category())) != 0 )
2272 return !n;
2273 for ( int i = 0; i < (int) r.size(); i++ ) {
2274 if ( ch.unicode() >= r[i].from && ch.unicode() <= r[i].to )
2275 return !n;
2276 }
2277 return n;
2278}
2279
2280#if defined(QT_DEBUG)
2281void QRegExpEngine::CharClass::dump() const
2282{
2283 int i;
2284 qDebug( " %stive character class", n ? "nega" : "posi" );
2285#ifndef QT_NO_REGEXP_CCLASS
2286 if ( c != 0 )
2287 qDebug( " categories 0x%.8x", c );
2288#endif
2289 for ( i = 0; i < (int) r.size(); i++ )
2290 qDebug( " 0x%.4x through 0x%.4x", r[i].from, r[i].to );
2291}
2292#endif
2293#endif
2294
2295QRegExpEngine::Box::Box( QRegExpEngine *engine )
2296 : eng( engine ), skipanchors( 0 )
2297#ifndef QT_NO_REGEXP_OPTIM
2298 , earlyStart( 0 ), lateStart( 0 ), maxl( 0 )
2299#endif
2300{
2301#ifndef QT_NO_REGEXP_OPTIM
2302 occ1.fill( NoOccurrence, NumBadChars );
2303#endif
2304 minl = 0;
2305}
2306
2307QRegExpEngine::Box& QRegExpEngine::Box::operator=( const Box& b )
2308{
2309 eng = b.eng;
2310 ls = b.ls;
2311 rs = b.rs;
2312 lanchors = b.lanchors;
2313 ranchors = b.ranchors;
2314 skipanchors = b.skipanchors;
2315#ifndef QT_NO_REGEXP_OPTIM
2316 earlyStart = b.earlyStart;
2317 lateStart = b.lateStart;
2318 str = b.str;
2319 leftStr = b.leftStr;
2320 rightStr = b.rightStr;
2321 maxl = b.maxl;
2322 occ1 = b.occ1;
2323#endif
2324 minl = b.minl;
2325 return *this;
2326}
2327
2328void QRegExpEngine::Box::set( QChar ch )
2329{
2330 ls.resize( 1 );
2331 ls[0] = eng->createState( ch );
2332 rs = ls;
2333 rs.detach();
2334#ifndef QT_NO_REGEXP_OPTIM
2335 str = ch;
2336 leftStr = ch;
2337 rightStr = ch;
2338 maxl = 1;
2339 occ1.detach();
2340 occ1[BadChar(ch)] = 0;
2341#endif
2342 minl = 1;
2343}
2344
2345void QRegExpEngine::Box::set( const CharClass& cc )
2346{
2347 ls.resize( 1 );
2348 ls[0] = eng->createState( cc );
2349 rs = ls;
2350 rs.detach();
2351#ifndef QT_NO_REGEXP_OPTIM
2352 maxl = 1;
2353 occ1 = cc.firstOccurrence();
2354#endif
2355 minl = 1;
2356}
2357
2358#ifndef QT_NO_REGEXP_BACKREF
2359void QRegExpEngine::Box::set( int bref )
2360{
2361 ls.resize( 1 );
2362 ls[0] = eng->createState( bref );
2363 rs = ls;
2364 rs.detach();
2365 if ( bref >= 1 && bref <= MaxBackRefs )
2366 skipanchors = Anchor_BackRef0Empty << bref;
2367#ifndef QT_NO_REGEXP_OPTIM
2368 maxl = InftyLen;
2369#endif
2370 minl = 0;
2371}
2372#endif
2373
2374void QRegExpEngine::Box::cat( const Box& b )
2375{
2376 eng->addCatTransitions( rs, b.ls );
2377 addAnchorsToEngine( b );
2378 if ( minl == 0 ) {
2379 mergeInto( &lanchors, b.lanchors );
2380 if ( skipanchors != 0 ) {
2381 for ( int i = 0; i < (int) b.ls.size(); i++ ) {
2382 int a = eng->anchorConcatenation( at(lanchors, b.ls[i]),
2383 skipanchors );
2384 lanchors.insert( b.ls[i], a );
2385 }
2386 }
2387 mergeInto( &ls, b.ls );
2388 }
2389 if ( b.minl == 0 ) {
2390 mergeInto( &ranchors, b.ranchors );
2391 if ( b.skipanchors != 0 ) {
2392 for ( int i = 0; i < (int) rs.size(); i++ ) {
2393 int a = eng->anchorConcatenation( at(ranchors, rs[i]),
2394 b.skipanchors );
2395 ranchors.insert( rs[i], a );
2396 }
2397 }
2398 mergeInto( &rs, b.rs );
2399 } else {
2400 ranchors = b.ranchors;
2401 rs = b.rs;
2402 }
2403
2404#ifndef QT_NO_REGEXP_OPTIM
2405 if ( maxl != InftyLen ) {
2406 if ( rightStr.length() + b.leftStr.length() >
2407 QMAX(str.length(), b.str.length()) ) {
2408 earlyStart = minl - rightStr.length();
2409 lateStart = maxl - rightStr.length();
2410 str = rightStr + b.leftStr;
2411 } else if ( b.str.length() > str.length() ) {
2412 earlyStart = minl + b.earlyStart;
2413 lateStart = maxl + b.lateStart;
2414 str = b.str;
2415 }
2416 }
2417
2418 if ( (int) leftStr.length() == maxl )
2419 leftStr += b.leftStr;
2420 if ( (int) b.rightStr.length() == b.maxl )
2421 rightStr += b.rightStr;
2422 else
2423 rightStr = b.rightStr;
2424
2425 if ( maxl == InftyLen || b.maxl == InftyLen )
2426 maxl = InftyLen;
2427 else
2428 maxl += b.maxl;
2429
2430 occ1.detach();
2431 for ( int i = 0; i < NumBadChars; i++ ) {
2432 if ( b.occ1[i] != NoOccurrence && minl + b.occ1[i] < occ1[i] )
2433 occ1[i] = minl + b.occ1[i];
2434 }
2435#endif
2436
2437 minl += b.minl;
2438 if ( minl == 0 )
2439 skipanchors = eng->anchorConcatenation( skipanchors, b.skipanchors );
2440 else
2441 skipanchors = 0;
2442}
2443
2444void QRegExpEngine::Box::orx( const Box& b )
2445{
2446 mergeInto( &ls, b.ls );
2447 mergeInto( &lanchors, b.lanchors );
2448 mergeInto( &rs, b.rs );
2449 mergeInto( &ranchors, b.ranchors );
2450
2451 if ( b.minl == 0 ) {
2452 if ( minl == 0 )
2453 skipanchors = eng->anchorAlternation( skipanchors, b.skipanchors );
2454 else
2455 skipanchors = b.skipanchors;
2456 }
2457
2458#ifndef QT_NO_REGEXP_OPTIM
2459 occ1.detach();
2460 for ( int i = 0; i < NumBadChars; i++ ) {
2461 if ( occ1[i] > b.occ1[i] )
2462 occ1[i] = b.occ1[i];
2463 }
2464 earlyStart = 0;
2465 lateStart = 0;
2466 str = QString();
2467 leftStr = QString();
2468 rightStr = QString();
2469 if ( b.maxl > maxl )
2470 maxl = b.maxl;
2471#endif
2472 if ( b.minl < minl )
2473 minl = b.minl;
2474}
2475
2476void QRegExpEngine::Box::plus( int atom )
2477{
2478#ifndef QT_NO_REGEXP_CAPTURE
2479 eng->addPlusTransitions( rs, ls, atom );
2480#else
2481 Q_UNUSED( atom );
2482 eng->addCatTransitions( rs, ls );
2483#endif
2484 addAnchorsToEngine( *this );
2485#ifndef QT_NO_REGEXP_OPTIM
2486 maxl = InftyLen;
2487#endif
2488}
2489
2490void QRegExpEngine::Box::opt()
2491{
2492#ifndef QT_NO_REGEXP_OPTIM
2493 earlyStart = 0;
2494 lateStart = 0;
2495 str = QString();
2496 leftStr = QString();
2497 rightStr = QString();
2498#endif
2499 skipanchors = 0;
2500 minl = 0;
2501}
2502
2503void QRegExpEngine::Box::catAnchor( int a )
2504{
2505 if ( a != 0 ) {
2506 for ( int i = 0; i < (int) rs.size(); i++ ) {
2507 a = eng->anchorConcatenation( at(ranchors, rs[i]), a );
2508 ranchors.insert( rs[i], a );
2509 }
2510 if ( minl == 0 )
2511 skipanchors = eng->anchorConcatenation( skipanchors, a );
2512 }
2513}
2514
2515#ifndef QT_NO_REGEXP_OPTIM
2516void QRegExpEngine::Box::setupHeuristics()
2517{
2518 eng->setupGoodStringHeuristic( earlyStart, lateStart, str );
2519
2520 /*
2521 A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2522 1 at this point. An entry of occ1 has to be at most minl or
2523 infinity for the rest of the algorithm to go well.
2524
2525 We waited until here before normalizing these cases (instead of
2526 doing it in Box::orx()) because sometimes things improve by
2527 themselves. Consider for example (112|1)34.
2528 */
2529 for ( int i = 0; i < NumBadChars; i++ ) {
2530 if ( occ1[i] != NoOccurrence && occ1[i] >= minl )
2531 occ1[i] = minl;
2532 }
2533 eng->setupBadCharHeuristic( minl, occ1 );
2534
2535 eng->heuristicallyChooseHeuristic();
2536}
2537#endif
2538
2539#if defined(QT_DEBUG)
2540void QRegExpEngine::Box::dump() const
2541{
2542 int i;
2543 qDebug( "Box of at least %d character%s", minl, minl == 1 ? "" : "s" );
2544 qDebug( " Left states:" );
2545 for ( i = 0; i < (int) ls.size(); i++ ) {
2546 if ( at(lanchors, ls[i]) == 0 )
2547 qDebug( " %d", ls[i] );
2548 else
2549 qDebug( " %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]] );
2550 }
2551 qDebug( " Right states:" );
2552 for ( i = 0; i < (int) rs.size(); i++ ) {
2553 if ( at(ranchors, rs[i]) == 0 )
2554 qDebug( " %d", rs[i] );
2555 else
2556 qDebug( " %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]] );
2557 }
2558 qDebug( " Skip anchors: 0x%.8x", skipanchors );
2559}
2560#endif
2561
2562void QRegExpEngine::Box::addAnchorsToEngine( const Box& to ) const
2563{
2564 for ( int i = 0; i < (int) to.ls.size(); i++ ) {
2565 for ( int j = 0; j < (int) rs.size(); j++ ) {
2566 int a = eng->anchorConcatenation( at(ranchors, rs[j]),
2567 at(to.lanchors, to.ls[i]) );
2568 eng->addAnchors( rs[j], to.ls[i], a );
2569 }
2570 }
2571}
2572
2573int QRegExpEngine::getChar()
2574{
2575 return ( yyPos == yyLen ) ? EOS : yyIn[yyPos++].unicode();
2576}
2577
2578int QRegExpEngine::getEscape()
2579{
2580#ifndef QT_NO_REGEXP_ESCAPE
2581 const char tab[] = "afnrtv"; // no b, as \b means word boundary
2582 const char backTab[] = "\a\f\n\r\t\v";
2583 ushort low;
2584 int i;
2585#endif
2586 ushort val;
2587 int prevCh = yyCh;
2588
2589 if ( prevCh == EOS ) {
2590 error( RXERR_END );
2591 return Tok_Char | '\\';
2592 }
2593 yyCh = getChar();
2594#ifndef QT_NO_REGEXP_ESCAPE
2595 if ( (prevCh & ~0xff) == 0 ) {
2596 const char *p = strchr( tab, prevCh );
2597 if ( p != 0 )
2598 return Tok_Char | backTab[p - tab];
2599 }
2600#endif
2601
2602 switch ( prevCh ) {
2603#ifndef QT_NO_REGEXP_ESCAPE
2604 case '0':
2605 val = 0;
2606 for ( i = 0; i < 3; i++ ) {
2607 if ( yyCh >= '0' && yyCh <= '7' )
2608 val = ( val << 3 ) | ( yyCh - '0' );
2609 else
2610 break;
2611 yyCh = getChar();
2612 }
2613 if ( (val & ~0377) != 0 )
2614 error( RXERR_OCTAL );
2615 return Tok_Char | val;
2616#endif
2617#ifndef QT_NO_REGEXP_ESCAPE
2618 case 'B':
2619 return Tok_NonWord;
2620#endif
2621#ifndef QT_NO_REGEXP_CCLASS
2622 case 'D':
2623 // see QChar::isDigit()
2624 yyCharClass->addCategories( 0x7fffffef );
2625 return Tok_CharClass;
2626 case 'S':
2627 // see QChar::isSpace()
2628 yyCharClass->addCategories( 0x7ffff87f );
2629 yyCharClass->addRange( 0x0000, 0x0008 );
2630 yyCharClass->addRange( 0x000e, 0x001f );
2631 yyCharClass->addRange( 0x007f, 0x009f );
2632 return Tok_CharClass;
2633 case 'W':
2634 // see QChar::isLetterOrNumber()
2635 yyCharClass->addCategories( 0x7ff07f8f );
2636 return Tok_CharClass;
2637#endif
2638#ifndef QT_NO_REGEXP_ESCAPE
2639 case 'b':
2640 return Tok_Word;
2641#endif
2642#ifndef QT_NO_REGEXP_CCLASS
2643 case 'd':
2644 // see QChar::isDigit()
2645 yyCharClass->addCategories( 0x00000010 );
2646 return Tok_CharClass;
2647 case 's':
2648 // see QChar::isSpace()
2649 yyCharClass->addCategories( 0x00000380 );
2650 yyCharClass->addRange( 0x0009, 0x000d );
2651 return Tok_CharClass;
2652 case 'w':
2653 // see QChar::isLetterOrNumber()
2654 yyCharClass->addCategories( 0x000f8070 );
2655 return Tok_CharClass;
2656#endif
2657#ifndef QT_NO_REGEXP_ESCAPE
2658 case 'x':
2659 val = 0;
2660 for ( i = 0; i < 4; i++ ) {
2661 low = QChar( yyCh ).lower();
2662 if ( low >= '0' && low <= '9' )
2663 val = ( val << 4 ) | ( low - '0' );
2664 else if ( low >= 'a' && low <= 'f' )
2665 val = ( val << 4 ) | ( low - 'a' + 10 );
2666 else
2667 break;
2668 yyCh = getChar();
2669 }
2670 return Tok_Char | val;
2671#endif
2672 default:
2673 if ( prevCh >= '1' && prevCh <= '9' ) {
2674#ifndef QT_NO_REGEXP_BACKREF
2675 val = prevCh - '0';
2676 while ( yyCh >= '0' && yyCh <= '9' ) {
2677 val = ( val *= 10 ) | ( yyCh - '0' );
2678 yyCh = getChar();
2679 }
2680 return Tok_BackRef | val;
2681#else
2682 error( RXERR_DISABLED );
2683#endif
2684 }
2685 return Tok_Char | prevCh;
2686 }
2687}
2688
2689#ifndef QT_NO_REGEXP_INTERVAL
2690int QRegExpEngine::getRep( int def )
2691{
2692 if ( yyCh >= '0' && yyCh <= '9' ) {
2693 int rep = 0;
2694 do {
2695 rep = 10 * rep + yyCh - '0';
2696 if ( rep >= InftyRep ) {
2697 error( RXERR_REPETITION );
2698 rep = def;
2699 }
2700 yyCh = getChar();
2701 } while ( yyCh >= '0' && yyCh <= '9' );
2702 return rep;
2703 } else {
2704 return def;
2705 }
2706}
2707#endif
2708
2709#ifndef QT_NO_REGEXP_LOOKAHEAD
2710void QRegExpEngine::skipChars( int n )
2711{
2712 if ( n > 0 ) {
2713 yyPos += n - 1;
2714 yyCh = getChar();
2715 }
2716}
2717#endif
2718
2719void QRegExpEngine::error( const char *msg )
2720{
2721 if ( yyError.isEmpty() )
2722 yyError = QString::fromLatin1( msg );
2723}
2724
2725void QRegExpEngine::startTokenizer( const QChar *rx, int len )
2726{
2727 yyIn = rx;
2728 yyPos0 = 0;
2729 yyPos = 0;
2730 yyLen = len;
2731 yyCh = getChar();
2732 yyCharClass = new CharClass;
2733 yyMinRep = 0;
2734 yyMaxRep = 0;
2735 yyError = QString();
2736}
2737
2738int QRegExpEngine::getToken()
2739{
2740#ifndef QT_NO_REGEXP_CCLASS
2741 ushort pendingCh = 0;
2742 bool charPending;
2743 bool rangePending;
2744 int tok;
2745#endif
2746 int prevCh = yyCh;
2747
2748 yyPos0 = yyPos - 1;
2749#ifndef QT_NO_REGEXP_CCLASS
2750 yyCharClass->clear();
2751#endif
2752 yyMinRep = 0;
2753 yyMaxRep = 0;
2754 yyCh = getChar();
2755 switch ( prevCh ) {
2756 case EOS:
2757 yyPos0 = yyPos;
2758 return Tok_Eos;
2759 case '$':
2760 return Tok_Dollar;
2761 case '(':
2762 if ( yyCh == '?' ) {
2763 prevCh = getChar();
2764 yyCh = getChar();
2765 switch ( prevCh ) {
2766#ifndef QT_NO_REGEXP_LOOKAHEAD
2767 case '!':
2768 return Tok_NegLookahead;
2769 case '=':
2770 return Tok_PosLookahead;
2771#endif
2772 case ':':
2773 return Tok_MagicLeftParen;
2774 default:
2775 error( RXERR_LOOKAHEAD );
2776 return Tok_MagicLeftParen;
2777 }
2778 } else {
2779 return Tok_LeftParen;
2780 }
2781 case ')':
2782 return Tok_RightParen;
2783 case '*':
2784 yyMinRep = 0;
2785 yyMaxRep = InftyRep;
2786 return Tok_Quantifier;
2787 case '+':
2788 yyMinRep = 1;
2789 yyMaxRep = InftyRep;
2790 return Tok_Quantifier;
2791 case '.':
2792#ifndef QT_NO_REGEXP_CCLASS
2793 yyCharClass->setNegative( TRUE );
2794#endif
2795 return Tok_CharClass;
2796 case '?':
2797 yyMinRep = 0;
2798 yyMaxRep = 1;
2799 return Tok_Quantifier;
2800 case '[':
2801#ifndef QT_NO_REGEXP_CCLASS
2802 if ( yyCh == '^' ) {
2803 yyCharClass->setNegative( TRUE );
2804 yyCh = getChar();
2805 }
2806 charPending = FALSE;
2807 rangePending = FALSE;
2808 do {
2809 if ( yyCh == '-' && charPending && !rangePending ) {
2810 rangePending = TRUE;
2811 yyCh = getChar();
2812 } else {
2813 if ( charPending && !rangePending ) {
2814 yyCharClass->addSingleton( pendingCh );
2815 charPending = FALSE;
2816 }
2817 if ( yyCh == '\\' ) {
2818 yyCh = getChar();
2819 tok = getEscape();
2820 if ( tok == Tok_Word )
2821 tok = '\b';
2822 } else {
2823 tok = Tok_Char | yyCh;
2824 yyCh = getChar();
2825 }
2826 if ( tok == Tok_CharClass ) {
2827 if ( rangePending ) {
2828 yyCharClass->addSingleton( '-' );
2829 yyCharClass->addSingleton( pendingCh );
2830 charPending = FALSE;
2831 rangePending = FALSE;
2832 }
2833 } else if ( (tok & Tok_Char) != 0 ) {
2834 if ( rangePending ) {
2835 yyCharClass->addRange( pendingCh, tok ^ Tok_Char );
2836 charPending = FALSE;
2837 rangePending = FALSE;
2838 } else {
2839 pendingCh = tok ^ Tok_Char;
2840 charPending = TRUE;
2841 }
2842 } else {
2843 error( RXERR_CHARCLASS );
2844 }
2845 }
2846 } while ( yyCh != ']' && yyCh != EOS );
2847 if ( rangePending )
2848 yyCharClass->addSingleton( '-' );
2849 if ( charPending )
2850 yyCharClass->addSingleton( pendingCh );
2851 if ( yyCh == EOS )
2852 error( RXERR_END );
2853 else
2854 yyCh = getChar();
2855 return Tok_CharClass;
2856#else
2857 error( RXERR_END );
2858 return Tok_Char | '[';
2859#endif
2860 case '\\':
2861 return getEscape();
2862 case ']':
2863 error( RXERR_LEFTDELIM );
2864 return Tok_Char | ']';
2865 case '^':
2866 return Tok_Caret;
2867 case '{':
2868#ifndef QT_NO_REGEXP_INTERVAL
2869 yyMinRep = getRep( 0 );
2870 yyMaxRep = yyMinRep;
2871 if ( yyCh == ',' ) {
2872 yyCh = getChar();
2873 yyMaxRep = getRep( InftyRep );
2874 }
2875 if ( yyMaxRep < yyMinRep )
2876 qSwap( yyMinRep, yyMaxRep );
2877 if ( yyCh != '}' )
2878 error( RXERR_REPETITION );
2879 yyCh = getChar();
2880 return Tok_Quantifier;
2881#else
2882 error( RXERR_DISABLED );
2883 return Tok_Char | '{';
2884#endif
2885 case '|':
2886 return Tok_Bar;
2887 case '}':
2888 error( RXERR_LEFTDELIM );
2889 return Tok_Char | '}';
2890 default:
2891 return Tok_Char | prevCh;
2892 }
2893}
2894
2895int QRegExpEngine::parse( const QChar *pattern, int len )
2896{
2897 valid = TRUE;
2898 startTokenizer( pattern, len );
2899 yyTok = getToken();
2900#ifndef QT_NO_REGEXP_CAPTURE
2901 yyMayCapture = TRUE;
2902#else
2903 yyMayCapture = FALSE;
2904#endif
2905
2906#ifndef QT_NO_REGEXP_CAPTURE
2907 int atom = startAtom( FALSE );
2908#endif
2909 CharClass anything;
2910 Box box( this ); // create InitialState
2911 box.set( anything );
2912 Box rightBox( this ); // create FinalState
2913 rightBox.set( anything );
2914
2915 Box middleBox( this );
2916 parseExpression( &middleBox );
2917#ifndef QT_NO_REGEXP_CAPTURE
2918 finishAtom( atom );
2919#endif
2920#ifndef QT_NO_REGEXP_OPTIM
2921 middleBox.setupHeuristics();
2922#endif
2923 box.cat( middleBox );
2924 box.cat( rightBox );
2925 delete yyCharClass;
2926 yyCharClass = 0;
2927
2928 officialncap = ncap;
2929#ifndef QT_NO_REGEXP_BACKREF
2930 if ( nbrefs > ncap )
2931 ncap = nbrefs;
2932#endif
2933
2934 mmCaptured.resize( 2 + 2 * officialncap );
2935 mmCapturedNoMatch.fill( -1, 2 + 2 * officialncap );
2936
2937 /*
2938 We use one QMemArray<int> for all the big data used a lot in
2939 matchHere() and friends.
2940 */
2941#ifndef QT_NO_REGEXP_OPTIM
2942 mmSlideTabSize = QMAX( minl + 1, 16 );
2943#else
2944 mmSlideTabSize = 0;
2945#endif
2946 mmBigArray.resize( (3 + 4 * ncap) * ns + 4 * ncap + mmSlideTabSize );
2947
2948 mmInNextStack = mmBigArray.data();
2949 memset( mmInNextStack, -1, ns * sizeof(int) );
2950 mmCurStack = mmInNextStack + ns;
2951 mmNextStack = mmInNextStack + 2 * ns;
2952
2953 mmCurCapBegin = mmInNextStack + 3 * ns;
2954 mmNextCapBegin = mmCurCapBegin + ncap * ns;
2955 mmCurCapEnd = mmCurCapBegin + 2 * ncap * ns;
2956 mmNextCapEnd = mmCurCapBegin + 3 * ncap * ns;
2957
2958 mmTempCapBegin = mmCurCapBegin + 4 * ncap * ns;
2959 mmTempCapEnd = mmTempCapBegin + ncap;
2960 mmCapBegin = mmTempCapBegin + 2 * ncap;
2961 mmCapEnd = mmTempCapBegin + 3 * ncap;
2962
2963 mmSlideTab = mmTempCapBegin + 4 * ncap;
2964
2965 if ( !yyError.isEmpty() )
2966 return -1;
2967
2968#ifndef QT_NO_REGEXP_OPTIM
2969 State *sinit = s[InitialState];
2970 caretAnchored = ( sinit->anchors != 0 );
2971 if ( caretAnchored ) {
2972 QMap<int, int>& anchors = *sinit->anchors;
2973 QMap<int, int>::ConstIterator a;
2974 for ( a = anchors.begin(); a != anchors.end(); ++a ) {
2975#ifndef QT_NO_REGEXP_ANCHOR_ALT
2976 if ( (*a & Anchor_Alternation) != 0 )
2977 break;
2978#endif
2979 if ( (*a & Anchor_Caret) == 0 ) {
2980 caretAnchored = FALSE;
2981 break;
2982 }
2983 }
2984 }
2985#endif
2986 return yyPos0;
2987}
2988
2989void QRegExpEngine::parseAtom( Box *box )
2990{
2991#ifndef QT_NO_REGEXP_LOOKAHEAD
2992 QRegExpEngine *eng = 0;
2993 bool neg;
2994 int len;
2995#endif
2996
2997 switch ( yyTok ) {
2998 case Tok_Dollar:
2999 box->catAnchor( Anchor_Dollar );
3000 break;
3001 case Tok_Caret:
3002 box->catAnchor( Anchor_Caret );
3003 break;
3004#ifndef QT_NO_REGEXP_LOOKAHEAD
3005 case Tok_PosLookahead:
3006 case Tok_NegLookahead:
3007 neg = ( yyTok == Tok_NegLookahead );
3008 eng = new QRegExpEngine( cs );
3009 len = eng->parse( yyIn + yyPos - 1, yyLen - yyPos + 1 );
3010 if ( len >= 0 )
3011 skipChars( len );
3012 else
3013 error( RXERR_LOOKAHEAD );
3014 box->catAnchor( addLookahead(eng, neg) );
3015 yyTok = getToken();
3016 if ( yyTok != Tok_RightParen )
3017 error( RXERR_LOOKAHEAD );
3018 break;
3019#endif
3020#ifndef QT_NO_REGEXP_ESCAPE
3021 case Tok_Word:
3022 box->catAnchor( Anchor_Word );
3023 break;
3024 case Tok_NonWord:
3025 box->catAnchor( Anchor_NonWord );
3026 break;
3027#endif
3028 case Tok_LeftParen:
3029 case Tok_MagicLeftParen:
3030 yyTok = getToken();
3031 parseExpression( box );
3032 if ( yyTok != Tok_RightParen )
3033 error( RXERR_END );
3034 break;
3035 case Tok_CharClass:
3036 box->set( *yyCharClass );
3037 break;
3038 case Tok_Quantifier:
3039 error( RXERR_REPETITION );
3040 break;
3041 default:
3042 if ( (yyTok & Tok_Char) != 0 )
3043 box->set( QChar(yyTok ^ Tok_Char) );
3044#ifndef QT_NO_REGEXP_BACKREF
3045 else if ( (yyTok & Tok_BackRef) != 0 )
3046 box->set( yyTok ^ Tok_BackRef );
3047#endif
3048 else
3049 error( RXERR_DISABLED );
3050 }
3051 yyTok = getToken();
3052}
3053
3054void QRegExpEngine::parseFactor( Box *box )
3055{
3056#ifndef QT_NO_REGEXP_CAPTURE
3057 int atom = startAtom( yyMayCapture && yyTok == Tok_LeftParen );
3058#else
3059 static const int atom = 0;
3060#endif
3061
3062#ifndef QT_NO_REGEXP_INTERVAL
3063#define YYREDO() \
3064 yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3065 *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3066
3067 const QChar *in = yyIn;
3068 int pos0 = yyPos0;
3069 int pos = yyPos;
3070 int len = yyLen;
3071 int ch = yyCh;
3072 CharClass charClass;
3073 if ( yyTok == Tok_CharClass )
3074 charClass = *yyCharClass;
3075 int tok = yyTok;
3076 bool mayCapture = yyMayCapture;
3077#endif
3078
3079 parseAtom( box );
3080#ifndef QT_NO_REGEXP_CAPTURE
3081 finishAtom( atom );
3082#endif
3083
3084 if ( yyTok == Tok_Quantifier ) {
3085 if ( yyMaxRep == InftyRep ) {
3086 box->plus( atom );
3087#ifndef QT_NO_REGEXP_INTERVAL
3088 } else if ( yyMaxRep == 0 ) {
3089 box->clear();
3090#endif
3091 }
3092 if ( yyMinRep == 0 )
3093 box->opt();
3094
3095#ifndef QT_NO_REGEXP_INTERVAL
3096 yyMayCapture = FALSE;
3097 int alpha = ( yyMinRep == 0 ) ? 0 : yyMinRep - 1;
3098 int beta = ( yyMaxRep == InftyRep ) ? 0 : yyMaxRep - ( alpha + 1 );
3099
3100 Box rightBox( this );
3101 int i;
3102
3103 for ( i = 0; i < beta; i++ ) {
3104 YYREDO();
3105 Box leftBox( this );
3106 parseAtom( &leftBox );
3107 leftBox.cat( rightBox );
3108 leftBox.opt();
3109 rightBox = leftBox;
3110 }
3111 for ( i = 0; i < alpha; i++ ) {
3112 YYREDO();
3113 Box leftBox( this );
3114 parseAtom( &leftBox );
3115 leftBox.cat( rightBox );
3116 rightBox = leftBox;
3117 }
3118 rightBox.cat( *box );
3119 *box = rightBox;
3120#endif
3121 yyTok = getToken();
3122#ifndef QT_NO_REGEXP_INTERVAL
3123 yyMayCapture = mayCapture;
3124#endif
3125 }
3126#undef YYREDO
3127}
3128
3129void QRegExpEngine::parseTerm( Box *box )
3130{
3131#ifndef QT_NO_REGEXP_OPTIM
3132 if ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar )
3133 parseFactor( box );
3134#endif
3135 while ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) {
3136 Box rightBox( this );
3137 parseFactor( &rightBox );
3138 box->cat( rightBox );
3139 }
3140}
3141
3142void QRegExpEngine::parseExpression( Box *box )
3143{
3144 parseTerm( box );
3145 while ( yyTok == Tok_Bar ) {
3146 Box rightBox( this );
3147 yyTok = getToken();
3148 parseTerm( &rightBox );
3149 box->orx( rightBox );
3150 }
3151}
3152
3153/*
3154 The struct QRegExpPrivate contains the private data of a regular
3155 expression other than the automaton. It makes it possible for many
3156 QRegExp objects to use the same QRegExpEngine object with different
3157 QRegExpPrivate objects.
3158*/
3159struct QRegExpPrivate
3160{
3161 QString pattern; // regular-expression or wildcard pattern
3162 QString rxpattern; // regular-expression pattern
3163#ifndef QT_NO_REGEXP_WILDCARD
3164 bool wc; // wildcard mode?
3165#endif
3166 bool min; // minimal matching? (instead of maximal)
3167#ifndef QT_NO_REGEXP_CAPTURE
3168 QString t; // last string passed to QRegExp::search() or searchRev()
3169 QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3170#endif
3171 QMemArray<int> captured; // what QRegExpEngine::search() returned last
3172
3173 QRegExpPrivate() { captured.fill( -1, 2 ); }
3174};
3175
3176#ifndef QT_NO_REGEXP_OPTIM
3177static QCache<QRegExpEngine> *engineCache = 0;
3178static QSingleCleanupHandler<QCache<QRegExpEngine> > cleanup_cache;
3179#endif
3180
3181static QRegExpEngine *newEngine( const QString& pattern, bool caseSensitive )
3182{
3183#ifndef QT_NO_REGEXP_OPTIM
3184 if ( engineCache != 0 ) {
3185#ifdef QT_THREAD_SUPPORT
3186 QMutexLocker locker( qt_global_mutexpool->get( &engineCache ) );
3187#endif
3188 QRegExpEngine *eng = engineCache->take( pattern );
3189 if ( eng == 0 || eng->caseSensitive() != caseSensitive ) {
3190 delete eng;
3191 } else {
3192 eng->ref();
3193 return eng;
3194 }
3195 }
3196#endif
3197 return new QRegExpEngine( pattern, caseSensitive );
3198}
3199
3200static void derefEngine( QRegExpEngine *eng, const QString& pattern )
3201{
3202 if ( eng != 0 && eng->deref() ) {
3203#ifndef QT_NO_REGEXP_OPTIM
3204#ifdef QT_THREAD_SUPPORT
3205 QMutexLocker locker( qt_global_mutexpool->get( &engineCache ) );
3206#endif
3207 if ( engineCache == 0 ) {
3208 engineCache = new QCache<QRegExpEngine>;
3209 engineCache->setAutoDelete( TRUE );
3210 cleanup_cache.set( &engineCache );
3211 }
3212 if ( !pattern.isNull() &&
3213 engineCache->insert(pattern, eng, 4 + pattern.length() / 4) )
3214 return;
3215#else
3216 Q_UNUSED( pattern );
3217#endif
3218 delete eng;
3219 }
3220}
3221
3222/*!
3223 \enum QRegExp::CaretMode
3224
3225 The CaretMode enum defines the different meanings of the caret
3226 (<b>^</b>) in a regular expression. The possible values are:
3227
3228 \value CaretAtZero
3229 The caret corresponds to index 0 in the searched string.
3230
3231 \value CaretAtOffset
3232 The caret corresponds to the start offset of the search.
3233
3234 \value CaretWontMatch
3235 The caret never matches.
3236*/
3237
3238/*!
3239 Constructs an empty regexp.
3240
3241 \sa isValid() errorString()
3242*/
3243QRegExp::QRegExp()
3244{
3245 eng = new QRegExpEngine( TRUE );
3246 priv = new QRegExpPrivate;
3247#ifndef QT_NO_REGEXP_WILDCARD
3248 priv->wc = FALSE;
3249#endif
3250 priv->min = FALSE;
3251 compile( TRUE );
3252}
3253
3254/*!
3255 Constructs a regular expression object for the given \a pattern
3256 string. The pattern must be given using wildcard notation if \a
3257 wildcard is TRUE (default is FALSE). The pattern is case
3258 sensitive, unless \a caseSensitive is FALSE. Matching is greedy
3259 (maximal), but can be changed by calling setMinimal().
3260
3261 \sa setPattern() setCaseSensitive() setWildcard() setMinimal()
3262*/
3263QRegExp::QRegExp( const QString& pattern, bool caseSensitive, bool wildcard )
3264{
3265 eng = 0;
3266 priv = new QRegExpPrivate;
3267 priv->pattern = pattern;
3268#ifndef QT_NO_REGEXP_WILDCARD
3269 priv->wc = wildcard;
3270#endif
3271 priv->min = FALSE;
3272 compile( caseSensitive );
3273}
3274
3275/*!
3276 Constructs a regular expression as a copy of \a rx.
3277
3278 \sa operator=()
3279*/
3280QRegExp::QRegExp( const QRegExp& rx )
3281{
3282 eng = 0;
3283 priv = new QRegExpPrivate;
3284 operator=( rx );
3285}
3286
3287/*!
3288 Destroys the regular expression and cleans up its internal data.
3289*/
3290QRegExp::~QRegExp()
3291{
3292 derefEngine( eng, priv->rxpattern );
3293 delete priv;
3294}
3295
3296/*!
3297 Copies the regular expression \a rx and returns a reference to the
3298 copy. The case sensitivity, wildcard and minimal matching options
3299 are also copied.
3300*/
3301QRegExp& QRegExp::operator=( const QRegExp& rx )
3302{
3303 rx.eng->ref();
3304 derefEngine( eng, priv->rxpattern );
3305 eng = rx.eng;
3306 priv->pattern = rx.priv->pattern;
3307 priv->rxpattern = rx.priv->rxpattern;
3308#ifndef QT_NO_REGEXP_WILDCARD
3309 priv->wc = rx.priv->wc;
3310#endif
3311 priv->min = rx.priv->min;
3312#ifndef QT_NO_REGEXP_CAPTURE
3313 priv->t = rx.priv->t;
3314 priv->capturedCache = rx.priv->capturedCache;
3315#endif
3316 priv->captured = rx.priv->captured;
3317 return *this;
3318}
3319
3320/*!
3321 Returns TRUE if this regular expression is equal to \a rx;
3322 otherwise returns FALSE.
3323
3324 Two QRegExp objects are equal if they have the same pattern
3325 strings and the same settings for case sensitivity, wildcard and
3326 minimal matching.
3327*/
3328bool QRegExp::operator==( const QRegExp& rx ) const
3329{
3330 return priv->pattern == rx.priv->pattern &&
3331 eng->caseSensitive() == rx.eng->caseSensitive() &&
3332#ifndef QT_NO_REGEXP_WILDCARD
3333 priv->wc == rx.priv->wc &&
3334#endif
3335 priv->min == rx.priv->min;
3336}
3337
3338/*!
3339 \fn bool QRegExp::operator!=( const QRegExp& rx ) const
3340
3341 Returns TRUE if this regular expression is not equal to \a rx;
3342 otherwise returns FALSE.
3343
3344 \sa operator==()
3345*/
3346
3347/*!
3348 Returns TRUE if the pattern string is empty; otherwise returns
3349 FALSE.
3350
3351 If you call exactMatch() with an empty pattern on an empty string
3352 it will return TRUE; otherwise it returns FALSE since it operates
3353 over the whole string. If you call search() with an empty pattern
3354 on \e any string it will return the start offset (0 by default)
3355 because the empty pattern matches the 'emptiness' at the start of
3356 the string. In this case the length of the match returned by
3357 matchedLength() will be 0.
3358
3359 See QString::isEmpty().
3360*/
3361
3362bool QRegExp::isEmpty() const
3363{
3364 return priv->pattern.isEmpty();
3365}
3366
3367/*!
3368 Returns TRUE if the regular expression is valid; otherwise returns
3369 FALSE. An invalid regular expression never matches.
3370
3371 The pattern <b>[a-z</b> is an example of an invalid pattern, since
3372 it lacks a closing square bracket.
3373
3374 Note that the validity of a regexp may also depend on the setting
3375 of the wildcard flag, for example <b>*.html</b> is a valid
3376 wildcard regexp but an invalid full regexp.
3377
3378 \sa errorString()
3379*/
3380bool QRegExp::isValid() const
3381{
3382 return eng->isValid();
3383}
3384
3385/*!
3386 Returns the pattern string of the regular expression. The pattern
3387 has either regular expression syntax or wildcard syntax, depending
3388 on wildcard().
3389
3390 \sa setPattern()
3391*/
3392QString QRegExp::pattern() const
3393{
3394 return priv->pattern;
3395}
3396
3397/*!
3398 Sets the pattern string to \a pattern. The case sensitivity,
3399 wildcard and minimal matching options are not changed.
3400
3401 \sa pattern()
3402*/
3403void QRegExp::setPattern( const QString& pattern )
3404{
3405 if ( priv->pattern != pattern ) {
3406 priv->pattern = pattern;
3407 compile( caseSensitive() );
3408 }
3409}
3410
3411/*!
3412 Returns TRUE if case sensitivity is enabled; otherwise returns
3413 FALSE. The default is TRUE.
3414
3415 \sa setCaseSensitive()
3416*/
3417bool QRegExp::caseSensitive() const
3418{
3419 return eng->caseSensitive();
3420}
3421
3422/*!
3423 Sets case sensitive matching to \a sensitive.
3424
3425 If \a sensitive is TRUE, <b>\\.txt$</b> matches \c{readme.txt} but
3426 not \c{README.TXT}.
3427
3428 \sa caseSensitive()
3429*/
3430void QRegExp::setCaseSensitive( bool sensitive )
3431{
3432 if ( sensitive != eng->caseSensitive() )
3433 compile( sensitive );
3434}
3435
3436#ifndef QT_NO_REGEXP_WILDCARD
3437/*!
3438 Returns TRUE if wildcard mode is enabled; otherwise returns FALSE.
3439 The default is FALSE.
3440
3441 \sa setWildcard()
3442*/
3443bool QRegExp::wildcard() const
3444{
3445 return priv->wc;
3446}
3447
3448/*!
3449 Sets the wildcard mode for the regular expression. The default is
3450 FALSE.
3451
3452 Setting \a wildcard to TRUE enables simple shell-like wildcard
3453 matching. (See \link #wildcard-matching wildcard matching
3454 (globbing) \endlink.)
3455
3456 For example, <b>r*.txt</b> matches the string \c{readme.txt} in
3457 wildcard mode, but does not match \c{readme}.
3458
3459 \sa wildcard()
3460*/
3461void QRegExp::setWildcard( bool wildcard )
3462{
3463 if ( wildcard != priv->wc ) {
3464 priv->wc = wildcard;
3465 compile( caseSensitive() );
3466 }
3467}
3468#endif
3469
3470/*!
3471 Returns TRUE if minimal (non-greedy) matching is enabled;
3472 otherwise returns FALSE.
3473
3474 \sa setMinimal()
3475*/
3476bool QRegExp::minimal() const
3477{
3478 return priv->min;
3479}
3480
3481/*!
3482 Enables or disables minimal matching. If \a minimal is FALSE,
3483 matching is greedy (maximal) which is the default.
3484
3485 For example, suppose we have the input string "We must be
3486 \<b>bold\</b>, very \<b>bold\</b>!" and the pattern
3487 <b>\<b>.*\</b></b>. With the default greedy (maximal) matching,
3488 the match is "We must be <u>\<b>bold\</b>, very
3489 \<b>bold\</b></u>!". But with minimal (non-greedy) matching the
3490 first match is: "We must be <u>\<b>bold\</b></u>, very
3491 \<b>bold\</b>!" and the second match is "We must be \<b>bold\</b>,
3492 very <u>\<b>bold\</b></u>!". In practice we might use the pattern
3493 <b>\<b>[^\<]+\</b></b> instead, although this will still fail for
3494 nested tags.
3495
3496 \sa minimal()
3497*/
3498void QRegExp::setMinimal( bool minimal )
3499{
3500 priv->min = minimal;
3501}
3502
3503/*!
3504 Returns TRUE if \a str is matched exactly by this regular
3505 expression; otherwise returns FALSE. You can determine how much of
3506 the string was matched by calling matchedLength().
3507
3508 For a given regexp string, R, exactMatch("R") is the equivalent of
3509 search("^R$") since exactMatch() effectively encloses the regexp
3510 in the start of string and end of string anchors, except that it
3511 sets matchedLength() differently.
3512
3513 For example, if the regular expression is <b>blue</b>, then
3514 exactMatch() returns TRUE only for input \c blue. For inputs \c
3515 bluebell, \c blutak and \c lightblue, exactMatch() returns FALSE
3516 and matchedLength() will return 4, 3 and 0 respectively.
3517
3518 Although const, this function sets matchedLength(),
3519 capturedTexts() and pos().
3520
3521 \sa search() searchRev() QRegExpValidator
3522*/
3523bool QRegExp::exactMatch( const QString& str ) const
3524{
3525#ifndef QT_NO_REGEXP_CAPTURE
3526 priv->t = str;
3527 priv->capturedCache.clear();
3528#endif
3529
3530 priv->captured = eng->match( str, 0, priv->min, TRUE, 0 );
3531 if ( priv->captured[1] == (int) str.length() ) {
3532 return TRUE;
3533 } else {
3534 priv->captured.detach();
3535 priv->captured[0] = 0;
3536 priv->captured[1] = eng->matchedLength();
3537 return FALSE;
3538 }
3539}
3540
3541#ifndef QT_NO_COMPAT
3542/*! \obsolete
3543
3544 Attempts to match in \a str, starting from position \a index.
3545 Returns the position of the match, or -1 if there was no match.
3546
3547 The length of the match is stored in \a *len, unless \a len is a
3548 null pointer.
3549
3550 If \a indexIsStart is TRUE (the default), the position \a index in
3551 the string will match the start of string anchor, <b>^</b>, in the
3552 regexp, if present. Otherwise, position 0 in \a str will match.
3553
3554 Use search() and matchedLength() instead of this function.
3555
3556 \sa QString::mid() QConstString
3557*/
3558int QRegExp::match( const QString& str, int index, int *len,
3559 bool indexIsStart ) const
3560{
3561 int pos = search( str, index, indexIsStart ? CaretAtOffset : CaretAtZero );
3562 if ( len != 0 )
3563 *len = matchedLength();
3564 return pos;
3565}
3566#endif // QT_NO_COMPAT
3567
3568/*!
3569 \overload
3570
3571 This convenience function searches with a \c CaretMode of \c
3572 CaretAtZero which is the most common usage.
3573*/
3574
3575int QRegExp::search( const QString& str, int offset ) const
3576{
3577 return search( str, offset, CaretAtZero );
3578}
3579
3580/*!
3581 Attempts to find a match in \a str from position \a offset (0 by
3582 default). If \a offset is -1, the search starts at the last
3583 character; if -2, at the next to last character; etc.
3584
3585 Returns the position of the first match, or -1 if there was no
3586 match.
3587
3588 The \a caretMode parameter can be used to instruct whether <b>^</b>
3589 should match at index 0 or at \a offset.
3590
3591 You might prefer to use QString::find(), QString::contains() or
3592 even QStringList::grep(). To replace matches use
3593 QString::replace().
3594
3595 Example:
3596 \code
3597 QString str = "offsets: 1.23 .50 71.00 6.00";
3598 QRegExp rx( "\\d*\\.\\d+" ); // primitive floating point matching
3599 int count = 0;
3600 int pos = 0;
3601 while ( (pos = rx.search(str, pos)) != -1 ) {
3602 count++;
3603 pos += rx.matchedLength();
3604 }
3605 // pos will be 9, 14, 18 and finally 24; count will end up as 4
3606 \endcode
3607
3608 Although const, this function sets matchedLength(),
3609 capturedTexts() and pos().
3610
3611 \sa searchRev() exactMatch()
3612*/
3613
3614int QRegExp::search( const QString& str, int offset, CaretMode caretMode ) const
3615{
3616 if ( offset < 0 )
3617 offset += str.length();
3618#ifndef QT_NO_REGEXP_CAPTURE
3619 priv->t = str;
3620 priv->capturedCache.clear();
3621#endif
3622 priv->captured = eng->match( str, offset, priv->min, FALSE,
3623 caretIndex(offset, caretMode) );
3624 return priv->captured[0];
3625}
3626
3627
3628/*!
3629 \overload
3630
3631 This convenience function searches with a \c CaretMode of \c
3632 CaretAtZero which is the most common usage.
3633*/
3634
3635int QRegExp::searchRev( const QString& str, int offset ) const
3636{
3637 return searchRev( str, offset, CaretAtZero );
3638}
3639
3640/*!
3641 Attempts to find a match backwards in \a str from position \a
3642 offset. If \a offset is -1 (the default), the search starts at the
3643 last character; if -2, at the next to last character; etc.
3644
3645 Returns the position of the first match, or -1 if there was no
3646 match.
3647
3648 The \a caretMode parameter can be used to instruct whether <b>^</b>
3649 should match at index 0 or at \a offset.
3650
3651 Although const, this function sets matchedLength(),
3652 capturedTexts() and pos().
3653
3654 \warning Searching backwards is much slower than searching
3655 forwards.
3656
3657 \sa search() exactMatch()
3658*/
3659
3660int QRegExp::searchRev( const QString& str, int offset,
3661 CaretMode caretMode ) const
3662{
3663 if ( offset < 0 )
3664 offset += str.length();
3665#ifndef QT_NO_REGEXP_CAPTURE
3666 priv->t = str;
3667 priv->capturedCache.clear();
3668#endif
3669 if ( offset < 0 || offset > (int) str.length() ) {
3670 priv->captured.detach();
3671 priv->captured.fill( -1 );
3672 return -1;
3673 }
3674
3675 while ( offset >= 0 ) {
3676 priv->captured = eng->match( str, offset, priv->min, TRUE,
3677 caretIndex(offset, caretMode) );
3678 if ( priv->captured[0] == offset )
3679 return offset;
3680 offset--;
3681 }
3682 return -1;
3683}
3684
3685/*!
3686 Returns the length of the last matched string, or -1 if there was
3687 no match.
3688
3689 \sa exactMatch() search() searchRev()
3690*/
3691int QRegExp::matchedLength() const
3692{
3693 return priv->captured[1];
3694}
3695
3696#ifndef QT_NO_REGEXP_CAPTURE
3697/*!
3698 Returns the number of captures contained in the regular expression.
3699 */
3700int QRegExp::numCaptures() const
3701{
3702 return eng->numCaptures();
3703}
3704
3705
3706
3707/*!
3708 Returns a list of the captured text strings.
3709
3710 The first string in the list is the entire matched string. Each
3711 subsequent list element contains a string that matched a
3712 (capturing) subexpression of the regexp.
3713
3714 For example:
3715 \code
3716 QRegExp rx( "(\\d+)(\\s*)(cm|inch(es)?)" );
3717 int pos = rx.search( "Length: 36 inches" );
3718 QStringList list = rx.capturedTexts();
3719 // list is now ( "36 inches", "36", " ", "inches", "es" )
3720 \endcode
3721
3722 The above example also captures elements that may be present but
3723 which we have no interest in. This problem can be solved by using
3724 non-capturing parentheses:
3725
3726 \code
3727 QRegExp rx( "(\\d+)(?:\\s*)(cm|inch(?:es)?)" );
3728 int pos = rx.search( "Length: 36 inches" );
3729 QStringList list = rx.capturedTexts();
3730 // list is now ( "36 inches", "36", "inches" )
3731 \endcode
3732
3733 Note that if you want to iterate over the list, you should iterate
3734 over a copy, e.g.
3735 \code
3736 QStringList list = rx.capturedTexts();
3737 QStringList::Iterator it = list.begin();
3738 while( it != list.end() ) {
3739 myProcessing( *it );
3740 ++it;
3741 }
3742 \endcode
3743
3744 Some regexps can match an indeterminate number of times. For
3745 example if the input string is "Offsets: 12 14 99 231 7" and the
3746 regexp, \c{rx}, is <b>(\\d+)+</b>, we would hope to get a list of
3747 all the numbers matched. However, after calling
3748 \c{rx.search(str)}, capturedTexts() will return the list ( "12",
3749 "12" ), i.e. the entire match was "12" and the first subexpression
3750 matched was "12". The correct approach is to use cap() in a \link
3751 #cap_in_a_loop loop \endlink.
3752
3753 The order of elements in the string list is as follows. The first
3754 element is the entire matching string. Each subsequent element
3755 corresponds to the next capturing open left parentheses. Thus
3756 capturedTexts()[1] is the text of the first capturing parentheses,
3757 capturedTexts()[2] is the text of the second and so on
3758 (corresponding to $1, $2, etc., in some other regexp languages).
3759
3760 \sa cap() pos() exactMatch() search() searchRev()
3761*/
3762QStringList QRegExp::capturedTexts()
3763{
3764 if ( priv->capturedCache.isEmpty() ) {
3765 for ( int i = 0; i < (int) priv->captured.size(); i += 2 ) {
3766 QString m;
3767 if ( priv->captured[i + 1] == 0 )
3768 m = QString::fromLatin1( "" );
3769 else if ( priv->captured[i] >= 0 )
3770 m = priv->t.mid( priv->captured[i],
3771 priv->captured[i + 1] );
3772 priv->capturedCache.append( m );
3773 }
3774 priv->t = QString::null;
3775 }
3776 return priv->capturedCache;
3777}
3778
3779/*!
3780 Returns the text captured by the \a nth subexpression. The entire
3781 match has index 0 and the parenthesized subexpressions have
3782 indices starting from 1 (excluding non-capturing parentheses).
3783
3784 \code
3785 QRegExp rxlen( "(\\d+)(?:\\s*)(cm|inch)" );
3786 int pos = rxlen.search( "Length: 189cm" );
3787 if ( pos > -1 ) {
3788 QString value = rxlen.cap( 1 ); // "189"
3789 QString unit = rxlen.cap( 2 ); // "cm"
3790 // ...
3791 }
3792 \endcode
3793
3794 The order of elements matched by cap() is as follows. The first
3795 element, cap(0), is the entire matching string. Each subsequent
3796 element corresponds to the next capturing open left parentheses.
3797 Thus cap(1) is the text of the first capturing parentheses, cap(2)
3798 is the text of the second, and so on.
3799
3800 \target cap_in_a_loop
3801 Some patterns may lead to a number of matches which cannot be
3802 determined in advance, for example:
3803
3804 \code
3805 QRegExp rx( "(\\d+)" );
3806 str = "Offsets: 12 14 99 231 7";
3807 QStringList list;
3808 pos = 0;
3809 while ( pos >= 0 ) {
3810 pos = rx.search( str, pos );
3811 if ( pos > -1 ) {
3812 list += rx.cap( 1 );
3813 pos += rx.matchedLength();
3814 }
3815 }
3816 // list contains "12", "14", "99", "231", "7"
3817 \endcode
3818
3819 \sa capturedTexts() pos() exactMatch() search() searchRev()
3820*/
3821QString QRegExp::cap( int nth )
3822{
3823 if ( nth < 0 || nth >= (int) priv->captured.size() / 2 )
3824 return QString::null;
3825 else
3826 return capturedTexts()[nth];
3827}
3828
3829/*!
3830 Returns the position of the \a nth captured text in the searched
3831 string. If \a nth is 0 (the default), pos() returns the position
3832 of the whole match.
3833
3834 Example:
3835 \code
3836 QRegExp rx( "/([a-z]+)/([a-z]+)" );
3837 rx.search( "Output /dev/null" ); // returns 7 (position of /dev/null)
3838 rx.pos( 0 ); // returns 7 (position of /dev/null)
3839 rx.pos( 1 ); // returns 8 (position of dev)
3840 rx.pos( 2 ); // returns 12 (position of null)
3841 \endcode
3842
3843 For zero-length matches, pos() always returns -1. (For example, if
3844 cap(4) would return an empty string, pos(4) returns -1.) This is
3845 due to an implementation tradeoff.
3846
3847 \sa capturedTexts() exactMatch() search() searchRev()
3848*/
3849int QRegExp::pos( int nth )
3850{
3851 if ( nth < 0 || nth >= (int) priv->captured.size() / 2 )
3852 return -1;
3853 else
3854 return priv->captured[2 * nth];
3855}
3856
3857/*!
3858 Returns a text string that explains why a regexp pattern is
3859 invalid the case being; otherwise returns "no error occurred".
3860
3861 \sa isValid()
3862*/
3863QString QRegExp::errorString()
3864{
3865 if ( isValid() ) {
3866 return QString( RXERR_OK );
3867 } else {
3868 return eng->errorString();
3869 }
3870}
3871#endif
3872
3873/*!
3874 Returns the string \a str with every regexp special character
3875 escaped with a backslash. The special characters are $, (, ), *, +,
3876 ., ?, [, \, ], ^, {, | and }.
3877
3878 Example:
3879 \code
3880 s1 = QRegExp::escape( "bingo" ); // s1 == "bingo"
3881 s2 = QRegExp::escape( "f(x)" ); // s2 == "f\\(x\\)"
3882 \endcode
3883
3884 This function is useful to construct regexp patterns dynamically:
3885
3886 \code
3887 QRegExp rx( "(" + QRegExp::escape(name) +
3888 "|" + QRegExp::escape(alias) + ")" );
3889 \endcode
3890*/
3891QString QRegExp::escape( const QString& str )
3892{
3893 static const char meta[] = "$()*+.?[\\]^{|}";
3894 QString quoted = str;
3895 int i = 0;
3896
3897 while ( i < (int) quoted.length() ) {
3898 if ( strchr(meta, quoted[i].latin1()) != 0 )
3899 quoted.insert( i++, "\\" );
3900 i++;
3901 }
3902 return quoted;
3903}
3904
3905void QRegExp::compile( bool caseSensitive )
3906{
3907 derefEngine( eng, priv->rxpattern );
3908#ifndef QT_NO_REGEXP_WILDCARD
3909 if ( priv->wc )
3910 priv->rxpattern = wc2rx( priv->pattern );
3911 else
3912#endif
3913 priv->rxpattern = priv->pattern.isNull() ? QString::fromLatin1( "" )
3914 : priv->pattern;
3915 eng = newEngine( priv->rxpattern, caseSensitive );
3916#ifndef QT_NO_REGEXP_CAPTURE
3917 priv->t = QString();
3918 priv->capturedCache.clear();
3919#endif
3920 priv->captured.detach();
3921 priv->captured.fill( -1, 2 + 2 * eng->numCaptures() );
3922}
3923
3924int QRegExp::caretIndex( int offset, CaretMode caretMode )
3925{
3926 if ( caretMode == CaretAtZero ) {
3927 return 0;
3928 } else if ( caretMode == CaretAtOffset ) {
3929 return offset;
3930 } else { // CaretWontMatch
3931 return -1;
3932 }
3933}
3934
3935#endif // QT_NO_REGEXP