author | kergoth <kergoth> | 2002-11-01 00:10:42 (UTC) |
---|---|---|
committer | kergoth <kergoth> | 2002-11-01 00:10:42 (UTC) |
commit | 5042e3cf0d3514552769e441f5aad590c8eaf967 (patch) (unidiff) | |
tree | 4a5ea45f3519d981a172ab5275bf38c6fa778dec /qmake/tools/qregexp.cpp | |
parent | 108c1c753e74e989cc13923086996791428c9af4 (diff) | |
download | opie-5042e3cf0d3514552769e441f5aad590c8eaf967.zip opie-5042e3cf0d3514552769e441f5aad590c8eaf967.tar.gz opie-5042e3cf0d3514552769e441f5aad590c8eaf967.tar.bz2 |
Adding qmake in preperation for new build system
-rw-r--r-- | qmake/tools/qregexp.cpp | 3935 |
1 files changed, 3935 insertions, 0 deletions
diff --git a/qmake/tools/qregexp.cpp b/qmake/tools/qregexp.cpp new file mode 100644 index 0000000..500efed --- a/dev/null +++ b/qmake/tools/qregexp.cpp | |||
@@ -0,0 +1,3935 @@ | |||
1 | /**************************************************************************** | ||
2 | ** $Id$ | ||
3 | ** | ||
4 | ** Implementation of QRegExp class | ||
5 | ** | ||
6 | ** Created : 950126 | ||
7 | ** | ||
8 | ** Copyright (C) 1992-2000 Trolltech AS. All rights reserved. | ||
9 | ** | ||
10 | ** This file is part of the tools module of the Qt GUI Toolkit. | ||
11 | ** | ||
12 | ** This file may be distributed under the terms of the Q Public License | ||
13 | ** as defined by Trolltech AS of Norway and appearing in the file | ||
14 | ** LICENSE.QPL included in the packaging of this file. | ||
15 | ** | ||
16 | ** This file may be distributed and/or modified under the terms of the | ||
17 | ** GNU General Public License version 2 as published by the Free Software | ||
18 | ** Foundation and appearing in the file LICENSE.GPL included in the | ||
19 | ** packaging of this file. | ||
20 | ** | ||
21 | ** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition | ||
22 | ** licenses may use this file in accordance with the Qt Commercial License | ||
23 | ** Agreement provided with the Software. | ||
24 | ** | ||
25 | ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE | ||
26 | ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. | ||
27 | ** | ||
28 | ** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for | ||
29 | ** information about Qt Commercial License Agreements. | ||
30 | ** See http://www.trolltech.com/qpl/ for QPL licensing information. | ||
31 | ** See http://www.trolltech.com/gpl/ for GPL licensing information. | ||
32 | ** | ||
33 | ** Contact info@trolltech.com if any conditions of this licensing are | ||
34 | ** not clear to you. | ||
35 | ** | ||
36 | **********************************************************************/ | ||
37 | |||
38 | #include "qregexp.h" | ||
39 | |||
40 | #ifndef QT_NO_REGEXP | ||
41 | |||
42 | #include "qmemarray.h" | ||
43 | #include "qbitarray.h" | ||
44 | #include "qcache.h" | ||
45 | #include "qcleanuphandler.h" | ||
46 | #include "qintdict.h" | ||
47 | #include "qmap.h" | ||
48 | #include "qptrvector.h" | ||
49 | #include "qstring.h" | ||
50 | #include "qtl.h" | ||
51 | |||
52 | #ifdef QT_THREAD_SUPPORT | ||
53 | #include "qmutexpool_p.h" | ||
54 | #endif // QT_THREAD_SUPPORT | ||
55 | |||
56 | #undef QT_TRANSLATE_NOOP | ||
57 | #define QT_TRANSLATE_NOOP( context, sourceText ) sourceText | ||
58 | |||
59 | #include <limits.h> | ||
60 | |||
61 | // error strings for the regexp parser | ||
62 | #define RXERR_OK QT_TRANSLATE_NOOP( "QRegExp", "no error occurred" ) | ||
63 | #define RXERR_DISABLED QT_TRANSLATE_NOOP( "QRegExp", "disabled feature used" ) | ||
64 | #define RXERR_CHARCLASS QT_TRANSLATE_NOOP( "QRegExp", "bad char class syntax" ) | ||
65 | #define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP( "QRegExp", "bad lookahead syntax" ) | ||
66 | #define RXERR_REPETITION QT_TRANSLATE_NOOP( "QRegExp", "bad repetition syntax" ) | ||
67 | #define RXERR_OCTAL QT_TRANSLATE_NOOP( "QRegExp", "invalid octal value" ) | ||
68 | #define RXERR_LEFTDELIM QT_TRANSLATE_NOOP( "QRegExp", "missing left delim" ) | ||
69 | #define RXERR_END QT_TRANSLATE_NOOP( "QRegExp", "unexpected end" ) | ||
70 | #define RXERR_LIMIT QT_TRANSLATE_NOOP( "QRegExp", "met internal limit" ) | ||
71 | |||
72 | /* | ||
73 | WARNING! Be sure to read qregexp.tex before modifying this file. | ||
74 | */ | ||
75 | |||
76 | /*! | ||
77 | \class QRegExp qregexp.h | ||
78 | \reentrant | ||
79 | \brief The QRegExp class provides pattern matching using regular expressions. | ||
80 | |||
81 | \ingroup tools | ||
82 | \ingroup misc | ||
83 | \ingroup shared | ||
84 | \mainclass | ||
85 | \keyword regular expression | ||
86 | |||
87 | Regular expressions, or "regexps", provide a way to find patterns | ||
88 | within text. This is useful in many contexts, for example: | ||
89 | |||
90 | \table | ||
91 | \row \i Validation | ||
92 | \i A regexp can be used to check whether a piece of text | ||
93 | meets some criteria, e.g. is an integer or contains no | ||
94 | whitespace. | ||
95 | \row \i Searching | ||
96 | \i Regexps provide a much more powerful means of searching | ||
97 | text than simple string matching does. For example we can | ||
98 | create a regexp which says "find one of the words 'mail', | ||
99 | 'letter' or 'correspondence' but not any of the words | ||
100 | 'email', 'mailman' 'mailer', 'letterbox' etc." | ||
101 | \row \i Search and Replace | ||
102 | \i A regexp can be used to replace a pattern with a piece of | ||
103 | text, for example replace all occurrences of '&' with | ||
104 | '\&' except where the '&' is already followed by 'amp;'. | ||
105 | \row \i String Splitting | ||
106 | \i A regexp can be used to identify where a string should be | ||
107 | split into its component fields, e.g. splitting tab-delimited | ||
108 | strings. | ||
109 | \endtable | ||
110 | |||
111 | We present a very brief introduction to regexps, a description of | ||
112 | Qt's regexp language, some code examples, and finally the function | ||
113 | documentation itself. QRegExp is modeled on Perl's regexp | ||
114 | language, and also fully supports Unicode. QRegExp can also be | ||
115 | used in the weaker 'wildcard' (globbing) mode which works in a | ||
116 | similar way to command shells. A good text on regexps is \e | ||
117 | {Mastering Regular Expressions: Powerful Techniques for Perl and | ||
118 | Other Tools} by Jeffrey E. Friedl, ISBN 1565922573. | ||
119 | |||
120 | Experienced regexp users may prefer to skip the introduction and | ||
121 | go directly to the relevant information. | ||
122 | |||
123 | \tableofcontents | ||
124 | |||
125 | \section1 Introduction | ||
126 | |||
127 | Regexps are built up from expressions, quantifiers, and assertions. | ||
128 | The simplest form of expression is simply a character, e.g. | ||
129 | <b>x</b> or <b>5</b>. An expression can also be a set of | ||
130 | characters. For example, <b>[ABCD]</b>, will match an <b>A</b> or | ||
131 | a <b>B</b> or a <b>C</b> or a <b>D</b>. As a shorthand we could | ||
132 | write this as <b>[A-D]</b>. If we want to match any of the | ||
133 | captital letters in the English alphabet we can write | ||
134 | <b>[A-Z]</b>. A quantifier tells the regexp engine how many | ||
135 | occurrences of the expression we want, e.g. <b>x{1,1}</b> means | ||
136 | match an <b>x</b> which occurs at least once and at most once. | ||
137 | We'll look at assertions and more complex expressions later. | ||
138 | |||
139 | Note that in general regexps cannot be used to check for balanced | ||
140 | brackets or tags. For example if you want to match an opening html | ||
141 | \c <b> and its closing \c </b> you can only use a regexp if you | ||
142 | know that these tags are not nested; the html fragment, \c{<b>bold | ||
143 | <b>bolder</b></b>} will not match as expected. If you know the | ||
144 | maximum level of nesting it is possible to create a regexp that | ||
145 | will match correctly, but for an unknown level of nesting, regexps | ||
146 | will fail. | ||
147 | |||
148 | We'll start by writing a regexp to match integers in the range 0 | ||
149 | to 99. We will require at least one digit so we will start with | ||
150 | <b>[0-9]{1,1}</b> which means match a digit exactly once. This | ||
151 | regexp alone will match integers in the range 0 to 9. To match one | ||
152 | or two digits we can increase the maximum number of occurrences so | ||
153 | the regexp becomes <b>[0-9]{1,2}</b> meaning match a digit at | ||
154 | least once and at most twice. However, this regexp as it stands | ||
155 | will not match correctly. This regexp will match one or two digits | ||
156 | \e within a string. To ensure that we match against the whole | ||
157 | string we must use the anchor assertions. We need <b>^</b> (caret) | ||
158 | which when it is the first character in the regexp means that the | ||
159 | regexp must match from the beginning of the string. And we also | ||
160 | need <b>$</b> (dollar) which when it is the last character in the | ||
161 | regexp means that the regexp must match until the end of the | ||
162 | string. So now our regexp is <b>^[0-9]{1,2}$</b>. Note that | ||
163 | assertions, such as <b>^</b> and <b>$</b>, do not match any | ||
164 | characters. | ||
165 | |||
166 | If you've seen regexps elsewhere they may have looked different from | ||
167 | the ones above. This is because some sets of characters and some | ||
168 | quantifiers are so common that they have special symbols to | ||
169 | represent them. <b>[0-9]</b> can be replaced with the symbol | ||
170 | <b>\d</b>. The quantifier to match exactly one occurrence, | ||
171 | <b>{1,1}</b>, can be replaced with the expression itself. This means | ||
172 | that <b>x{1,1}</b> is exactly the same as <b>x</b> alone. So our 0 | ||
173 | to 99 matcher could be written <b>^\d{1,2}$</b>. Another way of | ||
174 | writing it would be <b>^\d\d{0,1}$</b>, i.e. from the start of the | ||
175 | string match a digit followed by zero or one digits. In practice | ||
176 | most people would write it <b>^\d\d?$</b>. The <b>?</b> is a | ||
177 | shorthand for the quantifier <b>{0,1}</b>, i.e. a minimum of no | ||
178 | occurrences a maximum of one occurrence. This is used to make an | ||
179 | expression optional. The regexp <b>^\d\d?$</b> means "from the | ||
180 | beginning of the string match one digit followed by zero or one | ||
181 | digits and then the end of the string". | ||
182 | |||
183 | Our second example is matching the words 'mail', 'letter' or | ||
184 | 'correspondence' but without matching 'email', 'mailman', | ||
185 | 'mailer', 'letterbox' etc. We'll start by just matching 'mail'. In | ||
186 | full the regexp is, <b>m{1,1}a{1,1}i{1,1}l{1,1}</b>, but since | ||
187 | each expression itself is automatically quantified by <b>{1,1}</b> | ||
188 | we can simply write this as <b>mail</b>; an 'm' followed by an 'a' | ||
189 | followed by an 'i' followed by an 'l'. The symbol '|' (bar) is | ||
190 | used for \e alternation, so our regexp now becomes | ||
191 | <b>mail|letter|correspondence</b> which means match 'mail' \e or | ||
192 | 'letter' \e or 'correspondence'. Whilst this regexp will find the | ||
193 | words we want it will also find words we don't want such as | ||
194 | 'email'. We will start by putting our regexp in parentheses, | ||
195 | <b>(mail|letter|correspondence)</b>. Parentheses have two effects, | ||
196 | firstly they group expressions together and secondly they identify | ||
197 | parts of the regexp that we wish to \link #capturing-text capture | ||
198 | \endlink. Our regexp still matches any of the three words but now | ||
199 | they are grouped together as a unit. This is useful for building | ||
200 | up more complex regexps. It is also useful because it allows us to | ||
201 | examine which of the words actually matched. We need to use | ||
202 | another assertion, this time <b>\b</b> "word boundary": | ||
203 | <b>\b(mail|letter|correspondence)\b</b>. This regexp means "match | ||
204 | a word boundary followed by the expression in parentheses followed | ||
205 | by another word boundary". The <b>\b</b> assertion matches at a \e | ||
206 | position in the regexp not a \e character in the regexp. A word | ||
207 | boundary is any non-word character such as a space a newline or | ||
208 | the beginning or end of the string. | ||
209 | |||
210 | For our third example we want to replace ampersands with the HTML | ||
211 | entity '\&'. The regexp to match is simple: <b>\&</b>, i.e. | ||
212 | match one ampersand. Unfortunately this will mess up our text if | ||
213 | some of the ampersands have already been turned into HTML | ||
214 | entities. So what we really want to say is replace an ampersand | ||
215 | providing it is not followed by 'amp;'. For this we need the | ||
216 | negative lookahead assertion and our regexp becomes: | ||
217 | <b>\&(?!amp;)</b>. The negative lookahead assertion is introduced | ||
218 | with '(?!' and finishes at the ')'. It means that the text it | ||
219 | contains, 'amp;' in our example, must \e not follow the expression | ||
220 | that preceeds it. | ||
221 | |||
222 | Regexps provide a rich language that can be used in a variety of | ||
223 | ways. For example suppose we want to count all the occurrences of | ||
224 | 'Eric' and 'Eirik' in a string. Two valid regexps to match these | ||
225 | are <b>\\b(Eric|Eirik)\\b</b> and <b>\\bEi?ri[ck]\\b</b>. We need | ||
226 | the word boundary '\b' so we don't get 'Ericsson' etc. The second | ||
227 | regexp actually matches more than we want, 'Eric', 'Erik', 'Eiric' | ||
228 | and 'Eirik'. | ||
229 | |||
230 | We will implement some the examples above in the | ||
231 | \link #code-examples code examples \endlink section. | ||
232 | |||
233 | \target characters-and-abbreviations-for-sets-of-characters | ||
234 | \section1 Characters and Abbreviations for Sets of Characters | ||
235 | |||
236 | \table | ||
237 | \header \i Element \i Meaning | ||
238 | \row \i <b>c</b> | ||
239 | \i Any character represents itself unless it has a special | ||
240 | regexp meaning. Thus <b>c</b> matches the character \e c. | ||
241 | \row \i <b>\\c</b> | ||
242 | \i A character that follows a backslash matches the character | ||
243 | itself except where mentioned below. For example if you | ||
244 | wished to match a literal caret at the beginning of a string | ||
245 | you would write <b>\^</b>. | ||
246 | \row \i <b>\\a</b> | ||
247 | \i This matches the ASCII bell character (BEL, 0x07). | ||
248 | \row \i <b>\\f</b> | ||
249 | \i This matches the ASCII form feed character (FF, 0x0C). | ||
250 | \row \i <b>\\n</b> | ||
251 | \i This matches the ASCII line feed character (LF, 0x0A, Unix newline). | ||
252 | \row \i <b>\\r</b> | ||
253 | \i This matches the ASCII carriage return character (CR, 0x0D). | ||
254 | \row \i <b>\\t</b> | ||
255 | \i This matches the ASCII horizontal tab character (HT, 0x09). | ||
256 | \row \i <b>\\v</b> | ||
257 | \i This matches the ASCII vertical tab character (VT, 0x0B). | ||
258 | \row \i <b>\\xhhhh</b> | ||
259 | \i This matches the Unicode character corresponding to the | ||
260 | hexadecimal number hhhh (between 0x0000 and 0xFFFF). \0ooo | ||
261 | (i.e., \zero ooo) matches the ASCII/Latin-1 character | ||
262 | corresponding to the octal number ooo (between 0 and 0377). | ||
263 | \row \i <b>. (dot)</b> | ||
264 | \i This matches any character (including newline). | ||
265 | \row \i <b>\\d</b> | ||
266 | \i This matches a digit (see QChar::isDigit()). | ||
267 | \row \i <b>\\D</b> | ||
268 | \i This matches a non-digit. | ||
269 | \row \i <b>\\s</b> | ||
270 | \i This matches a whitespace (see QChar::isSpace()). | ||
271 | \row \i <b>\\S</b> | ||
272 | \i This matches a non-whitespace. | ||
273 | \row \i <b>\\w</b> | ||
274 | \i This matches a word character (see QChar::isLetterOrNumber()). | ||
275 | \row \i <b>\\W</b> | ||
276 | \i This matches a non-word character. | ||
277 | \row \i <b>\\n</b> | ||
278 | \i The n-th \link #capturing-text backreference \endlink, | ||
279 | e.g. \1, \2, etc. | ||
280 | \endtable | ||
281 | |||
282 | \e {Note that the C++ compiler transforms backslashes in strings | ||
283 | so to include a <b>\\</b> in a regexp you will need to enter it | ||
284 | twice, i.e. <b>\\\\</b>.} | ||
285 | |||
286 | \target sets-of-characters | ||
287 | \section1 Sets of Characters | ||
288 | |||
289 | Square brackets are used to match any character in the set of | ||
290 | characters contained within the square brackets. All the character | ||
291 | set abbreviations described above can be used within square | ||
292 | brackets. Apart from the character set abbreviations and the | ||
293 | following two exceptions no characters have special meanings in | ||
294 | square brackets. | ||
295 | |||
296 | \table | ||
297 | \row \i <b>^</b> | ||
298 | \i The caret negates the character set if it occurs as the | ||
299 | first character, i.e. immediately after the opening square | ||
300 | bracket. For example, <b>[abc]</b> matches 'a' or 'b' or 'c', | ||
301 | but <b>[^abc]</b> matches anything \e except 'a' or 'b' or | ||
302 | 'c'. | ||
303 | \row \i <b>-</b> | ||
304 | \i The dash is used to indicate a range of characters, for | ||
305 | example <b>[W-Z]</b> matches 'W' or 'X' or 'Y' or 'Z'. | ||
306 | \endtable | ||
307 | |||
308 | Using the predefined character set abbreviations is more portable | ||
309 | than using character ranges across platforms and languages. For | ||
310 | example, <b>[0-9]</b> matches a digit in Western alphabets but | ||
311 | <b>\d</b> matches a digit in \e any alphabet. | ||
312 | |||
313 | Note that in most regexp literature sets of characters are called | ||
314 | "character classes". | ||
315 | |||
316 | \target quantifiers | ||
317 | \section1 Quantifiers | ||
318 | |||
319 | By default an expression is automatically quantified by | ||
320 | <b>{1,1}</b>, i.e. it should occur exactly once. In the following | ||
321 | list <b>\e {E}</b> stands for any expression. An expression is a | ||
322 | character or an abbreviation for a set of characters or a set of | ||
323 | characters in square brackets or any parenthesised expression. | ||
324 | |||
325 | \table | ||
326 | \row \i <b>\e {E}?</b> | ||
327 | \i Matches zero or one occurrence of \e E. This quantifier | ||
328 | means "the previous expression is optional" since it will | ||
329 | match whether or not the expression occurs in the string. It | ||
330 | is the same as <b>\e {E}{0,1}</b>. For example <b>dents?</b> | ||
331 | will match 'dent' and 'dents'. | ||
332 | |||
333 | \row \i <b>\e {E}+</b> | ||
334 | \i Matches one or more occurrences of \e E. This is the same | ||
335 | as <b>\e {E}{1,MAXINT}</b>. For example, <b>0+</b> will match | ||
336 | '0', '00', '000', etc. | ||
337 | |||
338 | \row \i <b>\e {E}*</b> | ||
339 | \i Matches zero or more occurrences of \e E. This is the same | ||
340 | as <b>\e {E}{0,MAXINT}</b>. The <b>*</b> quantifier is often | ||
341 | used by a mistake. Since it matches \e zero or more | ||
342 | occurrences it will match no occurrences at all. For example | ||
343 | if we want to match strings that end in whitespace and use | ||
344 | the regexp <b>\s*$</b> we would get a match on every string. | ||
345 | This is because we have said find zero or more whitespace | ||
346 | followed by the end of string, so even strings that don't end | ||
347 | in whitespace will match. The regexp we want in this case is | ||
348 | <b>\s+$</b> to match strings that have at least one | ||
349 | whitespace at the end. | ||
350 | |||
351 | \row \i <b>\e {E}{n}</b> | ||
352 | \i Matches exactly \e n occurrences of the expression. This | ||
353 | is the same as repeating the expression \e n times. For | ||
354 | example, <b>x{5}</b> is the same as <b>xxxxx</b>. It is also | ||
355 | the same as <b>\e {E}{n,n}</b>, e.g. <b>x{5,5}</b>. | ||
356 | |||
357 | \row \i <b>\e {E}{n,}</b> | ||
358 | \i Matches at least \e n occurrences of the expression. This | ||
359 | is the same as <b>\e {E}{n,MAXINT}</b>. | ||
360 | |||
361 | \row \i <b>\e {E}{,m}</b> | ||
362 | \i Matches at most \e m occurrences of the expression. This | ||
363 | is the same as <b>\e {E}{0,m}</b>. | ||
364 | |||
365 | \row \i <b>\e {E}{n,m}</b> | ||
366 | \i Matches at least \e n occurrences of the expression and at | ||
367 | most \e m occurrences of the expression. | ||
368 | \endtable | ||
369 | |||
370 | (MAXINT is implementation dependent but will not be smaller than | ||
371 | 1024.) | ||
372 | |||
373 | If we wish to apply a quantifier to more than just the preceding | ||
374 | character we can use parentheses to group characters together in | ||
375 | an expression. For example, <b>tag+</b> matches a 't' followed by | ||
376 | an 'a' followed by at least one 'g', whereas <b>(tag)+</b> matches | ||
377 | at least one occurrence of 'tag'. | ||
378 | |||
379 | Note that quantifiers are "greedy". They will match as much text | ||
380 | as they can. For example, <b>0+</b> will match as many zeros as it | ||
381 | can from the first zero it finds, e.g. '2.<u>000</u>5'. | ||
382 | Quantifiers can be made non-greedy, see setMinimal(). | ||
383 | |||
384 | \target capturing-text | ||
385 | \section1 Capturing Text | ||
386 | |||
387 | Parentheses allow us to group elements together so that we can | ||
388 | quantify and capture them. For example if we have the expression | ||
389 | <b>mail|letter|correspondence</b> that matches a string we know | ||
390 | that \e one of the words matched but not which one. Using | ||
391 | parentheses allows us to "capture" whatever is matched within | ||
392 | their bounds, so if we used <b>(mail|letter|correspondence)</b> | ||
393 | and matched this regexp against the string "I sent you some email" | ||
394 | we can use the cap() or capturedTexts() functions to extract the | ||
395 | matched characters, in this case 'mail'. | ||
396 | |||
397 | We can use captured text within the regexp itself. To refer to the | ||
398 | captured text we use \e backreferences which are indexed from 1, | ||
399 | the same as for cap(). For example we could search for duplicate | ||
400 | words in a string using <b>\b(\w+)\W+\1\b</b> which means match a | ||
401 | word boundary followed by one or more word characters followed by | ||
402 | one or more non-word characters followed by the same text as the | ||
403 | first parenthesised expression followed by a word boundary. | ||
404 | |||
405 | If we want to use parentheses purely for grouping and not for | ||
406 | capturing we can use the non-capturing syntax, e.g. | ||
407 | <b>(?:green|blue)</b>. Non-capturing parentheses begin '(?:' and | ||
408 | end ')'. In this example we match either 'green' or 'blue' but we | ||
409 | do not capture the match so we only know whether or not we matched | ||
410 | but not which color we actually found. Using non-capturing | ||
411 | parentheses is more efficient than using capturing parentheses | ||
412 | since the regexp engine has to do less book-keeping. | ||
413 | |||
414 | Both capturing and non-capturing parentheses may be nested. | ||
415 | |||
416 | \target assertions | ||
417 | \section1 Assertions | ||
418 | |||
419 | Assertions make some statement about the text at the point where | ||
420 | they occur in the regexp but they do not match any characters. In | ||
421 | the following list <b>\e {E}</b> stands for any expression. | ||
422 | |||
423 | \table | ||
424 | \row \i <b>^</b> | ||
425 | \i The caret signifies the beginning of the string. If you | ||
426 | wish to match a literal \c{^} you must escape it by | ||
427 | writing \c{\\^}. For example, <b>^#include</b> will only | ||
428 | match strings which \e begin with the characters '#include'. | ||
429 | (When the caret is the first character of a character set it | ||
430 | has a special meaning, see \link #sets-of-characters Sets of | ||
431 | Characters \endlink.) | ||
432 | |||
433 | \row \i <b>$</b> | ||
434 | \i The dollar signifies the end of the string. For example | ||
435 | <b>\d\s*$</b> will match strings which end with a digit | ||
436 | optionally followed by whitespace. If you wish to match a | ||
437 | literal \c{$} you must escape it by writing | ||
438 | \c{\\$}. | ||
439 | |||
440 | \row \i <b>\\b</b> | ||
441 | \i A word boundary. For example the regexp | ||
442 | <b>\\bOK\\b</b> means match immediately after a word | ||
443 | boundary (e.g. start of string or whitespace) the letter 'O' | ||
444 | then the letter 'K' immediately before another word boundary | ||
445 | (e.g. end of string or whitespace). But note that the | ||
446 | assertion does not actually match any whitespace so if we | ||
447 | write <b>(\\bOK\\b)</b> and we have a match it will only | ||
448 | contain 'OK' even if the string is "Its <u>OK</u> now". | ||
449 | |||
450 | \row \i <b>\\B</b> | ||
451 | \i A non-word boundary. This assertion is true wherever | ||
452 | <b>\\b</b> is false. For example if we searched for | ||
453 | <b>\\Bon\\B</b> in "Left on" the match would fail (space | ||
454 | and end of string aren't non-word boundaries), but it would | ||
455 | match in "t<u>on</u>ne". | ||
456 | |||
457 | \row \i <b>(?=\e E)</b> | ||
458 | \i Positive lookahead. This assertion is true if the | ||
459 | expression matches at this point in the regexp. For example, | ||
460 | <b>const(?=\\s+char)</b> matches 'const' whenever it is | ||
461 | followed by 'char', as in 'static <u>const</u> char *'. | ||
462 | (Compare with <b>const\\s+char</b>, which matches 'static | ||
463 | <u>const char</u> *'.) | ||
464 | |||
465 | \row \i <b>(?!\e E)</b> | ||
466 | \i Negative lookahead. This assertion is true if the | ||
467 | expression does not match at this point in the regexp. For | ||
468 | example, <b>const(?!\\s+char)</b> matches 'const' \e except | ||
469 | when it is followed by 'char'. | ||
470 | \endtable | ||
471 | |||
472 | \target wildcard-matching | ||
473 | \section1 Wildcard Matching (globbing) | ||
474 | |||
475 | Most command shells such as \e bash or \e cmd.exe support "file | ||
476 | globbing", the ability to identify a group of files by using | ||
477 | wildcards. The setWildcard() function is used to switch between | ||
478 | regexp and wildcard mode. Wildcard matching is much simpler than | ||
479 | full regexps and has only four features: | ||
480 | |||
481 | \table | ||
482 | \row \i <b>c</b> | ||
483 | \i Any character represents itself apart from those mentioned | ||
484 | below. Thus <b>c</b> matches the character \e c. | ||
485 | \row \i <b>?</b> | ||
486 | \i This matches any single character. It is the same as | ||
487 | <b>.</b> in full regexps. | ||
488 | \row \i <b>*</b> | ||
489 | \i This matches zero or more of any characters. It is the | ||
490 | same as <b>.*</b> in full regexps. | ||
491 | \row \i <b>[...]</b> | ||
492 | \i Sets of characters can be represented in square brackets, | ||
493 | similar to full regexps. Within the character class, like | ||
494 | outside, backslash has no special meaning. | ||
495 | \endtable | ||
496 | |||
497 | For example if we are in wildcard mode and have strings which | ||
498 | contain filenames we could identify HTML files with <b>*.html</b>. | ||
499 | This will match zero or more characters followed by a dot followed | ||
500 | by 'h', 't', 'm' and 'l'. | ||
501 | |||
502 | \target perl-users | ||
503 | \section1 Notes for Perl Users | ||
504 | |||
505 | Most of the character class abbreviations supported by Perl are | ||
506 | supported by QRegExp, see \link | ||
507 | #characters-and-abbreviations-for-sets-of-characters characters | ||
508 | and abbreviations for sets of characters \endlink. | ||
509 | |||
510 | In QRegExp, apart from within character classes, \c{^} always | ||
511 | signifies the start of the string, so carets must always be | ||
512 | escaped unless used for that purpose. In Perl the meaning of caret | ||
513 | varies automagically depending on where it occurs so escaping it | ||
514 | is rarely necessary. The same applies to \c{$} which in | ||
515 | QRegExp always signifies the end of the string. | ||
516 | |||
517 | QRegExp's quantifiers are the same as Perl's greedy quantifiers. | ||
518 | Non-greedy matching cannot be applied to individual quantifiers, | ||
519 | but can be applied to all the quantifiers in the pattern. For | ||
520 | example, to match the Perl regexp <b>ro+?m</b> requires: | ||
521 | \code | ||
522 | QRegExp rx( "ro+m" ); | ||
523 | rx.setMinimal( TRUE ); | ||
524 | \endcode | ||
525 | |||
526 | The equivalent of Perl's \c{/i} option is | ||
527 | setCaseSensitive(FALSE). | ||
528 | |||
529 | Perl's \c{/g} option can be emulated using a \link | ||
530 | #cap_in_a_loop loop \endlink. | ||
531 | |||
532 | In QRegExp <b>.</b> matches any character, therefore all QRegExp | ||
533 | regexps have the equivalent of Perl's \c{/s} option. QRegExp | ||
534 | does not have an equivalent to Perl's \c{/m} option, but this | ||
535 | can be emulated in various ways for example by splitting the input | ||
536 | into lines or by looping with a regexp that searches for newlines. | ||
537 | |||
538 | Because QRegExp is string oriented there are no \A, \Z or \z | ||
539 | assertions. The \G assertion is not supported but can be emulated | ||
540 | in a loop. | ||
541 | |||
542 | Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp | ||
543 | equivalents for $`, $' or $+. Perl's capturing variables, $1, $2, | ||
544 | ... correspond to cap(1) or capturedTexts()[1], cap(2) or | ||
545 | capturedTexts()[2], etc. | ||
546 | |||
547 | To substitute a pattern use QString::replace(). | ||
548 | |||
549 | Perl's extended \c{/x} syntax is not supported, nor are | ||
550 | regexp comments (?#comment) or directives, e.g. (?i). | ||
551 | |||
552 | Both zero-width positive and zero-width negative lookahead | ||
553 | assertions (?=pattern) and (?!pattern) are supported with the same | ||
554 | syntax as Perl. Perl's lookbehind assertions, "independent" | ||
555 | subexpressions and conditional expressions are not supported. | ||
556 | |||
557 | Non-capturing parentheses are also supported, with the same | ||
558 | (?:pattern) syntax. | ||
559 | |||
560 | See QStringList::split() and QStringList::join() for equivalents | ||
561 | to Perl's split and join functions. | ||
562 | |||
563 | Note: because C++ transforms \\'s they must be written \e twice in | ||
564 | code, e.g. <b>\\b</b> must be written <b>\\\\b</b>. | ||
565 | |||
566 | \target code-examples | ||
567 | \section1 Code Examples | ||
568 | |||
569 | \code | ||
570 | QRegExp rx( "^\\d\\d?$" ); // match integers 0 to 99 | ||
571 | rx.search( "123" ); // returns -1 (no match) | ||
572 | rx.search( "-6" ); // returns -1 (no match) | ||
573 | rx.search( "6" ); // returns 0 (matched as position 0) | ||
574 | \endcode | ||
575 | |||
576 | The third string matches '<u>6</u>'. This is a simple validation | ||
577 | regexp for integers in the range 0 to 99. | ||
578 | |||
579 | \code | ||
580 | QRegExp rx( "^\\S+$" ); // match strings without whitespace | ||
581 | rx.search( "Hello world" ); // returns -1 (no match) | ||
582 | rx.search( "This_is-OK" ); // returns 0 (matched at position 0) | ||
583 | \endcode | ||
584 | |||
585 | The second string matches '<u>This_is-OK</u>'. We've used the | ||
586 | character set abbreviation '\S' (non-whitespace) and the anchors | ||
587 | to match strings which contain no whitespace. | ||
588 | |||
589 | In the following example we match strings containing 'mail' or | ||
590 | 'letter' or 'correspondence' but only match whole words i.e. not | ||
591 | 'email' | ||
592 | |||
593 | \code | ||
594 | QRegExp rx( "\\b(mail|letter|correspondence)\\b" ); | ||
595 | rx.search( "I sent you an email" ); // returns -1 (no match) | ||
596 | rx.search( "Please write the letter" ); // returns 17 | ||
597 | \endcode | ||
598 | |||
599 | The second string matches "Please write the <u>letter</u>". The | ||
600 | word 'letter' is also captured (because of the parentheses). We | ||
601 | can see what text we've captured like this: | ||
602 | |||
603 | \code | ||
604 | QString captured = rx.cap( 1 ); // captured == "letter" | ||
605 | \endcode | ||
606 | |||
607 | This will capture the text from the first set of capturing | ||
608 | parentheses (counting capturing left parentheses from left to | ||
609 | right). The parentheses are counted from 1 since cap( 0 ) is the | ||
610 | whole matched regexp (equivalent to '&' in most regexp engines). | ||
611 | |||
612 | \code | ||
613 | QRegExp rx( "&(?!amp;)" ); // match ampersands but not & | ||
614 | QString line1 = "This & that"; | ||
615 | line1.replace( rx, "&" ); | ||
616 | // line1 == "This & that" | ||
617 | QString line2 = "His & hers & theirs"; | ||
618 | line2.replace( rx, "&" ); | ||
619 | // line2 == "His & hers & theirs" | ||
620 | \endcode | ||
621 | |||
622 | Here we've passed the QRegExp to QString's replace() function to | ||
623 | replace the matched text with new text. | ||
624 | |||
625 | \code | ||
626 | QString str = "One Eric another Eirik, and an Ericsson." | ||
627 | " How many Eiriks, Eric?"; | ||
628 | QRegExp rx( "\\b(Eric|Eirik)\\b" ); // match Eric or Eirik | ||
629 | int pos = 0; // where we are in the string | ||
630 | int count = 0; // how many Eric and Eirik's we've counted | ||
631 | while ( pos >= 0 ) { | ||
632 | pos = rx.search( str, pos ); | ||
633 | if ( pos >= 0 ) { | ||
634 | pos++; // move along in str | ||
635 | count++; // count our Eric or Eirik | ||
636 | } | ||
637 | } | ||
638 | \endcode | ||
639 | |||
640 | We've used the search() function to repeatedly match the regexp in | ||
641 | the string. Note that instead of moving forward by one character | ||
642 | at a time \c pos++ we could have written \c {pos += | ||
643 | rx.matchedLength()} to skip over the already matched string. The | ||
644 | count will equal 3, matching 'One <u>Eric</u> another | ||
645 | <u>Eirik</u>, and an Ericsson. How many Eiriks, <u>Eric</u>?'; it | ||
646 | doesn't match 'Ericsson' or 'Eiriks' because they are not bounded | ||
647 | by non-word boundaries. | ||
648 | |||
649 | One common use of regexps is to split lines of delimited data into | ||
650 | their component fields. | ||
651 | |||
652 | \code | ||
653 | str = "Trolltech AS\twww.trolltech.com\tNorway"; | ||
654 | QString company, web, country; | ||
655 | rx.setPattern( "^([^\t]+)\t([^\t]+)\t([^\t]+)$" ); | ||
656 | if ( rx.search( str ) != -1 ) { | ||
657 | company = rx.cap( 1 ); | ||
658 | web = rx.cap( 2 ); | ||
659 | country = rx.cap( 3 ); | ||
660 | } | ||
661 | \endcode | ||
662 | |||
663 | In this example our input lines have the format company name, web | ||
664 | address and country. Unfortunately the regexp is rather long and | ||
665 | not very versatile -- the code will break if we add any more | ||
666 | fields. A simpler and better solution is to look for the | ||
667 | separator, '\t' in this case, and take the surrounding text. The | ||
668 | QStringList split() function can take a separator string or regexp | ||
669 | as an argument and split a string accordingly. | ||
670 | |||
671 | \code | ||
672 | QStringList field = QStringList::split( "\t", str ); | ||
673 | \endcode | ||
674 | |||
675 | Here field[0] is the company, field[1] the web address and so on. | ||
676 | |||
677 | To imitate the matching of a shell we can use wildcard mode. | ||
678 | |||
679 | \code | ||
680 | QRegExp rx( "*.html" ); // invalid regexp: * doesn't quantify anything | ||
681 | rx.setWildcard( TRUE ); // now it's a valid wildcard regexp | ||
682 | rx.search( "index.html" ); // returns 0 (matched at position 0) | ||
683 | rx.search( "default.htm" ); // returns -1 (no match) | ||
684 | rx.search( "readme.txt" ); // returns -1 (no match) | ||
685 | \endcode | ||
686 | |||
687 | Wildcard matching can be convenient because of its simplicity, but | ||
688 | any wildcard regexp can be defined using full regexps, e.g. | ||
689 | <b>.*\.html$</b>. Notice that we can't match both \c .html and \c | ||
690 | .htm files with a wildcard unless we use <b>*.htm*</b> which will | ||
691 | also match 'test.html.bak'. A full regexp gives us the precision | ||
692 | we need, <b>.*\\.html?$</b>. | ||
693 | |||
694 | QRegExp can match case insensitively using setCaseSensitive(), and | ||
695 | can use non-greedy matching, see setMinimal(). By default QRegExp | ||
696 | uses full regexps but this can be changed with setWildcard(). | ||
697 | Searching can be forward with search() or backward with | ||
698 | searchRev(). Captured text can be accessed using capturedTexts() | ||
699 | which returns a string list of all captured strings, or using | ||
700 | cap() which returns the captured string for the given index. The | ||
701 | pos() function takes a match index and returns the position in the | ||
702 | string where the match was made (or -1 if there was no match). | ||
703 | |||
704 | \sa QRegExpValidator QString QStringList | ||
705 | |||
706 | \target member-function-documentation | ||
707 | */ | ||
708 | |||
709 | const int NumBadChars = 64; | ||
710 | #define BadChar( ch ) ( (ch).unicode() % NumBadChars ) | ||
711 | |||
712 | const int NoOccurrence = INT_MAX; | ||
713 | const int EmptyCapture = INT_MAX; | ||
714 | const int InftyLen = INT_MAX; | ||
715 | const int InftyRep = 1025; | ||
716 | const int EOS = -1; | ||
717 | |||
718 | /* | ||
719 | Merges two QMemArrays of ints and puts the result into the first one. | ||
720 | */ | ||
721 | static void mergeInto( QMemArray<int> *a, const QMemArray<int>& b ) | ||
722 | { | ||
723 | int asize = a->size(); | ||
724 | int bsize = b.size(); | ||
725 | if ( asize == 0 ) { | ||
726 | *a = b.copy(); | ||
727 | #ifndef QT_NO_REGEXP_OPTIM | ||
728 | } else if ( bsize == 1 && (*a)[asize - 1] < b[0] ) { | ||
729 | a->resize( asize + 1 ); | ||
730 | (*a)[asize] = b[0]; | ||
731 | #endif | ||
732 | } else if ( bsize >= 1 ) { | ||
733 | int csize = asize + bsize; | ||
734 | QMemArray<int> c( csize ); | ||
735 | int i = 0, j = 0, k = 0; | ||
736 | while ( i < asize ) { | ||
737 | if ( j < bsize ) { | ||
738 | if ( (*a)[i] == b[j] ) { | ||
739 | i++; | ||
740 | csize--; | ||
741 | } else if ( (*a)[i] < b[j] ) { | ||
742 | c[k++] = (*a)[i++]; | ||
743 | } else { | ||
744 | c[k++] = b[j++]; | ||
745 | } | ||
746 | } else { | ||
747 | memcpy( c.data() + k, (*a).data() + i, | ||
748 | (asize - i) * sizeof(int) ); | ||
749 | break; | ||
750 | } | ||
751 | } | ||
752 | c.resize( csize ); | ||
753 | if ( j < bsize ) | ||
754 | memcpy( c.data() + k, b.data() + j, (bsize - j) * sizeof(int) ); | ||
755 | *a = c; | ||
756 | } | ||
757 | } | ||
758 | |||
759 | /* | ||
760 | Merges two disjoint QMaps of (int, int) pairs and puts the result into the | ||
761 | first one. | ||
762 | */ | ||
763 | static void mergeInto( QMap<int, int> *a, const QMap<int, int>& b ) | ||
764 | { | ||
765 | QMap<int, int>::ConstIterator it; | ||
766 | for ( it = b.begin(); it != b.end(); ++it ) | ||
767 | a->insert( it.key(), *it ); | ||
768 | } | ||
769 | |||
770 | /* | ||
771 | Returns the value associated to key k in QMap m of (int, int) pairs, or 0 if | ||
772 | no such value is explicitly present. | ||
773 | */ | ||
774 | static int at( const QMap<int, int>& m, int k ) | ||
775 | { | ||
776 | QMap<int, int>::ConstIterator it = m.find( k ); | ||
777 | if ( it == m.end() ) | ||
778 | return 0; | ||
779 | else | ||
780 | return *it; | ||
781 | } | ||
782 | |||
783 | #ifndef QT_NO_REGEXP_WILDCARD | ||
784 | /* | ||
785 | Translates a wildcard pattern to an equivalent regular expression pattern | ||
786 | (e.g., *.cpp to .*\.cpp). | ||
787 | */ | ||
788 | static QString wc2rx( const QString& wc_str ) | ||
789 | { | ||
790 | int wclen = wc_str.length(); | ||
791 | QString rx = QString::fromLatin1( "" ); | ||
792 | int i = 0; | ||
793 | const QChar *wc = wc_str.unicode(); | ||
794 | while ( i < wclen ) { | ||
795 | QChar c = wc[i++]; | ||
796 | switch ( c.unicode() ) { | ||
797 | case '*': | ||
798 | rx += QString::fromLatin1( ".*" ); | ||
799 | break; | ||
800 | case '?': | ||
801 | rx += QChar( '.' ); | ||
802 | break; | ||
803 | case '$': | ||
804 | case '(': | ||
805 | case ')': | ||
806 | case '+': | ||
807 | case '.': | ||
808 | case '\\': | ||
809 | case '^': | ||
810 | case '{': | ||
811 | case '|': | ||
812 | case '}': | ||
813 | rx += QChar( '\\' ); | ||
814 | rx += c; | ||
815 | break; | ||
816 | case '[': | ||
817 | rx += c; | ||
818 | if ( wc[i] == QChar('^') ) | ||
819 | rx += wc[i++]; | ||
820 | if ( i < wclen ) { | ||
821 | if ( rx[i] == ']' ) | ||
822 | rx += wc[i++]; | ||
823 | while ( i < wclen && wc[i] != QChar(']') ) { | ||
824 | if ( wc[i] == '\\' ) | ||
825 | rx += QChar( '\\' ); | ||
826 | rx += wc[i++]; | ||
827 | } | ||
828 | } | ||
829 | break; | ||
830 | default: | ||
831 | rx += c; | ||
832 | } | ||
833 | } | ||
834 | return rx; | ||
835 | } | ||
836 | #endif | ||
837 | |||
838 | /* | ||
839 | The class QRegExpEngine encapsulates a modified nondeterministic | ||
840 | finite automaton (NFA). | ||
841 | */ | ||
842 | class QRegExpEngine : public QShared | ||
843 | { | ||
844 | public: | ||
845 | #ifndef QT_NO_REGEXP_CCLASS | ||
846 | /* | ||
847 | The class CharClass represents a set of characters, such as can | ||
848 | be found in regular expressions (e.g., [a-z] denotes the set | ||
849 | {a, b, ..., z}). | ||
850 | */ | ||
851 | class CharClass | ||
852 | { | ||
853 | public: | ||
854 | CharClass(); | ||
855 | CharClass( const CharClass& cc ) { operator=( cc ); } | ||
856 | |||
857 | CharClass& operator=( const CharClass& cc ); | ||
858 | |||
859 | void clear(); | ||
860 | bool negative() const { return n; } | ||
861 | void setNegative( bool negative ); | ||
862 | void addCategories( int cats ); | ||
863 | void addRange( ushort from, ushort to ); | ||
864 | void addSingleton( ushort ch ) { addRange( ch, ch ); } | ||
865 | |||
866 | bool in( QChar ch ) const; | ||
867 | #ifndef QT_NO_REGEXP_OPTIM | ||
868 | const QMemArray<int>& firstOccurrence() const { return occ1; } | ||
869 | #endif | ||
870 | |||
871 | #if defined(QT_DEBUG) | ||
872 | void dump() const; | ||
873 | #endif | ||
874 | |||
875 | private: | ||
876 | /* | ||
877 | The struct Range represents a range of characters (e.g., | ||
878 | [0-9] denotes range 48 to 57). | ||
879 | */ | ||
880 | struct Range | ||
881 | { | ||
882 | ushort from; // 48 | ||
883 | ushort to; // 57 | ||
884 | }; | ||
885 | |||
886 | int c; // character classes | ||
887 | QMemArray<Range> r; // character ranges | ||
888 | bool n; // negative? | ||
889 | #ifndef QT_NO_REGEXP_OPTIM | ||
890 | QMemArray<int> occ1; // first-occurrence array | ||
891 | #endif | ||
892 | }; | ||
893 | #else | ||
894 | struct CharClass | ||
895 | { | ||
896 | int dummy; | ||
897 | |||
898 | #ifndef QT_NO_REGEXP_OPTIM | ||
899 | CharClass() { occ1.fill( 0, NumBadChars ); } | ||
900 | |||
901 | const QMemArray<int>& firstOccurrence() const { return occ1; } | ||
902 | QMemArray<int> occ1; | ||
903 | #endif | ||
904 | }; | ||
905 | #endif | ||
906 | |||
907 | QRegExpEngine( bool caseSensitive ) { setup( caseSensitive ); } | ||
908 | QRegExpEngine( const QString& rx, bool caseSensitive ); | ||
909 | #ifndef QT_NO_REGEXP_OPTIM | ||
910 | ~QRegExpEngine(); | ||
911 | #endif | ||
912 | |||
913 | bool isValid() const { return valid; } | ||
914 | const QString& errorString() const { return yyError; } | ||
915 | bool caseSensitive() const { return cs; } | ||
916 | int numCaptures() const { return officialncap; } | ||
917 | QMemArray<int> match( const QString& str, int pos, bool minimal, | ||
918 | bool oneTest, int caretIndex ); | ||
919 | int matchedLength() const { return mmMatchedLen; } | ||
920 | |||
921 | int createState( QChar ch ); | ||
922 | int createState( const CharClass& cc ); | ||
923 | #ifndef QT_NO_REGEXP_BACKREF | ||
924 | int createState( int bref ); | ||
925 | #endif | ||
926 | |||
927 | void addCatTransitions( const QMemArray<int>& from, | ||
928 | const QMemArray<int>& to ); | ||
929 | #ifndef QT_NO_REGEXP_CAPTURE | ||
930 | void addPlusTransitions( const QMemArray<int>& from, | ||
931 | const QMemArray<int>& to, int atom ); | ||
932 | #endif | ||
933 | |||
934 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | ||
935 | int anchorAlternation( int a, int b ); | ||
936 | int anchorConcatenation( int a, int b ); | ||
937 | #else | ||
938 | int anchorAlternation( int a, int b ) { return a & b; } | ||
939 | int anchorConcatenation( int a, int b ) { return a | b; } | ||
940 | #endif | ||
941 | void addAnchors( int from, int to, int a ); | ||
942 | |||
943 | #ifndef QT_NO_REGEXP_OPTIM | ||
944 | void setupGoodStringHeuristic( int earlyStart, int lateStart, | ||
945 | const QString& str ); | ||
946 | void setupBadCharHeuristic( int minLen, const QMemArray<int>& firstOcc ); | ||
947 | void heuristicallyChooseHeuristic(); | ||
948 | #endif | ||
949 | |||
950 | #if defined(QT_DEBUG) | ||
951 | void dump() const; | ||
952 | #endif | ||
953 | |||
954 | private: | ||
955 | enum { CharClassBit = 0x10000, BackRefBit = 0x20000 }; | ||
956 | |||
957 | /* | ||
958 | The struct State represents one state in a modified NFA. The | ||
959 | input characters matched are stored in the state instead of on | ||
960 | the transitions, something possible for an automaton | ||
961 | constructed from a regular expression. | ||
962 | */ | ||
963 | struct State | ||
964 | { | ||
965 | #ifndef QT_NO_REGEXP_CAPTURE | ||
966 | int atom; // which atom does this state belong to? | ||
967 | #endif | ||
968 | int match; // what does it match? (see CharClassBit and BackRefBit) | ||
969 | QMemArray<int> outs; // out-transitions | ||
970 | QMap<int, int> *reenter; // atoms reentered when transiting out | ||
971 | QMap<int, int> *anchors; // anchors met when transiting out | ||
972 | |||
973 | #ifndef QT_NO_REGEXP_CAPTURE | ||
974 | State( int a, int m ) | ||
975 | : atom( a ), match( m ), reenter( 0 ), anchors( 0 ) { } | ||
976 | #else | ||
977 | State( int m ) | ||
978 | : match( m ), reenter( 0 ), anchors( 0 ) { } | ||
979 | #endif | ||
980 | ~State() { delete reenter; delete anchors; } | ||
981 | }; | ||
982 | |||
983 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
984 | /* | ||
985 | The struct Lookahead represents a lookahead a la Perl (e.g., (?=foo) and | ||
986 | (?!bar)). | ||
987 | */ | ||
988 | struct Lookahead | ||
989 | { | ||
990 | QRegExpEngine *eng; // NFA representing the embedded regular expression | ||
991 | bool neg; // negative lookahead? | ||
992 | |||
993 | Lookahead( QRegExpEngine *eng0, bool neg0 ) | ||
994 | : eng( eng0 ), neg( neg0 ) { } | ||
995 | ~Lookahead() { delete eng; } | ||
996 | }; | ||
997 | #endif | ||
998 | |||
999 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1000 | /* | ||
1001 | The struct Atom represents one node in the hierarchy of regular | ||
1002 | expression atoms. | ||
1003 | */ | ||
1004 | struct Atom | ||
1005 | { | ||
1006 | int parent; // index of parent in array of atoms | ||
1007 | int capture; // index of capture, from 1 to ncap | ||
1008 | }; | ||
1009 | #endif | ||
1010 | |||
1011 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | ||
1012 | /* | ||
1013 | The struct AnchorAlternation represents a pair of anchors with | ||
1014 | OR semantics. | ||
1015 | */ | ||
1016 | struct AnchorAlternation | ||
1017 | { | ||
1018 | int a; // this anchor... | ||
1019 | int b; // ...or this one | ||
1020 | }; | ||
1021 | #endif | ||
1022 | |||
1023 | enum { InitialState = 0, FinalState = 1 }; | ||
1024 | void setup( bool caseSensitive ); | ||
1025 | int setupState( int match ); | ||
1026 | |||
1027 | /* | ||
1028 | Let's hope that 13 lookaheads and 14 back-references are | ||
1029 | enough. | ||
1030 | */ | ||
1031 | enum { MaxLookaheads = 13, MaxBackRefs = 14 }; | ||
1032 | enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, | ||
1033 | Anchor_Word = 0x00000004, Anchor_NonWord = 0x00000008, | ||
1034 | Anchor_FirstLookahead = 0x00000010, | ||
1035 | Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads, | ||
1036 | Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1, | ||
1037 | Anchor_Alternation = Anchor_BackRef1Empty << MaxBackRefs, | ||
1038 | |||
1039 | Anchor_LookaheadMask = ( Anchor_FirstLookahead - 1 ) ^ | ||
1040 | ( (Anchor_FirstLookahead << MaxLookaheads) - 1 ) }; | ||
1041 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1042 | int startAtom( bool capture ); | ||
1043 | void finishAtom( int atom ) { cf = f[atom].parent; } | ||
1044 | #endif | ||
1045 | |||
1046 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
1047 | int addLookahead( QRegExpEngine *eng, bool negative ); | ||
1048 | #endif | ||
1049 | |||
1050 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1051 | bool isBetterCapture( const int *begin1, const int *end1, const int *begin2, | ||
1052 | const int *end2 ); | ||
1053 | #endif | ||
1054 | bool testAnchor( int i, int a, const int *capBegin ); | ||
1055 | |||
1056 | #ifndef QT_NO_REGEXP_OPTIM | ||
1057 | bool goodStringMatch(); | ||
1058 | bool badCharMatch(); | ||
1059 | #else | ||
1060 | bool bruteMatch(); | ||
1061 | #endif | ||
1062 | bool matchHere(); | ||
1063 | |||
1064 | QPtrVector<State> s; // array of states | ||
1065 | int ns; // number of states | ||
1066 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1067 | QMemArray<Atom> f; // atom hierarchy | ||
1068 | int nf; // number of atoms | ||
1069 | int cf; // current atom | ||
1070 | #endif | ||
1071 | int officialncap; // number of captures, seen from the outside | ||
1072 | int ncap; // number of captures, seen from the inside | ||
1073 | #ifndef QT_NO_REGEXP_CCLASS | ||
1074 | QPtrVector<CharClass> cl; // array of character classes | ||
1075 | #endif | ||
1076 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
1077 | QPtrVector<Lookahead> ahead; // array of lookaheads | ||
1078 | #endif | ||
1079 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | ||
1080 | QMemArray<AnchorAlternation> aa; // array of (a, b) pairs of anchors | ||
1081 | #endif | ||
1082 | #ifndef QT_NO_REGEXP_OPTIM | ||
1083 | bool caretAnchored; // does the regexp start with ^? | ||
1084 | #endif | ||
1085 | bool valid; // is the regular expression valid? | ||
1086 | bool cs; // case sensitive? | ||
1087 | #ifndef QT_NO_REGEXP_BACKREF | ||
1088 | int nbrefs; // number of back-references | ||
1089 | #endif | ||
1090 | |||
1091 | #ifndef QT_NO_REGEXP_OPTIM | ||
1092 | bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch | ||
1093 | |||
1094 | int goodEarlyStart; // the index where goodStr can first occur in a match | ||
1095 | int goodLateStart; // the index where goodStr can last occur in a match | ||
1096 | QString goodStr; // the string that any match has to contain | ||
1097 | |||
1098 | int minl; // the minimum length of a match | ||
1099 | QMemArray<int> occ1; // first-occurrence array | ||
1100 | #endif | ||
1101 | |||
1102 | /* | ||
1103 | The class Box is an abstraction for a regular expression | ||
1104 | fragment. It can also be seen as one node in the syntax tree of | ||
1105 | a regular expression with synthetized attributes. | ||
1106 | |||
1107 | It's interface is ugly for performance reasons. | ||
1108 | */ | ||
1109 | class Box | ||
1110 | { | ||
1111 | public: | ||
1112 | Box( QRegExpEngine *engine ); | ||
1113 | Box( const Box& b ) { operator=( b ); } | ||
1114 | |||
1115 | Box& operator=( const Box& b ); | ||
1116 | |||
1117 | void clear() { operator=(Box(eng)); } | ||
1118 | void set( QChar ch ); | ||
1119 | void set( const CharClass& cc ); | ||
1120 | #ifndef QT_NO_REGEXP_BACKREF | ||
1121 | void set( int bref ); | ||
1122 | #endif | ||
1123 | |||
1124 | void cat( const Box& b ); | ||
1125 | void orx( const Box& b ); | ||
1126 | void plus( int atom ); | ||
1127 | void opt(); | ||
1128 | void catAnchor( int a ); | ||
1129 | #ifndef QT_NO_REGEXP_OPTIM | ||
1130 | void setupHeuristics(); | ||
1131 | #endif | ||
1132 | |||
1133 | #if defined(QT_DEBUG) | ||
1134 | void dump() const; | ||
1135 | #endif | ||
1136 | |||
1137 | private: | ||
1138 | void addAnchorsToEngine( const Box& to ) const; | ||
1139 | |||
1140 | QRegExpEngine *eng; // the automaton under construction | ||
1141 | QMemArray<int> ls; // the left states (firstpos) | ||
1142 | QMemArray<int> rs; // the right states (lastpos) | ||
1143 | QMap<int, int> lanchors; // the left anchors | ||
1144 | QMap<int, int> ranchors; // the right anchors | ||
1145 | int skipanchors; // the anchors to match if the box is skipped | ||
1146 | |||
1147 | #ifndef QT_NO_REGEXP_OPTIM | ||
1148 | int earlyStart; // the index where str can first occur | ||
1149 | int lateStart; // the index where str can last occur | ||
1150 | QString str; // a string that has to occur in any match | ||
1151 | QString leftStr; // a string occurring at the left of this box | ||
1152 | QString rightStr; // a string occurring at the right of this box | ||
1153 | int maxl; // the maximum length of this box (possibly InftyLen) | ||
1154 | #endif | ||
1155 | |||
1156 | int minl; // the minimum length of this box | ||
1157 | #ifndef QT_NO_REGEXP_OPTIM | ||
1158 | QMemArray<int> occ1; // first-occurrence array | ||
1159 | #endif | ||
1160 | }; | ||
1161 | friend class Box; | ||
1162 | |||
1163 | /* | ||
1164 | This is the lexical analyzer for regular expressions. | ||
1165 | */ | ||
1166 | enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, | ||
1167 | Tok_PosLookahead, Tok_NegLookahead, Tok_RightParen, Tok_CharClass, | ||
1168 | Tok_Caret, Tok_Quantifier, Tok_Bar, Tok_Word, Tok_NonWord, | ||
1169 | Tok_Char = 0x10000, Tok_BackRef = 0x20000 }; | ||
1170 | int getChar(); | ||
1171 | int getEscape(); | ||
1172 | #ifndef QT_NO_REGEXP_INTERVAL | ||
1173 | int getRep( int def ); | ||
1174 | #endif | ||
1175 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
1176 | void skipChars( int n ); | ||
1177 | #endif | ||
1178 | void error( const char *msg ); | ||
1179 | void startTokenizer( const QChar *rx, int len ); | ||
1180 | int getToken(); | ||
1181 | |||
1182 | const QChar *yyIn; // a pointer to the input regular expression pattern | ||
1183 | int yyPos0; // the position of yyTok in the input pattern | ||
1184 | int yyPos; // the position of the next character to read | ||
1185 | int yyLen; // the length of yyIn | ||
1186 | int yyCh; // the last character read | ||
1187 | CharClass *yyCharClass; // attribute for Tok_CharClass tokens | ||
1188 | int yyMinRep; // attribute for Tok_Quantifier | ||
1189 | int yyMaxRep; // ditto | ||
1190 | QString yyError; // syntax error or overflow during parsing? | ||
1191 | |||
1192 | /* | ||
1193 | This is the syntactic analyzer for regular expressions. | ||
1194 | */ | ||
1195 | int parse( const QChar *rx, int len ); | ||
1196 | void parseAtom( Box *box ); | ||
1197 | void parseFactor( Box *box ); | ||
1198 | void parseTerm( Box *box ); | ||
1199 | void parseExpression( Box *box ); | ||
1200 | |||
1201 | int yyTok; // the last token read | ||
1202 | bool yyMayCapture; // set this to FALSE to disable capturing | ||
1203 | |||
1204 | /* | ||
1205 | This is the engine state during matching. | ||
1206 | */ | ||
1207 | const QString *mmStr; // a pointer to the input QString | ||
1208 | const QChar *mmIn; // a pointer to the input string data | ||
1209 | int mmPos; // the current position in the string | ||
1210 | int mmCaretPos; | ||
1211 | int mmLen; // the length of the input string | ||
1212 | bool mmMinimal; // minimal matching? | ||
1213 | QMemArray<int> mmCaptured; // an array of pairs (start, len) | ||
1214 | QMemArray<int> mmCapturedNoMatch; // an array of pairs (-1, -1) | ||
1215 | QMemArray<int> mmBigArray; // big QMemArray<int> array | ||
1216 | int *mmInNextStack; // is state is mmNextStack? | ||
1217 | int *mmCurStack; // stack of current states | ||
1218 | int *mmNextStack; // stack of next states | ||
1219 | int *mmCurCapBegin; // start of current states' captures | ||
1220 | int *mmNextCapBegin; // start of next states' captures | ||
1221 | int *mmCurCapEnd; // end of current states' captures | ||
1222 | int *mmNextCapEnd; // end of next states' captures | ||
1223 | int *mmTempCapBegin; // start of temporary captures | ||
1224 | int *mmTempCapEnd; // end of temporary captures | ||
1225 | int *mmCapBegin; // start of captures for a next state | ||
1226 | int *mmCapEnd; // end of captures for a next state | ||
1227 | int *mmSlideTab; // bump-along slide table for bad-character heuristic | ||
1228 | int mmSlideTabSize; // size of slide table | ||
1229 | #ifndef QT_NO_REGEXP_BACKREF | ||
1230 | QIntDict<int> mmSleeping; // dictionary of back-reference sleepers | ||
1231 | #endif | ||
1232 | int mmMatchLen; // length of match | ||
1233 | int mmMatchedLen; // length of partial match | ||
1234 | }; | ||
1235 | |||
1236 | QRegExpEngine::QRegExpEngine( const QString& rx, bool caseSensitive ) | ||
1237 | #ifndef QT_NO_REGEXP_BACKREF | ||
1238 | : mmSleeping( 101 ) | ||
1239 | #endif | ||
1240 | { | ||
1241 | setup( caseSensitive ); | ||
1242 | valid = ( parse(rx.unicode(), rx.length()) == (int) rx.length() ); | ||
1243 | if ( !valid ) | ||
1244 | error( RXERR_LEFTDELIM ); | ||
1245 | } | ||
1246 | |||
1247 | #ifndef QT_NO_REGEXP_OPTIM | ||
1248 | QRegExpEngine::~QRegExpEngine() | ||
1249 | { | ||
1250 | } | ||
1251 | #endif | ||
1252 | |||
1253 | /* | ||
1254 | Tries to match in str and returns an array of (begin, length) pairs | ||
1255 | for captured text. If there is no match, all pairs are (-1, -1). | ||
1256 | */ | ||
1257 | QMemArray<int> QRegExpEngine::match( const QString& str, int pos, bool minimal, | ||
1258 | bool oneTest, int caretIndex ) | ||
1259 | { | ||
1260 | mmStr = &str; | ||
1261 | mmIn = str.unicode(); | ||
1262 | if ( mmIn == 0 ) | ||
1263 | mmIn = &QChar::null; | ||
1264 | mmPos = pos; | ||
1265 | mmCaretPos = caretIndex; | ||
1266 | mmLen = str.length(); | ||
1267 | mmMinimal = minimal; | ||
1268 | mmMatchLen = 0; | ||
1269 | mmMatchedLen = 0; | ||
1270 | |||
1271 | bool matched = FALSE; | ||
1272 | if ( valid && mmPos >= 0 && mmPos <= mmLen ) { | ||
1273 | #ifndef QT_NO_REGEXP_OPTIM | ||
1274 | if ( oneTest ) { | ||
1275 | matched = matchHere(); | ||
1276 | } else { | ||
1277 | if ( mmPos <= mmLen - minl ) { | ||
1278 | if ( caretAnchored ) { | ||
1279 | matched = matchHere(); | ||
1280 | } else if ( useGoodStringHeuristic ) { | ||
1281 | matched = goodStringMatch(); | ||
1282 | } else { | ||
1283 | matched = badCharMatch(); | ||
1284 | } | ||
1285 | } | ||
1286 | } | ||
1287 | #else | ||
1288 | matched = oneTest ? matchHere() : bruteMatch(); | ||
1289 | #endif | ||
1290 | } | ||
1291 | |||
1292 | if ( matched ) { | ||
1293 | mmCaptured.detach(); | ||
1294 | mmCaptured[0] = mmPos; | ||
1295 | mmCaptured[1] = mmMatchLen; | ||
1296 | for ( int j = 0; j < officialncap; j++ ) { | ||
1297 | int len = mmCapEnd[j] - mmCapBegin[j]; | ||
1298 | mmCaptured[2 + 2 * j] = len > 0 ? mmPos + mmCapBegin[j] : 0; | ||
1299 | mmCaptured[2 + 2 * j + 1] = len; | ||
1300 | } | ||
1301 | return mmCaptured; | ||
1302 | } else { | ||
1303 | return mmCapturedNoMatch; | ||
1304 | } | ||
1305 | } | ||
1306 | |||
1307 | /* | ||
1308 | The three following functions add one state to the automaton and | ||
1309 | return the number of the state. | ||
1310 | */ | ||
1311 | |||
1312 | int QRegExpEngine::createState( QChar ch ) | ||
1313 | { | ||
1314 | return setupState( ch.unicode() ); | ||
1315 | } | ||
1316 | |||
1317 | int QRegExpEngine::createState( const CharClass& cc ) | ||
1318 | { | ||
1319 | #ifndef QT_NO_REGEXP_CCLASS | ||
1320 | int n = cl.size(); | ||
1321 | cl.resize( n + 1 ); | ||
1322 | cl.insert( n, new CharClass(cc) ); | ||
1323 | return setupState( CharClassBit | n ); | ||
1324 | #else | ||
1325 | Q_UNUSED( cc ); | ||
1326 | return setupState( CharClassBit ); | ||
1327 | #endif | ||
1328 | } | ||
1329 | |||
1330 | #ifndef QT_NO_REGEXP_BACKREF | ||
1331 | int QRegExpEngine::createState( int bref ) | ||
1332 | { | ||
1333 | if ( bref > nbrefs ) { | ||
1334 | nbrefs = bref; | ||
1335 | if ( nbrefs > MaxBackRefs ) { | ||
1336 | error( RXERR_LIMIT ); | ||
1337 | return 0; | ||
1338 | } | ||
1339 | } | ||
1340 | return setupState( BackRefBit | bref ); | ||
1341 | } | ||
1342 | #endif | ||
1343 | |||
1344 | /* | ||
1345 | The two following functions add a transition between all pairs of | ||
1346 | states (i, j) where i is fond in from, and j is found in to. | ||
1347 | |||
1348 | Cat-transitions are distinguished from plus-transitions for | ||
1349 | capturing. | ||
1350 | */ | ||
1351 | |||
1352 | void QRegExpEngine::addCatTransitions( const QMemArray<int>& from, | ||
1353 | const QMemArray<int>& to ) | ||
1354 | { | ||
1355 | for ( int i = 0; i < (int) from.size(); i++ ) { | ||
1356 | State *st = s[from[i]]; | ||
1357 | mergeInto( &st->outs, to ); | ||
1358 | } | ||
1359 | } | ||
1360 | |||
1361 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1362 | void QRegExpEngine::addPlusTransitions( const QMemArray<int>& from, | ||
1363 | const QMemArray<int>& to, int atom ) | ||
1364 | { | ||
1365 | for ( int i = 0; i < (int) from.size(); i++ ) { | ||
1366 | State *st = s[from[i]]; | ||
1367 | QMemArray<int> oldOuts = st->outs.copy(); | ||
1368 | mergeInto( &st->outs, to ); | ||
1369 | if ( f[atom].capture >= 0 ) { | ||
1370 | if ( st->reenter == 0 ) | ||
1371 | st->reenter = new QMap<int, int>; | ||
1372 | for ( int j = 0; j < (int) to.size(); j++ ) { | ||
1373 | if ( !st->reenter->contains(to[j]) && | ||
1374 | oldOuts.bsearch(to[j]) < 0 ) | ||
1375 | st->reenter->insert( to[j], atom ); | ||
1376 | } | ||
1377 | } | ||
1378 | } | ||
1379 | } | ||
1380 | #endif | ||
1381 | |||
1382 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | ||
1383 | /* | ||
1384 | Returns an anchor that means a OR b. | ||
1385 | */ | ||
1386 | int QRegExpEngine::anchorAlternation( int a, int b ) | ||
1387 | { | ||
1388 | if ( ((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0 ) | ||
1389 | return a & b; | ||
1390 | |||
1391 | int n = aa.size(); | ||
1392 | #ifndef QT_NO_REGEXP_OPTIM | ||
1393 | if ( n > 0 && aa[n - 1].a == a && aa[n - 1].b == b ) | ||
1394 | return Anchor_Alternation | ( n - 1 ); | ||
1395 | #endif | ||
1396 | |||
1397 | aa.resize( n + 1 ); | ||
1398 | aa[n].a = a; | ||
1399 | aa[n].b = b; | ||
1400 | return Anchor_Alternation | n; | ||
1401 | } | ||
1402 | |||
1403 | /* | ||
1404 | Returns an anchor that means a AND b. | ||
1405 | */ | ||
1406 | int QRegExpEngine::anchorConcatenation( int a, int b ) | ||
1407 | { | ||
1408 | if ( ((a | b) & Anchor_Alternation) == 0 ) | ||
1409 | return a | b; | ||
1410 | if ( (b & Anchor_Alternation) != 0 ) | ||
1411 | qSwap( a, b ); | ||
1412 | |||
1413 | int aprime = anchorConcatenation( aa[a ^ Anchor_Alternation].a, b ); | ||
1414 | int bprime = anchorConcatenation( aa[a ^ Anchor_Alternation].b, b ); | ||
1415 | return anchorAlternation( aprime, bprime ); | ||
1416 | } | ||
1417 | #endif | ||
1418 | |||
1419 | /* | ||
1420 | Adds anchor a on a transition caracterised by its from state and | ||
1421 | its to state. | ||
1422 | */ | ||
1423 | void QRegExpEngine::addAnchors( int from, int to, int a ) | ||
1424 | { | ||
1425 | State *st = s[from]; | ||
1426 | if ( st->anchors == 0 ) | ||
1427 | st->anchors = new QMap<int, int>; | ||
1428 | if ( st->anchors->contains(to) ) | ||
1429 | a = anchorAlternation( (*st->anchors)[to], a ); | ||
1430 | st->anchors->insert( to, a ); | ||
1431 | } | ||
1432 | |||
1433 | #ifndef QT_NO_REGEXP_OPTIM | ||
1434 | /* | ||
1435 | The two following functions provide the engine with the information | ||
1436 | needed by its matching heuristics. | ||
1437 | */ | ||
1438 | |||
1439 | void QRegExpEngine::setupGoodStringHeuristic( int earlyStart, int lateStart, | ||
1440 | const QString& str ) | ||
1441 | { | ||
1442 | goodEarlyStart = earlyStart; | ||
1443 | goodLateStart = lateStart; | ||
1444 | goodStr = cs ? str : str.lower(); | ||
1445 | } | ||
1446 | |||
1447 | void QRegExpEngine::setupBadCharHeuristic( int minLen, | ||
1448 | const QMemArray<int>& firstOcc ) | ||
1449 | { | ||
1450 | minl = minLen; | ||
1451 | if ( cs ) { | ||
1452 | occ1 = firstOcc; | ||
1453 | } else { | ||
1454 | occ1.fill( 0, NumBadChars ); | ||
1455 | } | ||
1456 | } | ||
1457 | |||
1458 | /* | ||
1459 | This function chooses between the good-string and the bad-character | ||
1460 | heuristics. It computes two scores and chooses the heuristic with | ||
1461 | the highest score. | ||
1462 | |||
1463 | Here are some common-sense constraints on the scores that should be | ||
1464 | respected if the formulas are ever modified: (1) If goodStr is | ||
1465 | empty, the good-string heuristic scores 0. (2) If the search is | ||
1466 | case insensitive, the good-string heuristic should be used, unless | ||
1467 | it scores 0. (Case insensitivity turns all entries of occ1 to 0.) | ||
1468 | (3) If (goodLateStart - goodEarlyStart) is big, the good-string | ||
1469 | heuristic should score less. | ||
1470 | */ | ||
1471 | void QRegExpEngine::heuristicallyChooseHeuristic() | ||
1472 | { | ||
1473 | int i; | ||
1474 | |||
1475 | if ( minl == 0 ) | ||
1476 | return; | ||
1477 | |||
1478 | /* | ||
1479 | Magic formula: The good string has to constitute a good | ||
1480 | proportion of the minimum-length string, and appear at a | ||
1481 | more-or-less known index. | ||
1482 | */ | ||
1483 | int goodStringScore = ( 64 * goodStr.length() / minl ) - | ||
1484 | ( goodLateStart - goodEarlyStart ); | ||
1485 | |||
1486 | /* | ||
1487 | Less magic formula: We pick a couple of characters at random, | ||
1488 | and check whether they are good or bad. | ||
1489 | */ | ||
1490 | int badCharScore = 0; | ||
1491 | int step = QMAX( 1, NumBadChars / 32 ); | ||
1492 | for ( i = 1; i < NumBadChars; i += step ) { | ||
1493 | if ( occ1[i] == NoOccurrence ) | ||
1494 | badCharScore += minl; | ||
1495 | else | ||
1496 | badCharScore += occ1[i]; | ||
1497 | } | ||
1498 | badCharScore /= minl; | ||
1499 | |||
1500 | useGoodStringHeuristic = ( goodStringScore > badCharScore ); | ||
1501 | } | ||
1502 | #endif | ||
1503 | |||
1504 | #if defined(QT_DEBUG) | ||
1505 | void QRegExpEngine::dump() const | ||
1506 | { | ||
1507 | int i, j; | ||
1508 | qDebug( "Case %ssensitive engine", cs ? "" : "in" ); | ||
1509 | qDebug( " States" ); | ||
1510 | for ( i = 0; i < ns; i++ ) { | ||
1511 | qDebug( " %d%s", i, | ||
1512 | i == InitialState ? " (initial)" : | ||
1513 | i == FinalState ? " (final)" : "" ); | ||
1514 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1515 | qDebug( " in atom %d", s[i]->atom ); | ||
1516 | #endif | ||
1517 | int m = s[i]->match; | ||
1518 | if ( (m & CharClassBit) != 0 ) { | ||
1519 | qDebug( " match character class %d", m ^ CharClassBit ); | ||
1520 | #ifndef QT_NO_REGEXP_CCLASS | ||
1521 | cl[m ^ CharClassBit]->dump(); | ||
1522 | #else | ||
1523 | qDebug( " negative character class" ); | ||
1524 | #endif | ||
1525 | } else if ( (m & BackRefBit) != 0 ) { | ||
1526 | qDebug( " match back-reference %d", m ^ BackRefBit ); | ||
1527 | } else if ( m >= 0x20 && m <= 0x7e ) { | ||
1528 | qDebug( " match 0x%.4x (%c)", m, m ); | ||
1529 | } else { | ||
1530 | qDebug( " match 0x%.4x", m ); | ||
1531 | } | ||
1532 | for ( j = 0; j < (int) s[i]->outs.size(); j++ ) { | ||
1533 | int next = s[i]->outs[j]; | ||
1534 | qDebug( " -> %d", next ); | ||
1535 | if ( s[i]->reenter != 0 && s[i]->reenter->contains(next) ) | ||
1536 | qDebug( " [reenter %d]", (*s[i]->reenter)[next] ); | ||
1537 | if ( s[i]->anchors != 0 && at(*s[i]->anchors, next) != 0 ) | ||
1538 | qDebug( " [anchors 0x%.8x]", (*s[i]->anchors)[next] ); | ||
1539 | } | ||
1540 | } | ||
1541 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1542 | if ( nf > 0 ) { | ||
1543 | qDebug( " Atom Parent Capture" ); | ||
1544 | for ( i = 0; i < nf; i++ ) | ||
1545 | qDebug( " %6d %6d %6d", i, f[i].parent, f[i].capture ); | ||
1546 | } | ||
1547 | #endif | ||
1548 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | ||
1549 | for ( i = 0; i < (int) aa.size(); i++ ) | ||
1550 | qDebug( " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, | ||
1551 | aa[i].b ); | ||
1552 | #endif | ||
1553 | } | ||
1554 | #endif | ||
1555 | |||
1556 | void QRegExpEngine::setup( bool caseSensitive ) | ||
1557 | { | ||
1558 | s.setAutoDelete( TRUE ); | ||
1559 | s.resize( 32 ); | ||
1560 | ns = 0; | ||
1561 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1562 | f.resize( 32 ); | ||
1563 | nf = 0; | ||
1564 | cf = -1; | ||
1565 | #endif | ||
1566 | officialncap = 0; | ||
1567 | ncap = 0; | ||
1568 | #ifndef QT_NO_REGEXP_CCLASS | ||
1569 | cl.setAutoDelete( TRUE ); | ||
1570 | #endif | ||
1571 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
1572 | ahead.setAutoDelete( TRUE ); | ||
1573 | #endif | ||
1574 | #ifndef QT_NO_REGEXP_OPTIM | ||
1575 | caretAnchored = TRUE; | ||
1576 | #endif | ||
1577 | valid = FALSE; | ||
1578 | cs = caseSensitive; | ||
1579 | #ifndef QT_NO_REGEXP_BACKREF | ||
1580 | nbrefs = 0; | ||
1581 | #endif | ||
1582 | #ifndef QT_NO_REGEXP_OPTIM | ||
1583 | useGoodStringHeuristic = FALSE; | ||
1584 | minl = 0; | ||
1585 | occ1.fill( 0, NumBadChars ); | ||
1586 | #endif | ||
1587 | mmCapturedNoMatch.fill( -1, 2 ); | ||
1588 | } | ||
1589 | |||
1590 | int QRegExpEngine::setupState( int match ) | ||
1591 | { | ||
1592 | if ( (ns & (ns + 1)) == 0 && ns + 1 >= (int) s.size() ) | ||
1593 | s.resize( (ns + 1) << 1 ); | ||
1594 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1595 | s.insert( ns, new State(cf, match) ); | ||
1596 | #else | ||
1597 | s.insert( ns, new State(match) ); | ||
1598 | #endif | ||
1599 | return ns++; | ||
1600 | } | ||
1601 | |||
1602 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1603 | /* | ||
1604 | Functions startAtom() and finishAtom() should be called to delimit | ||
1605 | atoms. When a state is created, it is assigned to the current atom. | ||
1606 | The information is later used for capturing. | ||
1607 | */ | ||
1608 | int QRegExpEngine::startAtom( bool capture ) | ||
1609 | { | ||
1610 | if ( (nf & (nf + 1)) == 0 && nf + 1 >= (int) f.size() ) | ||
1611 | f.resize( (nf + 1) << 1 ); | ||
1612 | f[nf].parent = cf; | ||
1613 | cf = nf++; | ||
1614 | f[cf].capture = capture ? ncap++ : -1; | ||
1615 | return cf; | ||
1616 | } | ||
1617 | #endif | ||
1618 | |||
1619 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
1620 | /* | ||
1621 | Creates a lookahead anchor. | ||
1622 | */ | ||
1623 | int QRegExpEngine::addLookahead( QRegExpEngine *eng, bool negative ) | ||
1624 | { | ||
1625 | int n = ahead.size(); | ||
1626 | if ( n == MaxLookaheads ) { | ||
1627 | error( RXERR_LIMIT ); | ||
1628 | return 0; | ||
1629 | } | ||
1630 | ahead.resize( n + 1 ); | ||
1631 | ahead.insert( n, new Lookahead(eng, negative) ); | ||
1632 | return Anchor_FirstLookahead << n; | ||
1633 | } | ||
1634 | #endif | ||
1635 | |||
1636 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1637 | /* | ||
1638 | We want the longest leftmost captures. | ||
1639 | */ | ||
1640 | bool QRegExpEngine::isBetterCapture( const int *begin1, const int *end1, | ||
1641 | const int *begin2, const int *end2 ) | ||
1642 | { | ||
1643 | for ( int i = 0; i < ncap; i++ ) { | ||
1644 | int delta = begin2[i] - begin1[i]; // it has to start early... | ||
1645 | if ( delta == 0 ) | ||
1646 | delta = end1[i] - end2[i]; // ...and end late (like a party) | ||
1647 | |||
1648 | if ( delta != 0 ) | ||
1649 | return delta > 0; | ||
1650 | } | ||
1651 | return FALSE; | ||
1652 | } | ||
1653 | #endif | ||
1654 | |||
1655 | /* | ||
1656 | Returns TRUE if anchor a matches at position mmPos + i in the input | ||
1657 | string, otherwise FALSE. | ||
1658 | */ | ||
1659 | bool QRegExpEngine::testAnchor( int i, int a, const int *capBegin ) | ||
1660 | { | ||
1661 | int j; | ||
1662 | |||
1663 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | ||
1664 | if ( (a & Anchor_Alternation) != 0 ) { | ||
1665 | return testAnchor( i, aa[a ^ Anchor_Alternation].a, capBegin ) || | ||
1666 | testAnchor( i, aa[a ^ Anchor_Alternation].b, capBegin ); | ||
1667 | } | ||
1668 | #endif | ||
1669 | |||
1670 | if ( (a & Anchor_Caret) != 0 ) { | ||
1671 | if ( mmPos + i != mmCaretPos ) | ||
1672 | return FALSE; | ||
1673 | } | ||
1674 | if ( (a & Anchor_Dollar) != 0 ) { | ||
1675 | if ( mmPos + i != mmLen ) | ||
1676 | return FALSE; | ||
1677 | } | ||
1678 | #ifndef QT_NO_REGEXP_ESCAPE | ||
1679 | if ( (a & (Anchor_Word | Anchor_NonWord)) != 0 ) { | ||
1680 | bool before = FALSE; | ||
1681 | bool after = FALSE; | ||
1682 | if ( mmPos + i != 0 ) | ||
1683 | before = mmIn[mmPos + i - 1].isLetterOrNumber(); | ||
1684 | if ( mmPos + i != mmLen ) | ||
1685 | after = mmIn[mmPos + i].isLetterOrNumber(); | ||
1686 | if ( (a & Anchor_Word) != 0 && (before == after) ) | ||
1687 | return FALSE; | ||
1688 | if ( (a & Anchor_NonWord) != 0 && (before != after) ) | ||
1689 | return FALSE; | ||
1690 | } | ||
1691 | #endif | ||
1692 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
1693 | bool catchx = TRUE; | ||
1694 | |||
1695 | if ( (a & Anchor_LookaheadMask) != 0 ) { | ||
1696 | QConstString cstr = QConstString( (QChar *) mmIn + mmPos + i, | ||
1697 | mmLen - mmPos - i ); | ||
1698 | for ( j = 0; j < (int) ahead.size(); j++ ) { | ||
1699 | if ( (a & (Anchor_FirstLookahead << j)) != 0 ) { | ||
1700 | catchx = ahead[j]->eng->match( cstr.string(), 0, TRUE, TRUE, | ||
1701 | mmCaretPos - mmPos - i )[0] == 0; | ||
1702 | if ( catchx == ahead[j]->neg ) | ||
1703 | return FALSE; | ||
1704 | } | ||
1705 | } | ||
1706 | } | ||
1707 | #endif | ||
1708 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1709 | #ifndef QT_NO_REGEXP_BACKREF | ||
1710 | for ( j = 0; j < nbrefs; j++ ) { | ||
1711 | if ( (a & (Anchor_BackRef1Empty << j)) != 0 ) { | ||
1712 | if ( capBegin[j] != EmptyCapture ) | ||
1713 | return FALSE; | ||
1714 | } | ||
1715 | } | ||
1716 | #endif | ||
1717 | #endif | ||
1718 | return TRUE; | ||
1719 | } | ||
1720 | |||
1721 | #ifndef QT_NO_REGEXP_OPTIM | ||
1722 | /* | ||
1723 | The three following functions are what Jeffrey Friedl would call | ||
1724 | transmissions (or bump-alongs). Using one or the other should make | ||
1725 | no difference except in performance. | ||
1726 | */ | ||
1727 | |||
1728 | bool QRegExpEngine::goodStringMatch() | ||
1729 | { | ||
1730 | int k = mmPos + goodEarlyStart; | ||
1731 | |||
1732 | while ( (k = mmStr->find(goodStr, k, cs)) != -1 ) { | ||
1733 | int from = k - goodLateStart; | ||
1734 | int to = k - goodEarlyStart; | ||
1735 | if ( from > mmPos ) | ||
1736 | mmPos = from; | ||
1737 | |||
1738 | while ( mmPos <= to ) { | ||
1739 | if ( matchHere() ) | ||
1740 | return TRUE; | ||
1741 | mmPos++; | ||
1742 | } | ||
1743 | k++; | ||
1744 | } | ||
1745 | return FALSE; | ||
1746 | } | ||
1747 | |||
1748 | bool QRegExpEngine::badCharMatch() | ||
1749 | { | ||
1750 | int slideHead = 0; | ||
1751 | int slideNext = 0; | ||
1752 | int i; | ||
1753 | int lastPos = mmLen - minl; | ||
1754 | memset( mmSlideTab, 0, mmSlideTabSize * sizeof(int) ); | ||
1755 | |||
1756 | /* | ||
1757 | Set up the slide table, used for the bad-character heuristic, | ||
1758 | using the table of first occurrence of each character. | ||
1759 | */ | ||
1760 | for ( i = 0; i < minl; i++ ) { | ||
1761 | int sk = occ1[BadChar(mmIn[mmPos + i])]; | ||
1762 | if ( sk == NoOccurrence ) | ||
1763 | sk = i + 1; | ||
1764 | if ( sk > 0 ) { | ||
1765 | int k = i + 1 - sk; | ||
1766 | if ( k < 0 ) { | ||
1767 | sk = i + 1; | ||
1768 | k = 0; | ||
1769 | } | ||
1770 | if ( sk > mmSlideTab[k] ) | ||
1771 | mmSlideTab[k] = sk; | ||
1772 | } | ||
1773 | } | ||
1774 | |||
1775 | if ( mmPos > lastPos ) | ||
1776 | return FALSE; | ||
1777 | |||
1778 | for ( ;; ) { | ||
1779 | if ( ++slideNext >= mmSlideTabSize ) | ||
1780 | slideNext = 0; | ||
1781 | if ( mmSlideTab[slideHead] > 0 ) { | ||
1782 | if ( mmSlideTab[slideHead] - 1 > mmSlideTab[slideNext] ) | ||
1783 | mmSlideTab[slideNext] = mmSlideTab[slideHead] - 1; | ||
1784 | mmSlideTab[slideHead] = 0; | ||
1785 | } else { | ||
1786 | if ( matchHere() ) | ||
1787 | return TRUE; | ||
1788 | } | ||
1789 | |||
1790 | if ( mmPos == lastPos ) | ||
1791 | break; | ||
1792 | |||
1793 | /* | ||
1794 | Update the slide table. This code has much in common with | ||
1795 | the initialization code. | ||
1796 | */ | ||
1797 | int sk = occ1[BadChar(mmIn[mmPos + minl])]; | ||
1798 | if ( sk == NoOccurrence ) { | ||
1799 | mmSlideTab[slideNext] = minl; | ||
1800 | } else if ( sk > 0 ) { | ||
1801 | int k = slideNext + minl - sk; | ||
1802 | if ( k >= mmSlideTabSize ) | ||
1803 | k -= mmSlideTabSize; | ||
1804 | if ( sk > mmSlideTab[k] ) | ||
1805 | mmSlideTab[k] = sk; | ||
1806 | } | ||
1807 | slideHead = slideNext; | ||
1808 | mmPos++; | ||
1809 | } | ||
1810 | return FALSE; | ||
1811 | } | ||
1812 | #else | ||
1813 | bool QRegExpEngine::bruteMatch() | ||
1814 | { | ||
1815 | while ( mmPos <= mmLen ) { | ||
1816 | if ( matchHere() ) | ||
1817 | return TRUE; | ||
1818 | mmPos++; | ||
1819 | } | ||
1820 | return FALSE; | ||
1821 | } | ||
1822 | #endif | ||
1823 | |||
1824 | /* | ||
1825 | Here's the core of the engine. It tries to do a match here and now. | ||
1826 | */ | ||
1827 | bool QRegExpEngine::matchHere() | ||
1828 | { | ||
1829 | int ncur = 1, nnext = 0; | ||
1830 | int i = 0, j, k, m; | ||
1831 | bool stop = FALSE; | ||
1832 | |||
1833 | mmMatchLen = -1; | ||
1834 | mmMatchedLen = -1; | ||
1835 | mmCurStack[0] = InitialState; | ||
1836 | |||
1837 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1838 | if ( ncap > 0 ) { | ||
1839 | for ( j = 0; j < ncap; j++ ) { | ||
1840 | mmCurCapBegin[j] = EmptyCapture; | ||
1841 | mmCurCapEnd[j] = EmptyCapture; | ||
1842 | } | ||
1843 | } | ||
1844 | #endif | ||
1845 | |||
1846 | #ifndef QT_NO_REGEXP_BACKREF | ||
1847 | int *zzZ = 0; | ||
1848 | |||
1849 | while ( (ncur > 0 || !mmSleeping.isEmpty()) && i <= mmLen - mmPos && | ||
1850 | !stop ) | ||
1851 | #else | ||
1852 | while ( ncur > 0 && i <= mmLen - mmPos && !stop ) | ||
1853 | #endif | ||
1854 | { | ||
1855 | int ch = ( i < mmLen - mmPos ) ? mmIn[mmPos + i].unicode() : 0; | ||
1856 | for ( j = 0; j < ncur; j++ ) { | ||
1857 | int cur = mmCurStack[j]; | ||
1858 | State *scur = s[cur]; | ||
1859 | QMemArray<int>& outs = scur->outs; | ||
1860 | for ( k = 0; k < (int) outs.size(); k++ ) { | ||
1861 | int next = outs[k]; | ||
1862 | State *snext = s[next]; | ||
1863 | bool in = TRUE; | ||
1864 | #ifndef QT_NO_REGEXP_BACKREF | ||
1865 | int needSomeSleep = 0; | ||
1866 | #endif | ||
1867 | |||
1868 | /* | ||
1869 | First, check if the anchors are anchored properly. | ||
1870 | */ | ||
1871 | if ( scur->anchors != 0 ) { | ||
1872 | int a = at( *scur->anchors, next ); | ||
1873 | if ( a != 0 && !testAnchor(i, a, mmCurCapBegin + j * ncap) ) | ||
1874 | in = FALSE; | ||
1875 | } | ||
1876 | /* | ||
1877 | If indeed they are, check if the input character is | ||
1878 | correct for this transition. | ||
1879 | */ | ||
1880 | if ( in ) { | ||
1881 | m = snext->match; | ||
1882 | if ( (m & (CharClassBit | BackRefBit)) == 0 ) { | ||
1883 | if ( cs ) | ||
1884 | in = ( m == ch ); | ||
1885 | else | ||
1886 | in = ( QChar(m).lower() == QChar(ch).lower() ); | ||
1887 | } else if ( next == FinalState ) { | ||
1888 | mmMatchLen = i; | ||
1889 | stop = mmMinimal; | ||
1890 | in = TRUE; | ||
1891 | } else if ( (m & CharClassBit) != 0 ) { | ||
1892 | #ifndef QT_NO_REGEXP_CCLASS | ||
1893 | const CharClass *cc = cl[m ^ CharClassBit]; | ||
1894 | if ( cs ) | ||
1895 | in = cc->in( ch ); | ||
1896 | else if ( cc->negative() ) | ||
1897 | in = cc->in( QChar(ch).lower() ) && | ||
1898 | cc->in( QChar(ch).upper() ); | ||
1899 | else | ||
1900 | in = cc->in( QChar(ch).lower() ) || | ||
1901 | cc->in( QChar(ch).upper() ); | ||
1902 | #endif | ||
1903 | #ifndef QT_NO_REGEXP_BACKREF | ||
1904 | } else { /* ( (m & BackRefBit) != 0 ) */ | ||
1905 | int bref = m ^ BackRefBit; | ||
1906 | int ell = j * ncap + ( bref - 1 ); | ||
1907 | |||
1908 | in = bref <= ncap && mmCurCapBegin[ell] != EmptyCapture; | ||
1909 | if ( in ) { | ||
1910 | if ( cs ) | ||
1911 | in = ( mmIn[mmPos + mmCurCapBegin[ell]] | ||
1912 | == QChar(ch) ); | ||
1913 | else | ||
1914 | in = ( mmIn[mmPos + mmCurCapBegin[ell]].lower() | ||
1915 | == QChar(ch).lower() ); | ||
1916 | } | ||
1917 | |||
1918 | if ( in ) { | ||
1919 | int delta; | ||
1920 | if ( mmCurCapEnd[ell] == EmptyCapture ) | ||
1921 | delta = i - mmCurCapBegin[ell]; | ||
1922 | else | ||
1923 | delta = mmCurCapEnd[ell] - mmCurCapBegin[ell]; | ||
1924 | |||
1925 | in = ( delta <= mmLen - (mmPos + i) ); | ||
1926 | if ( in && delta > 1 ) { | ||
1927 | int n = 1; | ||
1928 | if ( cs ) { | ||
1929 | while ( n < delta ) { | ||
1930 | if ( mmIn[mmPos + | ||
1931 | mmCurCapBegin[ell] + n] != | ||
1932 | mmIn[mmPos + i + n] ) | ||
1933 | break; | ||
1934 | n++; | ||
1935 | } | ||
1936 | } else { | ||
1937 | while ( n < delta ) { | ||
1938 | QChar a = mmIn[mmPos + | ||
1939 | mmCurCapBegin[ell] + n]; | ||
1940 | QChar b = mmIn[mmPos + i + n]; | ||
1941 | if ( a.lower() != b.lower() ) | ||
1942 | break; | ||
1943 | n++; | ||
1944 | } | ||
1945 | } | ||
1946 | in = ( n == delta ); | ||
1947 | if ( in ) | ||
1948 | needSomeSleep = delta - 1; | ||
1949 | } | ||
1950 | } | ||
1951 | #endif | ||
1952 | } | ||
1953 | } | ||
1954 | |||
1955 | /* | ||
1956 | We must now update our data structures. | ||
1957 | */ | ||
1958 | if ( in ) { | ||
1959 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1960 | int *capBegin, *capEnd; | ||
1961 | #endif | ||
1962 | /* | ||
1963 | If the next state was not encountered yet, all | ||
1964 | is fine. | ||
1965 | */ | ||
1966 | if ( (m = mmInNextStack[next]) == -1 ) { | ||
1967 | m = nnext++; | ||
1968 | mmNextStack[m] = next; | ||
1969 | mmInNextStack[next] = m; | ||
1970 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1971 | capBegin = mmNextCapBegin + m * ncap; | ||
1972 | capEnd = mmNextCapEnd + m * ncap; | ||
1973 | |||
1974 | /* | ||
1975 | Otherwise, we'll first maintain captures in | ||
1976 | temporary arrays, and decide at the end whether | ||
1977 | it's best to keep the previous capture zones or | ||
1978 | the new ones. | ||
1979 | */ | ||
1980 | } else { | ||
1981 | capBegin = mmTempCapBegin; | ||
1982 | capEnd = mmTempCapEnd; | ||
1983 | #endif | ||
1984 | } | ||
1985 | |||
1986 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1987 | /* | ||
1988 | Updating the capture zones is much of a task. | ||
1989 | */ | ||
1990 | if ( ncap > 0 ) { | ||
1991 | memcpy( capBegin, mmCurCapBegin + j * ncap, | ||
1992 | ncap * sizeof(int) ); | ||
1993 | memcpy( capEnd, mmCurCapEnd + j * ncap, | ||
1994 | ncap * sizeof(int) ); | ||
1995 | int c = scur->atom, n = snext->atom; | ||
1996 | int p = -1, q = -1; | ||
1997 | int cap; | ||
1998 | |||
1999 | /* | ||
2000 | Lemma 1. For any x in the range [0..nf), we | ||
2001 | have f[x].parent < x. | ||
2002 | |||
2003 | Proof. By looking at startAtom(), it is | ||
2004 | clear that cf < nf holds all the time, and | ||
2005 | thus that f[nf].parent < nf. | ||
2006 | */ | ||
2007 | |||
2008 | /* | ||
2009 | If we are reentering an atom, we empty all | ||
2010 | capture zones inside it. | ||
2011 | */ | ||
2012 | if ( scur->reenter != 0 && | ||
2013 | (q = at(*scur->reenter, next)) != 0 ) { | ||
2014 | QBitArray b; | ||
2015 | b.fill( FALSE, nf ); | ||
2016 | b.setBit( q, TRUE ); | ||
2017 | for ( int ell = q + 1; ell < nf; ell++ ) { | ||
2018 | if ( b.testBit(f[ell].parent) ) { | ||
2019 | b.setBit( ell, TRUE ); | ||
2020 | cap = f[ell].capture; | ||
2021 | if ( cap >= 0 ) { | ||
2022 | capBegin[cap] = EmptyCapture; | ||
2023 | capEnd[cap] = EmptyCapture; | ||
2024 | } | ||
2025 | } | ||
2026 | } | ||
2027 | p = f[q].parent; | ||
2028 | |||
2029 | /* | ||
2030 | Otherwise, close the capture zones we are | ||
2031 | leaving. We are leaving f[c].capture, | ||
2032 | f[f[c].parent].capture, | ||
2033 | f[f[f[c].parent].parent].capture, ..., | ||
2034 | until f[x].capture, with x such that | ||
2035 | f[x].parent is the youngest common ancestor | ||
2036 | for c and n. | ||
2037 | |||
2038 | We go up along c's and n's ancestry until | ||
2039 | we find x. | ||
2040 | */ | ||
2041 | } else { | ||
2042 | p = c; | ||
2043 | q = n; | ||
2044 | while ( p != q ) { | ||
2045 | if ( p > q ) { | ||
2046 | cap = f[p].capture; | ||
2047 | if ( cap >= 0 ) { | ||
2048 | if ( capBegin[cap] == i ) { | ||
2049 | capBegin[cap] = EmptyCapture; | ||
2050 | capEnd[cap] = EmptyCapture; | ||
2051 | } else { | ||
2052 | capEnd[cap] = i; | ||
2053 | } | ||
2054 | } | ||
2055 | p = f[p].parent; | ||
2056 | } else { | ||
2057 | q = f[q].parent; | ||
2058 | } | ||
2059 | } | ||
2060 | } | ||
2061 | |||
2062 | /* | ||
2063 | In any case, we now open the capture zones | ||
2064 | we are entering. We work upwards from n | ||
2065 | until we reach p (the parent of the atom we | ||
2066 | reenter or the youngest common ancestor). | ||
2067 | */ | ||
2068 | while ( n > p ) { | ||
2069 | cap = f[n].capture; | ||
2070 | if ( cap >= 0 ) { | ||
2071 | capBegin[cap] = i; | ||
2072 | capEnd[cap] = EmptyCapture; | ||
2073 | } | ||
2074 | n = f[n].parent; | ||
2075 | } | ||
2076 | /* | ||
2077 | If the next state was already in | ||
2078 | mmNextStack, we must choose carefully which | ||
2079 | capture zones we want to keep. | ||
2080 | */ | ||
2081 | if ( capBegin == mmTempCapBegin && | ||
2082 | isBetterCapture(capBegin, capEnd, | ||
2083 | mmNextCapBegin + m * ncap, | ||
2084 | mmNextCapEnd + m * ncap) ) { | ||
2085 | memcpy( mmNextCapBegin + m * ncap, capBegin, | ||
2086 | ncap * sizeof(int) ); | ||
2087 | memcpy( mmNextCapEnd + m * ncap, capEnd, | ||
2088 | ncap * sizeof(int) ); | ||
2089 | } | ||
2090 | } | ||
2091 | #ifndef QT_NO_REGEXP_BACKREF | ||
2092 | /* | ||
2093 | We are done with updating the capture zones. | ||
2094 | It's now time to put the next state to sleep, | ||
2095 | if it needs to, and to remove it from | ||
2096 | mmNextStack. | ||
2097 | */ | ||
2098 | if ( needSomeSleep > 0 ) { | ||
2099 | zzZ = new int[1 + 2 * ncap]; | ||
2100 | zzZ[0] = next; | ||
2101 | if ( ncap > 0 ) { | ||
2102 | memcpy( zzZ + 1, capBegin, ncap * sizeof(int) ); | ||
2103 | memcpy( zzZ + 1 + ncap, capEnd, | ||
2104 | ncap * sizeof(int) ); | ||
2105 | } | ||
2106 | mmInNextStack[mmNextStack[--nnext]] = -1; | ||
2107 | mmSleeping.insert( i + needSomeSleep, zzZ ); | ||
2108 | } | ||
2109 | #endif | ||
2110 | #endif | ||
2111 | } | ||
2112 | } | ||
2113 | } | ||
2114 | #ifndef QT_NO_REGEXP_CAPTURE | ||
2115 | /* | ||
2116 | If we reached the final state, hurray! Copy the captured | ||
2117 | zone. | ||
2118 | */ | ||
2119 | if ( ncap > 0 && (m = mmInNextStack[FinalState]) != -1 ) { | ||
2120 | memcpy( mmCapBegin, mmNextCapBegin + m * ncap, ncap * sizeof(int) ); | ||
2121 | memcpy( mmCapEnd, mmNextCapEnd + m * ncap, ncap * sizeof(int) ); | ||
2122 | } | ||
2123 | #ifndef QT_NO_REGEXP_BACKREF | ||
2124 | /* | ||
2125 | It's time to wake up the sleepers. | ||
2126 | */ | ||
2127 | if ( !mmSleeping.isEmpty() ) { | ||
2128 | while ( (zzZ = mmSleeping.take(i)) != 0 ) { | ||
2129 | int next = zzZ[0]; | ||
2130 | int *capBegin = zzZ + 1; | ||
2131 | int *capEnd = zzZ + 1 + ncap; | ||
2132 | bool copyOver = TRUE; | ||
2133 | |||
2134 | if ( (m = mmInNextStack[zzZ[0]]) == -1 ) { | ||
2135 | m = nnext++; | ||
2136 | mmNextStack[m] = next; | ||
2137 | mmInNextStack[next] = m; | ||
2138 | } else { | ||
2139 | copyOver = isBetterCapture( mmNextCapBegin + m * ncap, | ||
2140 | mmNextCapEnd + m * ncap, | ||
2141 | capBegin, capEnd ); | ||
2142 | } | ||
2143 | if ( copyOver ) { | ||
2144 | memcpy( mmNextCapBegin + m * ncap, capBegin, | ||
2145 | ncap * sizeof(int) ); | ||
2146 | memcpy( mmNextCapEnd + m * ncap, capEnd, | ||
2147 | ncap * sizeof(int) ); | ||
2148 | } | ||
2149 | delete[] zzZ; | ||
2150 | } | ||
2151 | } | ||
2152 | #endif | ||
2153 | #endif | ||
2154 | for ( j = 0; j < nnext; j++ ) | ||
2155 | mmInNextStack[mmNextStack[j]] = -1; | ||
2156 | |||
2157 | // avoid needless iteration that confuses mmMatchedLen | ||
2158 | if ( nnext == 1 && mmNextStack[0] == FinalState | ||
2159 | #ifndef QT_NO_REGEXP_BACKREF | ||
2160 | && mmSleeping.isEmpty() | ||
2161 | #endif | ||
2162 | ) | ||
2163 | stop = TRUE; | ||
2164 | |||
2165 | qSwap( mmCurStack, mmNextStack ); | ||
2166 | #ifndef QT_NO_REGEXP_CAPTURE | ||
2167 | qSwap( mmCurCapBegin, mmNextCapBegin ); | ||
2168 | qSwap( mmCurCapEnd, mmNextCapEnd ); | ||
2169 | #endif | ||
2170 | ncur = nnext; | ||
2171 | nnext = 0; | ||
2172 | i++; | ||
2173 | } | ||
2174 | |||
2175 | #ifndef QT_NO_REGEXP_BACKREF | ||
2176 | /* | ||
2177 | If minimal matching is enabled, we might have some sleepers | ||
2178 | left. | ||
2179 | */ | ||
2180 | while ( !mmSleeping.isEmpty() ) { | ||
2181 | zzZ = mmSleeping.take( *QIntDictIterator<int>(mmSleeping) ); | ||
2182 | delete[] zzZ; | ||
2183 | } | ||
2184 | #endif | ||
2185 | |||
2186 | mmMatchedLen = i - 1; | ||
2187 | return ( mmMatchLen >= 0 ); | ||
2188 | } | ||
2189 | |||
2190 | #ifndef QT_NO_REGEXP_CCLASS | ||
2191 | |||
2192 | QRegExpEngine::CharClass::CharClass() | ||
2193 | : c( 0 ), n( FALSE ) | ||
2194 | { | ||
2195 | #ifndef QT_NO_REGEXP_OPTIM | ||
2196 | occ1.fill( NoOccurrence, NumBadChars ); | ||
2197 | #endif | ||
2198 | } | ||
2199 | |||
2200 | QRegExpEngine::CharClass& QRegExpEngine::CharClass::operator=( | ||
2201 | const CharClass& cc ) | ||
2202 | { | ||
2203 | c = cc.c; | ||
2204 | r = cc.r.copy(); | ||
2205 | n = cc.n; | ||
2206 | #ifndef QT_NO_REGEXP_OPTIM | ||
2207 | occ1 = cc.occ1; | ||
2208 | #endif | ||
2209 | return *this; | ||
2210 | } | ||
2211 | |||
2212 | void QRegExpEngine::CharClass::clear() | ||
2213 | { | ||
2214 | c = 0; | ||
2215 | r.resize( 0 ); | ||
2216 | n = FALSE; | ||
2217 | } | ||
2218 | |||
2219 | void QRegExpEngine::CharClass::setNegative( bool negative ) | ||
2220 | { | ||
2221 | n = negative; | ||
2222 | #ifndef QT_NO_REGEXP_OPTIM | ||
2223 | occ1.fill( 0, NumBadChars ); | ||
2224 | #endif | ||
2225 | } | ||
2226 | |||
2227 | void QRegExpEngine::CharClass::addCategories( int cats ) | ||
2228 | { | ||
2229 | c |= cats; | ||
2230 | #ifndef QT_NO_REGEXP_OPTIM | ||
2231 | occ1.fill( 0, NumBadChars ); | ||
2232 | #endif | ||
2233 | } | ||
2234 | |||
2235 | void QRegExpEngine::CharClass::addRange( ushort from, ushort to ) | ||
2236 | { | ||
2237 | if ( from > to ) | ||
2238 | qSwap( from, to ); | ||
2239 | int m = r.size(); | ||
2240 | r.resize( m + 1 ); | ||
2241 | r[m].from = from; | ||
2242 | r[m].to = to; | ||
2243 | |||
2244 | #ifndef QT_NO_REGEXP_OPTIM | ||
2245 | int i; | ||
2246 | |||
2247 | if ( to - from < NumBadChars ) { | ||
2248 | occ1.detach(); | ||
2249 | if ( from % NumBadChars <= to % NumBadChars ) { | ||
2250 | for ( i = from % NumBadChars; i <= to % NumBadChars; i++ ) | ||
2251 | occ1[i] = 0; | ||
2252 | } else { | ||
2253 | for ( i = 0; i <= to % NumBadChars; i++ ) | ||
2254 | occ1[i] = 0; | ||
2255 | for ( i = from % NumBadChars; i < NumBadChars; i++ ) | ||
2256 | occ1[i] = 0; | ||
2257 | } | ||
2258 | } else { | ||
2259 | occ1.fill( 0, NumBadChars ); | ||
2260 | } | ||
2261 | #endif | ||
2262 | } | ||
2263 | |||
2264 | bool QRegExpEngine::CharClass::in( QChar ch ) const | ||
2265 | { | ||
2266 | #ifndef QT_NO_REGEXP_OPTIM | ||
2267 | if ( occ1[BadChar(ch)] == NoOccurrence ) | ||
2268 | return n; | ||
2269 | #endif | ||
2270 | |||
2271 | if ( c != 0 && (c & (1 << (int) ch.category())) != 0 ) | ||
2272 | return !n; | ||
2273 | for ( int i = 0; i < (int) r.size(); i++ ) { | ||
2274 | if ( ch.unicode() >= r[i].from && ch.unicode() <= r[i].to ) | ||
2275 | return !n; | ||
2276 | } | ||
2277 | return n; | ||
2278 | } | ||
2279 | |||
2280 | #if defined(QT_DEBUG) | ||
2281 | void QRegExpEngine::CharClass::dump() const | ||
2282 | { | ||
2283 | int i; | ||
2284 | qDebug( " %stive character class", n ? "nega" : "posi" ); | ||
2285 | #ifndef QT_NO_REGEXP_CCLASS | ||
2286 | if ( c != 0 ) | ||
2287 | qDebug( " categories 0x%.8x", c ); | ||
2288 | #endif | ||
2289 | for ( i = 0; i < (int) r.size(); i++ ) | ||
2290 | qDebug( " 0x%.4x through 0x%.4x", r[i].from, r[i].to ); | ||
2291 | } | ||
2292 | #endif | ||
2293 | #endif | ||
2294 | |||
2295 | QRegExpEngine::Box::Box( QRegExpEngine *engine ) | ||
2296 | : eng( engine ), skipanchors( 0 ) | ||
2297 | #ifndef QT_NO_REGEXP_OPTIM | ||
2298 | , earlyStart( 0 ), lateStart( 0 ), maxl( 0 ) | ||
2299 | #endif | ||
2300 | { | ||
2301 | #ifndef QT_NO_REGEXP_OPTIM | ||
2302 | occ1.fill( NoOccurrence, NumBadChars ); | ||
2303 | #endif | ||
2304 | minl = 0; | ||
2305 | } | ||
2306 | |||
2307 | QRegExpEngine::Box& QRegExpEngine::Box::operator=( const Box& b ) | ||
2308 | { | ||
2309 | eng = b.eng; | ||
2310 | ls = b.ls; | ||
2311 | rs = b.rs; | ||
2312 | lanchors = b.lanchors; | ||
2313 | ranchors = b.ranchors; | ||
2314 | skipanchors = b.skipanchors; | ||
2315 | #ifndef QT_NO_REGEXP_OPTIM | ||
2316 | earlyStart = b.earlyStart; | ||
2317 | lateStart = b.lateStart; | ||
2318 | str = b.str; | ||
2319 | leftStr = b.leftStr; | ||
2320 | rightStr = b.rightStr; | ||
2321 | maxl = b.maxl; | ||
2322 | occ1 = b.occ1; | ||
2323 | #endif | ||
2324 | minl = b.minl; | ||
2325 | return *this; | ||
2326 | } | ||
2327 | |||
2328 | void QRegExpEngine::Box::set( QChar ch ) | ||
2329 | { | ||
2330 | ls.resize( 1 ); | ||
2331 | ls[0] = eng->createState( ch ); | ||
2332 | rs = ls; | ||
2333 | rs.detach(); | ||
2334 | #ifndef QT_NO_REGEXP_OPTIM | ||
2335 | str = ch; | ||
2336 | leftStr = ch; | ||
2337 | rightStr = ch; | ||
2338 | maxl = 1; | ||
2339 | occ1.detach(); | ||
2340 | occ1[BadChar(ch)] = 0; | ||
2341 | #endif | ||
2342 | minl = 1; | ||
2343 | } | ||
2344 | |||
2345 | void QRegExpEngine::Box::set( const CharClass& cc ) | ||
2346 | { | ||
2347 | ls.resize( 1 ); | ||
2348 | ls[0] = eng->createState( cc ); | ||
2349 | rs = ls; | ||
2350 | rs.detach(); | ||
2351 | #ifndef QT_NO_REGEXP_OPTIM | ||
2352 | maxl = 1; | ||
2353 | occ1 = cc.firstOccurrence(); | ||
2354 | #endif | ||
2355 | minl = 1; | ||
2356 | } | ||
2357 | |||
2358 | #ifndef QT_NO_REGEXP_BACKREF | ||
2359 | void QRegExpEngine::Box::set( int bref ) | ||
2360 | { | ||
2361 | ls.resize( 1 ); | ||
2362 | ls[0] = eng->createState( bref ); | ||
2363 | rs = ls; | ||
2364 | rs.detach(); | ||
2365 | if ( bref >= 1 && bref <= MaxBackRefs ) | ||
2366 | skipanchors = Anchor_BackRef0Empty << bref; | ||
2367 | #ifndef QT_NO_REGEXP_OPTIM | ||
2368 | maxl = InftyLen; | ||
2369 | #endif | ||
2370 | minl = 0; | ||
2371 | } | ||
2372 | #endif | ||
2373 | |||
2374 | void QRegExpEngine::Box::cat( const Box& b ) | ||
2375 | { | ||
2376 | eng->addCatTransitions( rs, b.ls ); | ||
2377 | addAnchorsToEngine( b ); | ||
2378 | if ( minl == 0 ) { | ||
2379 | mergeInto( &lanchors, b.lanchors ); | ||
2380 | if ( skipanchors != 0 ) { | ||
2381 | for ( int i = 0; i < (int) b.ls.size(); i++ ) { | ||
2382 | int a = eng->anchorConcatenation( at(lanchors, b.ls[i]), | ||
2383 | skipanchors ); | ||
2384 | lanchors.insert( b.ls[i], a ); | ||
2385 | } | ||
2386 | } | ||
2387 | mergeInto( &ls, b.ls ); | ||
2388 | } | ||
2389 | if ( b.minl == 0 ) { | ||
2390 | mergeInto( &ranchors, b.ranchors ); | ||
2391 | if ( b.skipanchors != 0 ) { | ||
2392 | for ( int i = 0; i < (int) rs.size(); i++ ) { | ||
2393 | int a = eng->anchorConcatenation( at(ranchors, rs[i]), | ||
2394 | b.skipanchors ); | ||
2395 | ranchors.insert( rs[i], a ); | ||
2396 | } | ||
2397 | } | ||
2398 | mergeInto( &rs, b.rs ); | ||
2399 | } else { | ||
2400 | ranchors = b.ranchors; | ||
2401 | rs = b.rs; | ||
2402 | } | ||
2403 | |||
2404 | #ifndef QT_NO_REGEXP_OPTIM | ||
2405 | if ( maxl != InftyLen ) { | ||
2406 | if ( rightStr.length() + b.leftStr.length() > | ||
2407 | QMAX(str.length(), b.str.length()) ) { | ||
2408 | earlyStart = minl - rightStr.length(); | ||
2409 | lateStart = maxl - rightStr.length(); | ||
2410 | str = rightStr + b.leftStr; | ||
2411 | } else if ( b.str.length() > str.length() ) { | ||
2412 | earlyStart = minl + b.earlyStart; | ||
2413 | lateStart = maxl + b.lateStart; | ||
2414 | str = b.str; | ||
2415 | } | ||
2416 | } | ||
2417 | |||
2418 | if ( (int) leftStr.length() == maxl ) | ||
2419 | leftStr += b.leftStr; | ||
2420 | if ( (int) b.rightStr.length() == b.maxl ) | ||
2421 | rightStr += b.rightStr; | ||
2422 | else | ||
2423 | rightStr = b.rightStr; | ||
2424 | |||
2425 | if ( maxl == InftyLen || b.maxl == InftyLen ) | ||
2426 | maxl = InftyLen; | ||
2427 | else | ||
2428 | maxl += b.maxl; | ||
2429 | |||
2430 | occ1.detach(); | ||
2431 | for ( int i = 0; i < NumBadChars; i++ ) { | ||
2432 | if ( b.occ1[i] != NoOccurrence && minl + b.occ1[i] < occ1[i] ) | ||
2433 | occ1[i] = minl + b.occ1[i]; | ||
2434 | } | ||
2435 | #endif | ||
2436 | |||
2437 | minl += b.minl; | ||
2438 | if ( minl == 0 ) | ||
2439 | skipanchors = eng->anchorConcatenation( skipanchors, b.skipanchors ); | ||
2440 | else | ||
2441 | skipanchors = 0; | ||
2442 | } | ||
2443 | |||
2444 | void QRegExpEngine::Box::orx( const Box& b ) | ||
2445 | { | ||
2446 | mergeInto( &ls, b.ls ); | ||
2447 | mergeInto( &lanchors, b.lanchors ); | ||
2448 | mergeInto( &rs, b.rs ); | ||
2449 | mergeInto( &ranchors, b.ranchors ); | ||
2450 | |||
2451 | if ( b.minl == 0 ) { | ||
2452 | if ( minl == 0 ) | ||
2453 | skipanchors = eng->anchorAlternation( skipanchors, b.skipanchors ); | ||
2454 | else | ||
2455 | skipanchors = b.skipanchors; | ||
2456 | } | ||
2457 | |||
2458 | #ifndef QT_NO_REGEXP_OPTIM | ||
2459 | occ1.detach(); | ||
2460 | for ( int i = 0; i < NumBadChars; i++ ) { | ||
2461 | if ( occ1[i] > b.occ1[i] ) | ||
2462 | occ1[i] = b.occ1[i]; | ||
2463 | } | ||
2464 | earlyStart = 0; | ||
2465 | lateStart = 0; | ||
2466 | str = QString(); | ||
2467 | leftStr = QString(); | ||
2468 | rightStr = QString(); | ||
2469 | if ( b.maxl > maxl ) | ||
2470 | maxl = b.maxl; | ||
2471 | #endif | ||
2472 | if ( b.minl < minl ) | ||
2473 | minl = b.minl; | ||
2474 | } | ||
2475 | |||
2476 | void QRegExpEngine::Box::plus( int atom ) | ||
2477 | { | ||
2478 | #ifndef QT_NO_REGEXP_CAPTURE | ||
2479 | eng->addPlusTransitions( rs, ls, atom ); | ||
2480 | #else | ||
2481 | Q_UNUSED( atom ); | ||
2482 | eng->addCatTransitions( rs, ls ); | ||
2483 | #endif | ||
2484 | addAnchorsToEngine( *this ); | ||
2485 | #ifndef QT_NO_REGEXP_OPTIM | ||
2486 | maxl = InftyLen; | ||
2487 | #endif | ||
2488 | } | ||
2489 | |||
2490 | void QRegExpEngine::Box::opt() | ||
2491 | { | ||
2492 | #ifndef QT_NO_REGEXP_OPTIM | ||
2493 | earlyStart = 0; | ||
2494 | lateStart = 0; | ||
2495 | str = QString(); | ||
2496 | leftStr = QString(); | ||
2497 | rightStr = QString(); | ||
2498 | #endif | ||
2499 | skipanchors = 0; | ||
2500 | minl = 0; | ||
2501 | } | ||
2502 | |||
2503 | void QRegExpEngine::Box::catAnchor( int a ) | ||
2504 | { | ||
2505 | if ( a != 0 ) { | ||
2506 | for ( int i = 0; i < (int) rs.size(); i++ ) { | ||
2507 | a = eng->anchorConcatenation( at(ranchors, rs[i]), a ); | ||
2508 | ranchors.insert( rs[i], a ); | ||
2509 | } | ||
2510 | if ( minl == 0 ) | ||
2511 | skipanchors = eng->anchorConcatenation( skipanchors, a ); | ||
2512 | } | ||
2513 | } | ||
2514 | |||
2515 | #ifndef QT_NO_REGEXP_OPTIM | ||
2516 | void QRegExpEngine::Box::setupHeuristics() | ||
2517 | { | ||
2518 | eng->setupGoodStringHeuristic( earlyStart, lateStart, str ); | ||
2519 | |||
2520 | /* | ||
2521 | A regular expression such as 112|1 has occ1['2'] = 2 and minl = | ||
2522 | 1 at this point. An entry of occ1 has to be at most minl or | ||
2523 | infinity for the rest of the algorithm to go well. | ||
2524 | |||
2525 | We waited until here before normalizing these cases (instead of | ||
2526 | doing it in Box::orx()) because sometimes things improve by | ||
2527 | themselves. Consider for example (112|1)34. | ||
2528 | */ | ||
2529 | for ( int i = 0; i < NumBadChars; i++ ) { | ||
2530 | if ( occ1[i] != NoOccurrence && occ1[i] >= minl ) | ||
2531 | occ1[i] = minl; | ||
2532 | } | ||
2533 | eng->setupBadCharHeuristic( minl, occ1 ); | ||
2534 | |||
2535 | eng->heuristicallyChooseHeuristic(); | ||
2536 | } | ||
2537 | #endif | ||
2538 | |||
2539 | #if defined(QT_DEBUG) | ||
2540 | void QRegExpEngine::Box::dump() const | ||
2541 | { | ||
2542 | int i; | ||
2543 | qDebug( "Box of at least %d character%s", minl, minl == 1 ? "" : "s" ); | ||
2544 | qDebug( " Left states:" ); | ||
2545 | for ( i = 0; i < (int) ls.size(); i++ ) { | ||
2546 | if ( at(lanchors, ls[i]) == 0 ) | ||
2547 | qDebug( " %d", ls[i] ); | ||
2548 | else | ||
2549 | qDebug( " %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]] ); | ||
2550 | } | ||
2551 | qDebug( " Right states:" ); | ||
2552 | for ( i = 0; i < (int) rs.size(); i++ ) { | ||
2553 | if ( at(ranchors, rs[i]) == 0 ) | ||
2554 | qDebug( " %d", rs[i] ); | ||
2555 | else | ||
2556 | qDebug( " %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]] ); | ||
2557 | } | ||
2558 | qDebug( " Skip anchors: 0x%.8x", skipanchors ); | ||
2559 | } | ||
2560 | #endif | ||
2561 | |||
2562 | void QRegExpEngine::Box::addAnchorsToEngine( const Box& to ) const | ||
2563 | { | ||
2564 | for ( int i = 0; i < (int) to.ls.size(); i++ ) { | ||
2565 | for ( int j = 0; j < (int) rs.size(); j++ ) { | ||
2566 | int a = eng->anchorConcatenation( at(ranchors, rs[j]), | ||
2567 | at(to.lanchors, to.ls[i]) ); | ||
2568 | eng->addAnchors( rs[j], to.ls[i], a ); | ||
2569 | } | ||
2570 | } | ||
2571 | } | ||
2572 | |||
2573 | int QRegExpEngine::getChar() | ||
2574 | { | ||
2575 | return ( yyPos == yyLen ) ? EOS : yyIn[yyPos++].unicode(); | ||
2576 | } | ||
2577 | |||
2578 | int QRegExpEngine::getEscape() | ||
2579 | { | ||
2580 | #ifndef QT_NO_REGEXP_ESCAPE | ||
2581 | const char tab[] = "afnrtv"; // no b, as \b means word boundary | ||
2582 | const char backTab[] = "\a\f\n\r\t\v"; | ||
2583 | ushort low; | ||
2584 | int i; | ||
2585 | #endif | ||
2586 | ushort val; | ||
2587 | int prevCh = yyCh; | ||
2588 | |||
2589 | if ( prevCh == EOS ) { | ||
2590 | error( RXERR_END ); | ||
2591 | return Tok_Char | '\\'; | ||
2592 | } | ||
2593 | yyCh = getChar(); | ||
2594 | #ifndef QT_NO_REGEXP_ESCAPE | ||
2595 | if ( (prevCh & ~0xff) == 0 ) { | ||
2596 | const char *p = strchr( tab, prevCh ); | ||
2597 | if ( p != 0 ) | ||
2598 | return Tok_Char | backTab[p - tab]; | ||
2599 | } | ||
2600 | #endif | ||
2601 | |||
2602 | switch ( prevCh ) { | ||
2603 | #ifndef QT_NO_REGEXP_ESCAPE | ||
2604 | case '0': | ||
2605 | val = 0; | ||
2606 | for ( i = 0; i < 3; i++ ) { | ||
2607 | if ( yyCh >= '0' && yyCh <= '7' ) | ||
2608 | val = ( val << 3 ) | ( yyCh - '0' ); | ||
2609 | else | ||
2610 | break; | ||
2611 | yyCh = getChar(); | ||
2612 | } | ||
2613 | if ( (val & ~0377) != 0 ) | ||
2614 | error( RXERR_OCTAL ); | ||
2615 | return Tok_Char | val; | ||
2616 | #endif | ||
2617 | #ifndef QT_NO_REGEXP_ESCAPE | ||
2618 | case 'B': | ||
2619 | return Tok_NonWord; | ||
2620 | #endif | ||
2621 | #ifndef QT_NO_REGEXP_CCLASS | ||
2622 | case 'D': | ||
2623 | // see QChar::isDigit() | ||
2624 | yyCharClass->addCategories( 0x7fffffef ); | ||
2625 | return Tok_CharClass; | ||
2626 | case 'S': | ||
2627 | // see QChar::isSpace() | ||
2628 | yyCharClass->addCategories( 0x7ffff87f ); | ||
2629 | yyCharClass->addRange( 0x0000, 0x0008 ); | ||
2630 | yyCharClass->addRange( 0x000e, 0x001f ); | ||
2631 | yyCharClass->addRange( 0x007f, 0x009f ); | ||
2632 | return Tok_CharClass; | ||
2633 | case 'W': | ||
2634 | // see QChar::isLetterOrNumber() | ||
2635 | yyCharClass->addCategories( 0x7ff07f8f ); | ||
2636 | return Tok_CharClass; | ||
2637 | #endif | ||
2638 | #ifndef QT_NO_REGEXP_ESCAPE | ||
2639 | case 'b': | ||
2640 | return Tok_Word; | ||
2641 | #endif | ||
2642 | #ifndef QT_NO_REGEXP_CCLASS | ||
2643 | case 'd': | ||
2644 | // see QChar::isDigit() | ||
2645 | yyCharClass->addCategories( 0x00000010 ); | ||
2646 | return Tok_CharClass; | ||
2647 | case 's': | ||
2648 | // see QChar::isSpace() | ||
2649 | yyCharClass->addCategories( 0x00000380 ); | ||
2650 | yyCharClass->addRange( 0x0009, 0x000d ); | ||
2651 | return Tok_CharClass; | ||
2652 | case 'w': | ||
2653 | // see QChar::isLetterOrNumber() | ||
2654 | yyCharClass->addCategories( 0x000f8070 ); | ||
2655 | return Tok_CharClass; | ||
2656 | #endif | ||
2657 | #ifndef QT_NO_REGEXP_ESCAPE | ||
2658 | case 'x': | ||
2659 | val = 0; | ||
2660 | for ( i = 0; i < 4; i++ ) { | ||
2661 | low = QChar( yyCh ).lower(); | ||
2662 | if ( low >= '0' && low <= '9' ) | ||
2663 | val = ( val << 4 ) | ( low - '0' ); | ||
2664 | else if ( low >= 'a' && low <= 'f' ) | ||
2665 | val = ( val << 4 ) | ( low - 'a' + 10 ); | ||
2666 | else | ||
2667 | break; | ||
2668 | yyCh = getChar(); | ||
2669 | } | ||
2670 | return Tok_Char | val; | ||
2671 | #endif | ||
2672 | default: | ||
2673 | if ( prevCh >= '1' && prevCh <= '9' ) { | ||
2674 | #ifndef QT_NO_REGEXP_BACKREF | ||
2675 | val = prevCh - '0'; | ||
2676 | while ( yyCh >= '0' && yyCh <= '9' ) { | ||
2677 | val = ( val *= 10 ) | ( yyCh - '0' ); | ||
2678 | yyCh = getChar(); | ||
2679 | } | ||
2680 | return Tok_BackRef | val; | ||
2681 | #else | ||
2682 | error( RXERR_DISABLED ); | ||
2683 | #endif | ||
2684 | } | ||
2685 | return Tok_Char | prevCh; | ||
2686 | } | ||
2687 | } | ||
2688 | |||
2689 | #ifndef QT_NO_REGEXP_INTERVAL | ||
2690 | int QRegExpEngine::getRep( int def ) | ||
2691 | { | ||
2692 | if ( yyCh >= '0' && yyCh <= '9' ) { | ||
2693 | int rep = 0; | ||
2694 | do { | ||
2695 | rep = 10 * rep + yyCh - '0'; | ||
2696 | if ( rep >= InftyRep ) { | ||
2697 | error( RXERR_REPETITION ); | ||
2698 | rep = def; | ||
2699 | } | ||
2700 | yyCh = getChar(); | ||
2701 | } while ( yyCh >= '0' && yyCh <= '9' ); | ||
2702 | return rep; | ||
2703 | } else { | ||
2704 | return def; | ||
2705 | } | ||
2706 | } | ||
2707 | #endif | ||
2708 | |||
2709 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
2710 | void QRegExpEngine::skipChars( int n ) | ||
2711 | { | ||
2712 | if ( n > 0 ) { | ||
2713 | yyPos += n - 1; | ||
2714 | yyCh = getChar(); | ||
2715 | } | ||
2716 | } | ||
2717 | #endif | ||
2718 | |||
2719 | void QRegExpEngine::error( const char *msg ) | ||
2720 | { | ||
2721 | if ( yyError.isEmpty() ) | ||
2722 | yyError = QString::fromLatin1( msg ); | ||
2723 | } | ||
2724 | |||
2725 | void QRegExpEngine::startTokenizer( const QChar *rx, int len ) | ||
2726 | { | ||
2727 | yyIn = rx; | ||
2728 | yyPos0 = 0; | ||
2729 | yyPos = 0; | ||
2730 | yyLen = len; | ||
2731 | yyCh = getChar(); | ||
2732 | yyCharClass = new CharClass; | ||
2733 | yyMinRep = 0; | ||
2734 | yyMaxRep = 0; | ||
2735 | yyError = QString(); | ||
2736 | } | ||
2737 | |||
2738 | int QRegExpEngine::getToken() | ||
2739 | { | ||
2740 | #ifndef QT_NO_REGEXP_CCLASS | ||
2741 | ushort pendingCh = 0; | ||
2742 | bool charPending; | ||
2743 | bool rangePending; | ||
2744 | int tok; | ||
2745 | #endif | ||
2746 | int prevCh = yyCh; | ||
2747 | |||
2748 | yyPos0 = yyPos - 1; | ||
2749 | #ifndef QT_NO_REGEXP_CCLASS | ||
2750 | yyCharClass->clear(); | ||
2751 | #endif | ||
2752 | yyMinRep = 0; | ||
2753 | yyMaxRep = 0; | ||
2754 | yyCh = getChar(); | ||
2755 | switch ( prevCh ) { | ||
2756 | case EOS: | ||
2757 | yyPos0 = yyPos; | ||
2758 | return Tok_Eos; | ||
2759 | case '$': | ||
2760 | return Tok_Dollar; | ||
2761 | case '(': | ||
2762 | if ( yyCh == '?' ) { | ||
2763 | prevCh = getChar(); | ||
2764 | yyCh = getChar(); | ||
2765 | switch ( prevCh ) { | ||
2766 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
2767 | case '!': | ||
2768 | return Tok_NegLookahead; | ||
2769 | case '=': | ||
2770 | return Tok_PosLookahead; | ||
2771 | #endif | ||
2772 | case ':': | ||
2773 | return Tok_MagicLeftParen; | ||
2774 | default: | ||
2775 | error( RXERR_LOOKAHEAD ); | ||
2776 | return Tok_MagicLeftParen; | ||
2777 | } | ||
2778 | } else { | ||
2779 | return Tok_LeftParen; | ||
2780 | } | ||
2781 | case ')': | ||
2782 | return Tok_RightParen; | ||
2783 | case '*': | ||
2784 | yyMinRep = 0; | ||
2785 | yyMaxRep = InftyRep; | ||
2786 | return Tok_Quantifier; | ||
2787 | case '+': | ||
2788 | yyMinRep = 1; | ||
2789 | yyMaxRep = InftyRep; | ||
2790 | return Tok_Quantifier; | ||
2791 | case '.': | ||
2792 | #ifndef QT_NO_REGEXP_CCLASS | ||
2793 | yyCharClass->setNegative( TRUE ); | ||
2794 | #endif | ||
2795 | return Tok_CharClass; | ||
2796 | case '?': | ||
2797 | yyMinRep = 0; | ||
2798 | yyMaxRep = 1; | ||
2799 | return Tok_Quantifier; | ||
2800 | case '[': | ||
2801 | #ifndef QT_NO_REGEXP_CCLASS | ||
2802 | if ( yyCh == '^' ) { | ||
2803 | yyCharClass->setNegative( TRUE ); | ||
2804 | yyCh = getChar(); | ||
2805 | } | ||
2806 | charPending = FALSE; | ||
2807 | rangePending = FALSE; | ||
2808 | do { | ||
2809 | if ( yyCh == '-' && charPending && !rangePending ) { | ||
2810 | rangePending = TRUE; | ||
2811 | yyCh = getChar(); | ||
2812 | } else { | ||
2813 | if ( charPending && !rangePending ) { | ||
2814 | yyCharClass->addSingleton( pendingCh ); | ||
2815 | charPending = FALSE; | ||
2816 | } | ||
2817 | if ( yyCh == '\\' ) { | ||
2818 | yyCh = getChar(); | ||
2819 | tok = getEscape(); | ||
2820 | if ( tok == Tok_Word ) | ||
2821 | tok = '\b'; | ||
2822 | } else { | ||
2823 | tok = Tok_Char | yyCh; | ||
2824 | yyCh = getChar(); | ||
2825 | } | ||
2826 | if ( tok == Tok_CharClass ) { | ||
2827 | if ( rangePending ) { | ||
2828 | yyCharClass->addSingleton( '-' ); | ||
2829 | yyCharClass->addSingleton( pendingCh ); | ||
2830 | charPending = FALSE; | ||
2831 | rangePending = FALSE; | ||
2832 | } | ||
2833 | } else if ( (tok & Tok_Char) != 0 ) { | ||
2834 | if ( rangePending ) { | ||
2835 | yyCharClass->addRange( pendingCh, tok ^ Tok_Char ); | ||
2836 | charPending = FALSE; | ||
2837 | rangePending = FALSE; | ||
2838 | } else { | ||
2839 | pendingCh = tok ^ Tok_Char; | ||
2840 | charPending = TRUE; | ||
2841 | } | ||
2842 | } else { | ||
2843 | error( RXERR_CHARCLASS ); | ||
2844 | } | ||
2845 | } | ||
2846 | } while ( yyCh != ']' && yyCh != EOS ); | ||
2847 | if ( rangePending ) | ||
2848 | yyCharClass->addSingleton( '-' ); | ||
2849 | if ( charPending ) | ||
2850 | yyCharClass->addSingleton( pendingCh ); | ||
2851 | if ( yyCh == EOS ) | ||
2852 | error( RXERR_END ); | ||
2853 | else | ||
2854 | yyCh = getChar(); | ||
2855 | return Tok_CharClass; | ||
2856 | #else | ||
2857 | error( RXERR_END ); | ||
2858 | return Tok_Char | '['; | ||
2859 | #endif | ||
2860 | case '\\': | ||
2861 | return getEscape(); | ||
2862 | case ']': | ||
2863 | error( RXERR_LEFTDELIM ); | ||
2864 | return Tok_Char | ']'; | ||
2865 | case '^': | ||
2866 | return Tok_Caret; | ||
2867 | case '{': | ||
2868 | #ifndef QT_NO_REGEXP_INTERVAL | ||
2869 | yyMinRep = getRep( 0 ); | ||
2870 | yyMaxRep = yyMinRep; | ||
2871 | if ( yyCh == ',' ) { | ||
2872 | yyCh = getChar(); | ||
2873 | yyMaxRep = getRep( InftyRep ); | ||
2874 | } | ||
2875 | if ( yyMaxRep < yyMinRep ) | ||
2876 | qSwap( yyMinRep, yyMaxRep ); | ||
2877 | if ( yyCh != '}' ) | ||
2878 | error( RXERR_REPETITION ); | ||
2879 | yyCh = getChar(); | ||
2880 | return Tok_Quantifier; | ||
2881 | #else | ||
2882 | error( RXERR_DISABLED ); | ||
2883 | return Tok_Char | '{'; | ||
2884 | #endif | ||
2885 | case '|': | ||
2886 | return Tok_Bar; | ||
2887 | case '}': | ||
2888 | error( RXERR_LEFTDELIM ); | ||
2889 | return Tok_Char | '}'; | ||
2890 | default: | ||
2891 | return Tok_Char | prevCh; | ||
2892 | } | ||
2893 | } | ||
2894 | |||
2895 | int QRegExpEngine::parse( const QChar *pattern, int len ) | ||
2896 | { | ||
2897 | valid = TRUE; | ||
2898 | startTokenizer( pattern, len ); | ||
2899 | yyTok = getToken(); | ||
2900 | #ifndef QT_NO_REGEXP_CAPTURE | ||
2901 | yyMayCapture = TRUE; | ||
2902 | #else | ||
2903 | yyMayCapture = FALSE; | ||
2904 | #endif | ||
2905 | |||
2906 | #ifndef QT_NO_REGEXP_CAPTURE | ||
2907 | int atom = startAtom( FALSE ); | ||
2908 | #endif | ||
2909 | CharClass anything; | ||
2910 | Box box( this ); // create InitialState | ||
2911 | box.set( anything ); | ||
2912 | Box rightBox( this ); // create FinalState | ||
2913 | rightBox.set( anything ); | ||
2914 | |||
2915 | Box middleBox( this ); | ||
2916 | parseExpression( &middleBox ); | ||
2917 | #ifndef QT_NO_REGEXP_CAPTURE | ||
2918 | finishAtom( atom ); | ||
2919 | #endif | ||
2920 | #ifndef QT_NO_REGEXP_OPTIM | ||
2921 | middleBox.setupHeuristics(); | ||
2922 | #endif | ||
2923 | box.cat( middleBox ); | ||
2924 | box.cat( rightBox ); | ||
2925 | delete yyCharClass; | ||
2926 | yyCharClass = 0; | ||
2927 | |||
2928 | officialncap = ncap; | ||
2929 | #ifndef QT_NO_REGEXP_BACKREF | ||
2930 | if ( nbrefs > ncap ) | ||
2931 | ncap = nbrefs; | ||
2932 | #endif | ||
2933 | |||
2934 | mmCaptured.resize( 2 + 2 * officialncap ); | ||
2935 | mmCapturedNoMatch.fill( -1, 2 + 2 * officialncap ); | ||
2936 | |||
2937 | /* | ||
2938 | We use one QMemArray<int> for all the big data used a lot in | ||
2939 | matchHere() and friends. | ||
2940 | */ | ||
2941 | #ifndef QT_NO_REGEXP_OPTIM | ||
2942 | mmSlideTabSize = QMAX( minl + 1, 16 ); | ||
2943 | #else | ||
2944 | mmSlideTabSize = 0; | ||
2945 | #endif | ||
2946 | mmBigArray.resize( (3 + 4 * ncap) * ns + 4 * ncap + mmSlideTabSize ); | ||
2947 | |||
2948 | mmInNextStack = mmBigArray.data(); | ||
2949 | memset( mmInNextStack, -1, ns * sizeof(int) ); | ||
2950 | mmCurStack = mmInNextStack + ns; | ||
2951 | mmNextStack = mmInNextStack + 2 * ns; | ||
2952 | |||
2953 | mmCurCapBegin = mmInNextStack + 3 * ns; | ||
2954 | mmNextCapBegin = mmCurCapBegin + ncap * ns; | ||
2955 | mmCurCapEnd = mmCurCapBegin + 2 * ncap * ns; | ||
2956 | mmNextCapEnd = mmCurCapBegin + 3 * ncap * ns; | ||
2957 | |||
2958 | mmTempCapBegin = mmCurCapBegin + 4 * ncap * ns; | ||
2959 | mmTempCapEnd = mmTempCapBegin + ncap; | ||
2960 | mmCapBegin = mmTempCapBegin + 2 * ncap; | ||
2961 | mmCapEnd = mmTempCapBegin + 3 * ncap; | ||
2962 | |||
2963 | mmSlideTab = mmTempCapBegin + 4 * ncap; | ||
2964 | |||
2965 | if ( !yyError.isEmpty() ) | ||
2966 | return -1; | ||
2967 | |||
2968 | #ifndef QT_NO_REGEXP_OPTIM | ||
2969 | State *sinit = s[InitialState]; | ||
2970 | caretAnchored = ( sinit->anchors != 0 ); | ||
2971 | if ( caretAnchored ) { | ||
2972 | QMap<int, int>& anchors = *sinit->anchors; | ||
2973 | QMap<int, int>::ConstIterator a; | ||
2974 | for ( a = anchors.begin(); a != anchors.end(); ++a ) { | ||
2975 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | ||
2976 | if ( (*a & Anchor_Alternation) != 0 ) | ||
2977 | break; | ||
2978 | #endif | ||
2979 | if ( (*a & Anchor_Caret) == 0 ) { | ||
2980 | caretAnchored = FALSE; | ||
2981 | break; | ||
2982 | } | ||
2983 | } | ||
2984 | } | ||
2985 | #endif | ||
2986 | return yyPos0; | ||
2987 | } | ||
2988 | |||
2989 | void QRegExpEngine::parseAtom( Box *box ) | ||
2990 | { | ||
2991 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
2992 | QRegExpEngine *eng = 0; | ||
2993 | bool neg; | ||
2994 | int len; | ||
2995 | #endif | ||
2996 | |||
2997 | switch ( yyTok ) { | ||
2998 | case Tok_Dollar: | ||
2999 | box->catAnchor( Anchor_Dollar ); | ||
3000 | break; | ||
3001 | case Tok_Caret: | ||
3002 | box->catAnchor( Anchor_Caret ); | ||
3003 | break; | ||
3004 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
3005 | case Tok_PosLookahead: | ||
3006 | case Tok_NegLookahead: | ||
3007 | neg = ( yyTok == Tok_NegLookahead ); | ||
3008 | eng = new QRegExpEngine( cs ); | ||
3009 | len = eng->parse( yyIn + yyPos - 1, yyLen - yyPos + 1 ); | ||
3010 | if ( len >= 0 ) | ||
3011 | skipChars( len ); | ||
3012 | else | ||
3013 | error( RXERR_LOOKAHEAD ); | ||
3014 | box->catAnchor( addLookahead(eng, neg) ); | ||
3015 | yyTok = getToken(); | ||
3016 | if ( yyTok != Tok_RightParen ) | ||
3017 | error( RXERR_LOOKAHEAD ); | ||
3018 | break; | ||
3019 | #endif | ||
3020 | #ifndef QT_NO_REGEXP_ESCAPE | ||
3021 | case Tok_Word: | ||
3022 | box->catAnchor( Anchor_Word ); | ||
3023 | break; | ||
3024 | case Tok_NonWord: | ||
3025 | box->catAnchor( Anchor_NonWord ); | ||
3026 | break; | ||
3027 | #endif | ||
3028 | case Tok_LeftParen: | ||
3029 | case Tok_MagicLeftParen: | ||
3030 | yyTok = getToken(); | ||
3031 | parseExpression( box ); | ||
3032 | if ( yyTok != Tok_RightParen ) | ||
3033 | error( RXERR_END ); | ||
3034 | break; | ||
3035 | case Tok_CharClass: | ||
3036 | box->set( *yyCharClass ); | ||
3037 | break; | ||
3038 | case Tok_Quantifier: | ||
3039 | error( RXERR_REPETITION ); | ||
3040 | break; | ||
3041 | default: | ||
3042 | if ( (yyTok & Tok_Char) != 0 ) | ||
3043 | box->set( QChar(yyTok ^ Tok_Char) ); | ||
3044 | #ifndef QT_NO_REGEXP_BACKREF | ||
3045 | else if ( (yyTok & Tok_BackRef) != 0 ) | ||
3046 | box->set( yyTok ^ Tok_BackRef ); | ||
3047 | #endif | ||
3048 | else | ||
3049 | error( RXERR_DISABLED ); | ||
3050 | } | ||
3051 | yyTok = getToken(); | ||
3052 | } | ||
3053 | |||
3054 | void QRegExpEngine::parseFactor( Box *box ) | ||
3055 | { | ||
3056 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3057 | int atom = startAtom( yyMayCapture && yyTok == Tok_LeftParen ); | ||
3058 | #else | ||
3059 | static const int atom = 0; | ||
3060 | #endif | ||
3061 | |||
3062 | #ifndef QT_NO_REGEXP_INTERVAL | ||
3063 | #define YYREDO() \ | ||
3064 | yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \ | ||
3065 | *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok | ||
3066 | |||
3067 | const QChar *in = yyIn; | ||
3068 | int pos0 = yyPos0; | ||
3069 | int pos = yyPos; | ||
3070 | int len = yyLen; | ||
3071 | int ch = yyCh; | ||
3072 | CharClass charClass; | ||
3073 | if ( yyTok == Tok_CharClass ) | ||
3074 | charClass = *yyCharClass; | ||
3075 | int tok = yyTok; | ||
3076 | bool mayCapture = yyMayCapture; | ||
3077 | #endif | ||
3078 | |||
3079 | parseAtom( box ); | ||
3080 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3081 | finishAtom( atom ); | ||
3082 | #endif | ||
3083 | |||
3084 | if ( yyTok == Tok_Quantifier ) { | ||
3085 | if ( yyMaxRep == InftyRep ) { | ||
3086 | box->plus( atom ); | ||
3087 | #ifndef QT_NO_REGEXP_INTERVAL | ||
3088 | } else if ( yyMaxRep == 0 ) { | ||
3089 | box->clear(); | ||
3090 | #endif | ||
3091 | } | ||
3092 | if ( yyMinRep == 0 ) | ||
3093 | box->opt(); | ||
3094 | |||
3095 | #ifndef QT_NO_REGEXP_INTERVAL | ||
3096 | yyMayCapture = FALSE; | ||
3097 | int alpha = ( yyMinRep == 0 ) ? 0 : yyMinRep - 1; | ||
3098 | int beta = ( yyMaxRep == InftyRep ) ? 0 : yyMaxRep - ( alpha + 1 ); | ||
3099 | |||
3100 | Box rightBox( this ); | ||
3101 | int i; | ||
3102 | |||
3103 | for ( i = 0; i < beta; i++ ) { | ||
3104 | YYREDO(); | ||
3105 | Box leftBox( this ); | ||
3106 | parseAtom( &leftBox ); | ||
3107 | leftBox.cat( rightBox ); | ||
3108 | leftBox.opt(); | ||
3109 | rightBox = leftBox; | ||
3110 | } | ||
3111 | for ( i = 0; i < alpha; i++ ) { | ||
3112 | YYREDO(); | ||
3113 | Box leftBox( this ); | ||
3114 | parseAtom( &leftBox ); | ||
3115 | leftBox.cat( rightBox ); | ||
3116 | rightBox = leftBox; | ||
3117 | } | ||
3118 | rightBox.cat( *box ); | ||
3119 | *box = rightBox; | ||
3120 | #endif | ||
3121 | yyTok = getToken(); | ||
3122 | #ifndef QT_NO_REGEXP_INTERVAL | ||
3123 | yyMayCapture = mayCapture; | ||
3124 | #endif | ||
3125 | } | ||
3126 | #undef YYREDO | ||
3127 | } | ||
3128 | |||
3129 | void QRegExpEngine::parseTerm( Box *box ) | ||
3130 | { | ||
3131 | #ifndef QT_NO_REGEXP_OPTIM | ||
3132 | if ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) | ||
3133 | parseFactor( box ); | ||
3134 | #endif | ||
3135 | while ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) { | ||
3136 | Box rightBox( this ); | ||
3137 | parseFactor( &rightBox ); | ||
3138 | box->cat( rightBox ); | ||
3139 | } | ||
3140 | } | ||
3141 | |||
3142 | void QRegExpEngine::parseExpression( Box *box ) | ||
3143 | { | ||
3144 | parseTerm( box ); | ||
3145 | while ( yyTok == Tok_Bar ) { | ||
3146 | Box rightBox( this ); | ||
3147 | yyTok = getToken(); | ||
3148 | parseTerm( &rightBox ); | ||
3149 | box->orx( rightBox ); | ||
3150 | } | ||
3151 | } | ||
3152 | |||
3153 | /* | ||
3154 | The struct QRegExpPrivate contains the private data of a regular | ||
3155 | expression other than the automaton. It makes it possible for many | ||
3156 | QRegExp objects to use the same QRegExpEngine object with different | ||
3157 | QRegExpPrivate objects. | ||
3158 | */ | ||
3159 | struct QRegExpPrivate | ||
3160 | { | ||
3161 | QString pattern; // regular-expression or wildcard pattern | ||
3162 | QString rxpattern; // regular-expression pattern | ||
3163 | #ifndef QT_NO_REGEXP_WILDCARD | ||
3164 | bool wc; // wildcard mode? | ||
3165 | #endif | ||
3166 | bool min; // minimal matching? (instead of maximal) | ||
3167 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3168 | QString t; // last string passed to QRegExp::search() or searchRev() | ||
3169 | QStringList capturedCache; // what QRegExp::capturedTexts() returned last | ||
3170 | #endif | ||
3171 | QMemArray<int> captured; // what QRegExpEngine::search() returned last | ||
3172 | |||
3173 | QRegExpPrivate() { captured.fill( -1, 2 ); } | ||
3174 | }; | ||
3175 | |||
3176 | #ifndef QT_NO_REGEXP_OPTIM | ||
3177 | static QCache<QRegExpEngine> *engineCache = 0; | ||
3178 | static QSingleCleanupHandler<QCache<QRegExpEngine> > cleanup_cache; | ||
3179 | #endif | ||
3180 | |||
3181 | static QRegExpEngine *newEngine( const QString& pattern, bool caseSensitive ) | ||
3182 | { | ||
3183 | #ifndef QT_NO_REGEXP_OPTIM | ||
3184 | if ( engineCache != 0 ) { | ||
3185 | #ifdef QT_THREAD_SUPPORT | ||
3186 | QMutexLocker locker( qt_global_mutexpool->get( &engineCache ) ); | ||
3187 | #endif | ||
3188 | QRegExpEngine *eng = engineCache->take( pattern ); | ||
3189 | if ( eng == 0 || eng->caseSensitive() != caseSensitive ) { | ||
3190 | delete eng; | ||
3191 | } else { | ||
3192 | eng->ref(); | ||
3193 | return eng; | ||
3194 | } | ||
3195 | } | ||
3196 | #endif | ||
3197 | return new QRegExpEngine( pattern, caseSensitive ); | ||
3198 | } | ||
3199 | |||
3200 | static void derefEngine( QRegExpEngine *eng, const QString& pattern ) | ||
3201 | { | ||
3202 | if ( eng != 0 && eng->deref() ) { | ||
3203 | #ifndef QT_NO_REGEXP_OPTIM | ||
3204 | #ifdef QT_THREAD_SUPPORT | ||
3205 | QMutexLocker locker( qt_global_mutexpool->get( &engineCache ) ); | ||
3206 | #endif | ||
3207 | if ( engineCache == 0 ) { | ||
3208 | engineCache = new QCache<QRegExpEngine>; | ||
3209 | engineCache->setAutoDelete( TRUE ); | ||
3210 | cleanup_cache.set( &engineCache ); | ||
3211 | } | ||
3212 | if ( !pattern.isNull() && | ||
3213 | engineCache->insert(pattern, eng, 4 + pattern.length() / 4) ) | ||
3214 | return; | ||
3215 | #else | ||
3216 | Q_UNUSED( pattern ); | ||
3217 | #endif | ||
3218 | delete eng; | ||
3219 | } | ||
3220 | } | ||
3221 | |||
3222 | /*! | ||
3223 | \enum QRegExp::CaretMode | ||
3224 | |||
3225 | The CaretMode enum defines the different meanings of the caret | ||
3226 | (<b>^</b>) in a regular expression. The possible values are: | ||
3227 | |||
3228 | \value CaretAtZero | ||
3229 | The caret corresponds to index 0 in the searched string. | ||
3230 | |||
3231 | \value CaretAtOffset | ||
3232 | The caret corresponds to the start offset of the search. | ||
3233 | |||
3234 | \value CaretWontMatch | ||
3235 | The caret never matches. | ||
3236 | */ | ||
3237 | |||
3238 | /*! | ||
3239 | Constructs an empty regexp. | ||
3240 | |||
3241 | \sa isValid() errorString() | ||
3242 | */ | ||
3243 | QRegExp::QRegExp() | ||
3244 | { | ||
3245 | eng = new QRegExpEngine( TRUE ); | ||
3246 | priv = new QRegExpPrivate; | ||
3247 | #ifndef QT_NO_REGEXP_WILDCARD | ||
3248 | priv->wc = FALSE; | ||
3249 | #endif | ||
3250 | priv->min = FALSE; | ||
3251 | compile( TRUE ); | ||
3252 | } | ||
3253 | |||
3254 | /*! | ||
3255 | Constructs a regular expression object for the given \a pattern | ||
3256 | string. The pattern must be given using wildcard notation if \a | ||
3257 | wildcard is TRUE (default is FALSE). The pattern is case | ||
3258 | sensitive, unless \a caseSensitive is FALSE. Matching is greedy | ||
3259 | (maximal), but can be changed by calling setMinimal(). | ||
3260 | |||
3261 | \sa setPattern() setCaseSensitive() setWildcard() setMinimal() | ||
3262 | */ | ||
3263 | QRegExp::QRegExp( const QString& pattern, bool caseSensitive, bool wildcard ) | ||
3264 | { | ||
3265 | eng = 0; | ||
3266 | priv = new QRegExpPrivate; | ||
3267 | priv->pattern = pattern; | ||
3268 | #ifndef QT_NO_REGEXP_WILDCARD | ||
3269 | priv->wc = wildcard; | ||
3270 | #endif | ||
3271 | priv->min = FALSE; | ||
3272 | compile( caseSensitive ); | ||
3273 | } | ||
3274 | |||
3275 | /*! | ||
3276 | Constructs a regular expression as a copy of \a rx. | ||
3277 | |||
3278 | \sa operator=() | ||
3279 | */ | ||
3280 | QRegExp::QRegExp( const QRegExp& rx ) | ||
3281 | { | ||
3282 | eng = 0; | ||
3283 | priv = new QRegExpPrivate; | ||
3284 | operator=( rx ); | ||
3285 | } | ||
3286 | |||
3287 | /*! | ||
3288 | Destroys the regular expression and cleans up its internal data. | ||
3289 | */ | ||
3290 | QRegExp::~QRegExp() | ||
3291 | { | ||
3292 | derefEngine( eng, priv->rxpattern ); | ||
3293 | delete priv; | ||
3294 | } | ||
3295 | |||
3296 | /*! | ||
3297 | Copies the regular expression \a rx and returns a reference to the | ||
3298 | copy. The case sensitivity, wildcard and minimal matching options | ||
3299 | are also copied. | ||
3300 | */ | ||
3301 | QRegExp& QRegExp::operator=( const QRegExp& rx ) | ||
3302 | { | ||
3303 | rx.eng->ref(); | ||
3304 | derefEngine( eng, priv->rxpattern ); | ||
3305 | eng = rx.eng; | ||
3306 | priv->pattern = rx.priv->pattern; | ||
3307 | priv->rxpattern = rx.priv->rxpattern; | ||
3308 | #ifndef QT_NO_REGEXP_WILDCARD | ||
3309 | priv->wc = rx.priv->wc; | ||
3310 | #endif | ||
3311 | priv->min = rx.priv->min; | ||
3312 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3313 | priv->t = rx.priv->t; | ||
3314 | priv->capturedCache = rx.priv->capturedCache; | ||
3315 | #endif | ||
3316 | priv->captured = rx.priv->captured; | ||
3317 | return *this; | ||
3318 | } | ||
3319 | |||
3320 | /*! | ||
3321 | Returns TRUE if this regular expression is equal to \a rx; | ||
3322 | otherwise returns FALSE. | ||
3323 | |||
3324 | Two QRegExp objects are equal if they have the same pattern | ||
3325 | strings and the same settings for case sensitivity, wildcard and | ||
3326 | minimal matching. | ||
3327 | */ | ||
3328 | bool QRegExp::operator==( const QRegExp& rx ) const | ||
3329 | { | ||
3330 | return priv->pattern == rx.priv->pattern && | ||
3331 | eng->caseSensitive() == rx.eng->caseSensitive() && | ||
3332 | #ifndef QT_NO_REGEXP_WILDCARD | ||
3333 | priv->wc == rx.priv->wc && | ||
3334 | #endif | ||
3335 | priv->min == rx.priv->min; | ||
3336 | } | ||
3337 | |||
3338 | /*! | ||
3339 | \fn bool QRegExp::operator!=( const QRegExp& rx ) const | ||
3340 | |||
3341 | Returns TRUE if this regular expression is not equal to \a rx; | ||
3342 | otherwise returns FALSE. | ||
3343 | |||
3344 | \sa operator==() | ||
3345 | */ | ||
3346 | |||
3347 | /*! | ||
3348 | Returns TRUE if the pattern string is empty; otherwise returns | ||
3349 | FALSE. | ||
3350 | |||
3351 | If you call exactMatch() with an empty pattern on an empty string | ||
3352 | it will return TRUE; otherwise it returns FALSE since it operates | ||
3353 | over the whole string. If you call search() with an empty pattern | ||
3354 | on \e any string it will return the start offset (0 by default) | ||
3355 | because the empty pattern matches the 'emptiness' at the start of | ||
3356 | the string. In this case the length of the match returned by | ||
3357 | matchedLength() will be 0. | ||
3358 | |||
3359 | See QString::isEmpty(). | ||
3360 | */ | ||
3361 | |||
3362 | bool QRegExp::isEmpty() const | ||
3363 | { | ||
3364 | return priv->pattern.isEmpty(); | ||
3365 | } | ||
3366 | |||
3367 | /*! | ||
3368 | Returns TRUE if the regular expression is valid; otherwise returns | ||
3369 | FALSE. An invalid regular expression never matches. | ||
3370 | |||
3371 | The pattern <b>[a-z</b> is an example of an invalid pattern, since | ||
3372 | it lacks a closing square bracket. | ||
3373 | |||
3374 | Note that the validity of a regexp may also depend on the setting | ||
3375 | of the wildcard flag, for example <b>*.html</b> is a valid | ||
3376 | wildcard regexp but an invalid full regexp. | ||
3377 | |||
3378 | \sa errorString() | ||
3379 | */ | ||
3380 | bool QRegExp::isValid() const | ||
3381 | { | ||
3382 | return eng->isValid(); | ||
3383 | } | ||
3384 | |||
3385 | /*! | ||
3386 | Returns the pattern string of the regular expression. The pattern | ||
3387 | has either regular expression syntax or wildcard syntax, depending | ||
3388 | on wildcard(). | ||
3389 | |||
3390 | \sa setPattern() | ||
3391 | */ | ||
3392 | QString QRegExp::pattern() const | ||
3393 | { | ||
3394 | return priv->pattern; | ||
3395 | } | ||
3396 | |||
3397 | /*! | ||
3398 | Sets the pattern string to \a pattern. The case sensitivity, | ||
3399 | wildcard and minimal matching options are not changed. | ||
3400 | |||
3401 | \sa pattern() | ||
3402 | */ | ||
3403 | void QRegExp::setPattern( const QString& pattern ) | ||
3404 | { | ||
3405 | if ( priv->pattern != pattern ) { | ||
3406 | priv->pattern = pattern; | ||
3407 | compile( caseSensitive() ); | ||
3408 | } | ||
3409 | } | ||
3410 | |||
3411 | /*! | ||
3412 | Returns TRUE if case sensitivity is enabled; otherwise returns | ||
3413 | FALSE. The default is TRUE. | ||
3414 | |||
3415 | \sa setCaseSensitive() | ||
3416 | */ | ||
3417 | bool QRegExp::caseSensitive() const | ||
3418 | { | ||
3419 | return eng->caseSensitive(); | ||
3420 | } | ||
3421 | |||
3422 | /*! | ||
3423 | Sets case sensitive matching to \a sensitive. | ||
3424 | |||
3425 | If \a sensitive is TRUE, <b>\\.txt$</b> matches \c{readme.txt} but | ||
3426 | not \c{README.TXT}. | ||
3427 | |||
3428 | \sa caseSensitive() | ||
3429 | */ | ||
3430 | void QRegExp::setCaseSensitive( bool sensitive ) | ||
3431 | { | ||
3432 | if ( sensitive != eng->caseSensitive() ) | ||
3433 | compile( sensitive ); | ||
3434 | } | ||
3435 | |||
3436 | #ifndef QT_NO_REGEXP_WILDCARD | ||
3437 | /*! | ||
3438 | Returns TRUE if wildcard mode is enabled; otherwise returns FALSE. | ||
3439 | The default is FALSE. | ||
3440 | |||
3441 | \sa setWildcard() | ||
3442 | */ | ||
3443 | bool QRegExp::wildcard() const | ||
3444 | { | ||
3445 | return priv->wc; | ||
3446 | } | ||
3447 | |||
3448 | /*! | ||
3449 | Sets the wildcard mode for the regular expression. The default is | ||
3450 | FALSE. | ||
3451 | |||
3452 | Setting \a wildcard to TRUE enables simple shell-like wildcard | ||
3453 | matching. (See \link #wildcard-matching wildcard matching | ||
3454 | (globbing) \endlink.) | ||
3455 | |||
3456 | For example, <b>r*.txt</b> matches the string \c{readme.txt} in | ||
3457 | wildcard mode, but does not match \c{readme}. | ||
3458 | |||
3459 | \sa wildcard() | ||
3460 | */ | ||
3461 | void QRegExp::setWildcard( bool wildcard ) | ||
3462 | { | ||
3463 | if ( wildcard != priv->wc ) { | ||
3464 | priv->wc = wildcard; | ||
3465 | compile( caseSensitive() ); | ||
3466 | } | ||
3467 | } | ||
3468 | #endif | ||
3469 | |||
3470 | /*! | ||
3471 | Returns TRUE if minimal (non-greedy) matching is enabled; | ||
3472 | otherwise returns FALSE. | ||
3473 | |||
3474 | \sa setMinimal() | ||
3475 | */ | ||
3476 | bool QRegExp::minimal() const | ||
3477 | { | ||
3478 | return priv->min; | ||
3479 | } | ||
3480 | |||
3481 | /*! | ||
3482 | Enables or disables minimal matching. If \a minimal is FALSE, | ||
3483 | matching is greedy (maximal) which is the default. | ||
3484 | |||
3485 | For example, suppose we have the input string "We must be | ||
3486 | \<b>bold\</b>, very \<b>bold\</b>!" and the pattern | ||
3487 | <b>\<b>.*\</b></b>. With the default greedy (maximal) matching, | ||
3488 | the match is "We must be <u>\<b>bold\</b>, very | ||
3489 | \<b>bold\</b></u>!". But with minimal (non-greedy) matching the | ||
3490 | first match is: "We must be <u>\<b>bold\</b></u>, very | ||
3491 | \<b>bold\</b>!" and the second match is "We must be \<b>bold\</b>, | ||
3492 | very <u>\<b>bold\</b></u>!". In practice we might use the pattern | ||
3493 | <b>\<b>[^\<]+\</b></b> instead, although this will still fail for | ||
3494 | nested tags. | ||
3495 | |||
3496 | \sa minimal() | ||
3497 | */ | ||
3498 | void QRegExp::setMinimal( bool minimal ) | ||
3499 | { | ||
3500 | priv->min = minimal; | ||
3501 | } | ||
3502 | |||
3503 | /*! | ||
3504 | Returns TRUE if \a str is matched exactly by this regular | ||
3505 | expression; otherwise returns FALSE. You can determine how much of | ||
3506 | the string was matched by calling matchedLength(). | ||
3507 | |||
3508 | For a given regexp string, R, exactMatch("R") is the equivalent of | ||
3509 | search("^R$") since exactMatch() effectively encloses the regexp | ||
3510 | in the start of string and end of string anchors, except that it | ||
3511 | sets matchedLength() differently. | ||
3512 | |||
3513 | For example, if the regular expression is <b>blue</b>, then | ||
3514 | exactMatch() returns TRUE only for input \c blue. For inputs \c | ||
3515 | bluebell, \c blutak and \c lightblue, exactMatch() returns FALSE | ||
3516 | and matchedLength() will return 4, 3 and 0 respectively. | ||
3517 | |||
3518 | Although const, this function sets matchedLength(), | ||
3519 | capturedTexts() and pos(). | ||
3520 | |||
3521 | \sa search() searchRev() QRegExpValidator | ||
3522 | */ | ||
3523 | bool QRegExp::exactMatch( const QString& str ) const | ||
3524 | { | ||
3525 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3526 | priv->t = str; | ||
3527 | priv->capturedCache.clear(); | ||
3528 | #endif | ||
3529 | |||
3530 | priv->captured = eng->match( str, 0, priv->min, TRUE, 0 ); | ||
3531 | if ( priv->captured[1] == (int) str.length() ) { | ||
3532 | return TRUE; | ||
3533 | } else { | ||
3534 | priv->captured.detach(); | ||
3535 | priv->captured[0] = 0; | ||
3536 | priv->captured[1] = eng->matchedLength(); | ||
3537 | return FALSE; | ||
3538 | } | ||
3539 | } | ||
3540 | |||
3541 | #ifndef QT_NO_COMPAT | ||
3542 | /*! \obsolete | ||
3543 | |||
3544 | Attempts to match in \a str, starting from position \a index. | ||
3545 | Returns the position of the match, or -1 if there was no match. | ||
3546 | |||
3547 | The length of the match is stored in \a *len, unless \a len is a | ||
3548 | null pointer. | ||
3549 | |||
3550 | If \a indexIsStart is TRUE (the default), the position \a index in | ||
3551 | the string will match the start of string anchor, <b>^</b>, in the | ||
3552 | regexp, if present. Otherwise, position 0 in \a str will match. | ||
3553 | |||
3554 | Use search() and matchedLength() instead of this function. | ||
3555 | |||
3556 | \sa QString::mid() QConstString | ||
3557 | */ | ||
3558 | int QRegExp::match( const QString& str, int index, int *len, | ||
3559 | bool indexIsStart ) const | ||
3560 | { | ||
3561 | int pos = search( str, index, indexIsStart ? CaretAtOffset : CaretAtZero ); | ||
3562 | if ( len != 0 ) | ||
3563 | *len = matchedLength(); | ||
3564 | return pos; | ||
3565 | } | ||
3566 | #endif // QT_NO_COMPAT | ||
3567 | |||
3568 | /*! | ||
3569 | \overload | ||
3570 | |||
3571 | This convenience function searches with a \c CaretMode of \c | ||
3572 | CaretAtZero which is the most common usage. | ||
3573 | */ | ||
3574 | |||
3575 | int QRegExp::search( const QString& str, int offset ) const | ||
3576 | { | ||
3577 | return search( str, offset, CaretAtZero ); | ||
3578 | } | ||
3579 | |||
3580 | /*! | ||
3581 | Attempts to find a match in \a str from position \a offset (0 by | ||
3582 | default). If \a offset is -1, the search starts at the last | ||
3583 | character; if -2, at the next to last character; etc. | ||
3584 | |||
3585 | Returns the position of the first match, or -1 if there was no | ||
3586 | match. | ||
3587 | |||
3588 | The \a caretMode parameter can be used to instruct whether <b>^</b> | ||
3589 | should match at index 0 or at \a offset. | ||
3590 | |||
3591 | You might prefer to use QString::find(), QString::contains() or | ||
3592 | even QStringList::grep(). To replace matches use | ||
3593 | QString::replace(). | ||
3594 | |||
3595 | Example: | ||
3596 | \code | ||
3597 | QString str = "offsets: 1.23 .50 71.00 6.00"; | ||
3598 | QRegExp rx( "\\d*\\.\\d+" ); // primitive floating point matching | ||
3599 | int count = 0; | ||
3600 | int pos = 0; | ||
3601 | while ( (pos = rx.search(str, pos)) != -1 ) { | ||
3602 | count++; | ||
3603 | pos += rx.matchedLength(); | ||
3604 | } | ||
3605 | // pos will be 9, 14, 18 and finally 24; count will end up as 4 | ||
3606 | \endcode | ||
3607 | |||
3608 | Although const, this function sets matchedLength(), | ||
3609 | capturedTexts() and pos(). | ||
3610 | |||
3611 | \sa searchRev() exactMatch() | ||
3612 | */ | ||
3613 | |||
3614 | int QRegExp::search( const QString& str, int offset, CaretMode caretMode ) const | ||
3615 | { | ||
3616 | if ( offset < 0 ) | ||
3617 | offset += str.length(); | ||
3618 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3619 | priv->t = str; | ||
3620 | priv->capturedCache.clear(); | ||
3621 | #endif | ||
3622 | priv->captured = eng->match( str, offset, priv->min, FALSE, | ||
3623 | caretIndex(offset, caretMode) ); | ||
3624 | return priv->captured[0]; | ||
3625 | } | ||
3626 | |||
3627 | |||
3628 | /*! | ||
3629 | \overload | ||
3630 | |||
3631 | This convenience function searches with a \c CaretMode of \c | ||
3632 | CaretAtZero which is the most common usage. | ||
3633 | */ | ||
3634 | |||
3635 | int QRegExp::searchRev( const QString& str, int offset ) const | ||
3636 | { | ||
3637 | return searchRev( str, offset, CaretAtZero ); | ||
3638 | } | ||
3639 | |||
3640 | /*! | ||
3641 | Attempts to find a match backwards in \a str from position \a | ||
3642 | offset. If \a offset is -1 (the default), the search starts at the | ||
3643 | last character; if -2, at the next to last character; etc. | ||
3644 | |||
3645 | Returns the position of the first match, or -1 if there was no | ||
3646 | match. | ||
3647 | |||
3648 | The \a caretMode parameter can be used to instruct whether <b>^</b> | ||
3649 | should match at index 0 or at \a offset. | ||
3650 | |||
3651 | Although const, this function sets matchedLength(), | ||
3652 | capturedTexts() and pos(). | ||
3653 | |||
3654 | \warning Searching backwards is much slower than searching | ||
3655 | forwards. | ||
3656 | |||
3657 | \sa search() exactMatch() | ||
3658 | */ | ||
3659 | |||
3660 | int QRegExp::searchRev( const QString& str, int offset, | ||
3661 | CaretMode caretMode ) const | ||
3662 | { | ||
3663 | if ( offset < 0 ) | ||
3664 | offset += str.length(); | ||
3665 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3666 | priv->t = str; | ||
3667 | priv->capturedCache.clear(); | ||
3668 | #endif | ||
3669 | if ( offset < 0 || offset > (int) str.length() ) { | ||
3670 | priv->captured.detach(); | ||
3671 | priv->captured.fill( -1 ); | ||
3672 | return -1; | ||
3673 | } | ||
3674 | |||
3675 | while ( offset >= 0 ) { | ||
3676 | priv->captured = eng->match( str, offset, priv->min, TRUE, | ||
3677 | caretIndex(offset, caretMode) ); | ||
3678 | if ( priv->captured[0] == offset ) | ||
3679 | return offset; | ||
3680 | offset--; | ||
3681 | } | ||
3682 | return -1; | ||
3683 | } | ||
3684 | |||
3685 | /*! | ||
3686 | Returns the length of the last matched string, or -1 if there was | ||
3687 | no match. | ||
3688 | |||
3689 | \sa exactMatch() search() searchRev() | ||
3690 | */ | ||
3691 | int QRegExp::matchedLength() const | ||
3692 | { | ||
3693 | return priv->captured[1]; | ||
3694 | } | ||
3695 | |||
3696 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3697 | /*! | ||
3698 | Returns the number of captures contained in the regular expression. | ||
3699 | */ | ||
3700 | int QRegExp::numCaptures() const | ||
3701 | { | ||
3702 | return eng->numCaptures(); | ||
3703 | } | ||
3704 | |||
3705 | |||
3706 | |||
3707 | /*! | ||
3708 | Returns a list of the captured text strings. | ||
3709 | |||
3710 | The first string in the list is the entire matched string. Each | ||
3711 | subsequent list element contains a string that matched a | ||
3712 | (capturing) subexpression of the regexp. | ||
3713 | |||
3714 | For example: | ||
3715 | \code | ||
3716 | QRegExp rx( "(\\d+)(\\s*)(cm|inch(es)?)" ); | ||
3717 | int pos = rx.search( "Length: 36 inches" ); | ||
3718 | QStringList list = rx.capturedTexts(); | ||
3719 | // list is now ( "36 inches", "36", " ", "inches", "es" ) | ||
3720 | \endcode | ||
3721 | |||
3722 | The above example also captures elements that may be present but | ||
3723 | which we have no interest in. This problem can be solved by using | ||
3724 | non-capturing parentheses: | ||
3725 | |||
3726 | \code | ||
3727 | QRegExp rx( "(\\d+)(?:\\s*)(cm|inch(?:es)?)" ); | ||
3728 | int pos = rx.search( "Length: 36 inches" ); | ||
3729 | QStringList list = rx.capturedTexts(); | ||
3730 | // list is now ( "36 inches", "36", "inches" ) | ||
3731 | \endcode | ||
3732 | |||
3733 | Note that if you want to iterate over the list, you should iterate | ||
3734 | over a copy, e.g. | ||
3735 | \code | ||
3736 | QStringList list = rx.capturedTexts(); | ||
3737 | QStringList::Iterator it = list.begin(); | ||
3738 | while( it != list.end() ) { | ||
3739 | myProcessing( *it ); | ||
3740 | ++it; | ||
3741 | } | ||
3742 | \endcode | ||
3743 | |||
3744 | Some regexps can match an indeterminate number of times. For | ||
3745 | example if the input string is "Offsets: 12 14 99 231 7" and the | ||
3746 | regexp, \c{rx}, is <b>(\\d+)+</b>, we would hope to get a list of | ||
3747 | all the numbers matched. However, after calling | ||
3748 | \c{rx.search(str)}, capturedTexts() will return the list ( "12", | ||
3749 | "12" ), i.e. the entire match was "12" and the first subexpression | ||
3750 | matched was "12". The correct approach is to use cap() in a \link | ||
3751 | #cap_in_a_loop loop \endlink. | ||
3752 | |||
3753 | The order of elements in the string list is as follows. The first | ||
3754 | element is the entire matching string. Each subsequent element | ||
3755 | corresponds to the next capturing open left parentheses. Thus | ||
3756 | capturedTexts()[1] is the text of the first capturing parentheses, | ||
3757 | capturedTexts()[2] is the text of the second and so on | ||
3758 | (corresponding to $1, $2, etc., in some other regexp languages). | ||
3759 | |||
3760 | \sa cap() pos() exactMatch() search() searchRev() | ||
3761 | */ | ||
3762 | QStringList QRegExp::capturedTexts() | ||
3763 | { | ||
3764 | if ( priv->capturedCache.isEmpty() ) { | ||
3765 | for ( int i = 0; i < (int) priv->captured.size(); i += 2 ) { | ||
3766 | QString m; | ||
3767 | if ( priv->captured[i + 1] == 0 ) | ||
3768 | m = QString::fromLatin1( "" ); | ||
3769 | else if ( priv->captured[i] >= 0 ) | ||
3770 | m = priv->t.mid( priv->captured[i], | ||
3771 | priv->captured[i + 1] ); | ||
3772 | priv->capturedCache.append( m ); | ||
3773 | } | ||
3774 | priv->t = QString::null; | ||
3775 | } | ||
3776 | return priv->capturedCache; | ||
3777 | } | ||
3778 | |||
3779 | /*! | ||
3780 | Returns the text captured by the \a nth subexpression. The entire | ||
3781 | match has index 0 and the parenthesized subexpressions have | ||
3782 | indices starting from 1 (excluding non-capturing parentheses). | ||
3783 | |||
3784 | \code | ||
3785 | QRegExp rxlen( "(\\d+)(?:\\s*)(cm|inch)" ); | ||
3786 | int pos = rxlen.search( "Length: 189cm" ); | ||
3787 | if ( pos > -1 ) { | ||
3788 | QString value = rxlen.cap( 1 ); // "189" | ||
3789 | QString unit = rxlen.cap( 2 ); // "cm" | ||
3790 | // ... | ||
3791 | } | ||
3792 | \endcode | ||
3793 | |||
3794 | The order of elements matched by cap() is as follows. The first | ||
3795 | element, cap(0), is the entire matching string. Each subsequent | ||
3796 | element corresponds to the next capturing open left parentheses. | ||
3797 | Thus cap(1) is the text of the first capturing parentheses, cap(2) | ||
3798 | is the text of the second, and so on. | ||
3799 | |||
3800 | \target cap_in_a_loop | ||
3801 | Some patterns may lead to a number of matches which cannot be | ||
3802 | determined in advance, for example: | ||
3803 | |||
3804 | \code | ||
3805 | QRegExp rx( "(\\d+)" ); | ||
3806 | str = "Offsets: 12 14 99 231 7"; | ||
3807 | QStringList list; | ||
3808 | pos = 0; | ||
3809 | while ( pos >= 0 ) { | ||
3810 | pos = rx.search( str, pos ); | ||
3811 | if ( pos > -1 ) { | ||
3812 | list += rx.cap( 1 ); | ||
3813 | pos += rx.matchedLength(); | ||
3814 | } | ||
3815 | } | ||
3816 | // list contains "12", "14", "99", "231", "7" | ||
3817 | \endcode | ||
3818 | |||
3819 | \sa capturedTexts() pos() exactMatch() search() searchRev() | ||
3820 | */ | ||
3821 | QString QRegExp::cap( int nth ) | ||
3822 | { | ||
3823 | if ( nth < 0 || nth >= (int) priv->captured.size() / 2 ) | ||
3824 | return QString::null; | ||
3825 | else | ||
3826 | return capturedTexts()[nth]; | ||
3827 | } | ||
3828 | |||
3829 | /*! | ||
3830 | Returns the position of the \a nth captured text in the searched | ||
3831 | string. If \a nth is 0 (the default), pos() returns the position | ||
3832 | of the whole match. | ||
3833 | |||
3834 | Example: | ||
3835 | \code | ||
3836 | QRegExp rx( "/([a-z]+)/([a-z]+)" ); | ||
3837 | rx.search( "Output /dev/null" ); // returns 7 (position of /dev/null) | ||
3838 | rx.pos( 0 ); // returns 7 (position of /dev/null) | ||
3839 | rx.pos( 1 ); // returns 8 (position of dev) | ||
3840 | rx.pos( 2 ); // returns 12 (position of null) | ||
3841 | \endcode | ||
3842 | |||
3843 | For zero-length matches, pos() always returns -1. (For example, if | ||
3844 | cap(4) would return an empty string, pos(4) returns -1.) This is | ||
3845 | due to an implementation tradeoff. | ||
3846 | |||
3847 | \sa capturedTexts() exactMatch() search() searchRev() | ||
3848 | */ | ||
3849 | int QRegExp::pos( int nth ) | ||
3850 | { | ||
3851 | if ( nth < 0 || nth >= (int) priv->captured.size() / 2 ) | ||
3852 | return -1; | ||
3853 | else | ||
3854 | return priv->captured[2 * nth]; | ||
3855 | } | ||
3856 | |||
3857 | /*! | ||
3858 | Returns a text string that explains why a regexp pattern is | ||
3859 | invalid the case being; otherwise returns "no error occurred". | ||
3860 | |||
3861 | \sa isValid() | ||
3862 | */ | ||
3863 | QString QRegExp::errorString() | ||
3864 | { | ||
3865 | if ( isValid() ) { | ||
3866 | return QString( RXERR_OK ); | ||
3867 | } else { | ||
3868 | return eng->errorString(); | ||
3869 | } | ||
3870 | } | ||
3871 | #endif | ||
3872 | |||
3873 | /*! | ||
3874 | Returns the string \a str with every regexp special character | ||
3875 | escaped with a backslash. The special characters are $, (, ), *, +, | ||
3876 | ., ?, [, \, ], ^, {, | and }. | ||
3877 | |||
3878 | Example: | ||
3879 | \code | ||
3880 | s1 = QRegExp::escape( "bingo" ); // s1 == "bingo" | ||
3881 | s2 = QRegExp::escape( "f(x)" ); // s2 == "f\\(x\\)" | ||
3882 | \endcode | ||
3883 | |||
3884 | This function is useful to construct regexp patterns dynamically: | ||
3885 | |||
3886 | \code | ||
3887 | QRegExp rx( "(" + QRegExp::escape(name) + | ||
3888 | "|" + QRegExp::escape(alias) + ")" ); | ||
3889 | \endcode | ||
3890 | */ | ||
3891 | QString QRegExp::escape( const QString& str ) | ||
3892 | { | ||
3893 | static const char meta[] = "$()*+.?[\\]^{|}"; | ||
3894 | QString quoted = str; | ||
3895 | int i = 0; | ||
3896 | |||
3897 | while ( i < (int) quoted.length() ) { | ||
3898 | if ( strchr(meta, quoted[i].latin1()) != 0 ) | ||
3899 | quoted.insert( i++, "\\" ); | ||
3900 | i++; | ||
3901 | } | ||
3902 | return quoted; | ||
3903 | } | ||
3904 | |||
3905 | void QRegExp::compile( bool caseSensitive ) | ||
3906 | { | ||
3907 | derefEngine( eng, priv->rxpattern ); | ||
3908 | #ifndef QT_NO_REGEXP_WILDCARD | ||
3909 | if ( priv->wc ) | ||
3910 | priv->rxpattern = wc2rx( priv->pattern ); | ||
3911 | else | ||
3912 | #endif | ||
3913 | priv->rxpattern = priv->pattern.isNull() ? QString::fromLatin1( "" ) | ||
3914 | : priv->pattern; | ||
3915 | eng = newEngine( priv->rxpattern, caseSensitive ); | ||
3916 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3917 | priv->t = QString(); | ||
3918 | priv->capturedCache.clear(); | ||
3919 | #endif | ||
3920 | priv->captured.detach(); | ||
3921 | priv->captured.fill( -1, 2 + 2 * eng->numCaptures() ); | ||
3922 | } | ||
3923 | |||
3924 | int QRegExp::caretIndex( int offset, CaretMode caretMode ) | ||
3925 | { | ||
3926 | if ( caretMode == CaretAtZero ) { | ||
3927 | return 0; | ||
3928 | } else if ( caretMode == CaretAtOffset ) { | ||
3929 | return offset; | ||
3930 | } else { // CaretWontMatch | ||
3931 | return -1; | ||
3932 | } | ||
3933 | } | ||
3934 | |||
3935 | #endif // QT_NO_REGEXP | ||