author | jowenn <jowenn> | 2002-11-10 21:08:01 (UTC) |
---|---|---|
committer | jowenn <jowenn> | 2002-11-10 21:08:01 (UTC) |
commit | e97a6da57804aa14907dec327fbae71bff9b383e (patch) (unidiff) | |
tree | 15f6ee292dba24bdda72f5c72f6d2224c3516763 /noncore/apps/tinykate/libkate/qt3back/qregexp3.cpp | |
parent | 7c012ee8cd16d8befacc6f6750711443fac0fd5e (diff) | |
download | opie-e97a6da57804aa14907dec327fbae71bff9b383e.zip opie-e97a6da57804aa14907dec327fbae71bff9b383e.tar.gz opie-e97a6da57804aa14907dec327fbae71bff9b383e.tar.bz2 |
import of tiny kate. (saving not possible yet)
Diffstat (limited to 'noncore/apps/tinykate/libkate/qt3back/qregexp3.cpp') (more/less context) (ignore whitespace changes)
-rw-r--r-- | noncore/apps/tinykate/libkate/qt3back/qregexp3.cpp | 3767 |
1 files changed, 3767 insertions, 0 deletions
diff --git a/noncore/apps/tinykate/libkate/qt3back/qregexp3.cpp b/noncore/apps/tinykate/libkate/qt3back/qregexp3.cpp new file mode 100644 index 0000000..a2c680f --- a/dev/null +++ b/noncore/apps/tinykate/libkate/qt3back/qregexp3.cpp | |||
@@ -0,0 +1,3767 @@ | |||
1 | /**************************************************************************** | ||
2 | ** $Id$ | ||
3 | ** | ||
4 | ** Implementation of QRegExp class | ||
5 | ** | ||
6 | ** Created : 950126 | ||
7 | ** | ||
8 | ** Copyright (C) 1992-2000 Trolltech AS. All rights reserved. | ||
9 | ** | ||
10 | ** This file is part of the tools module of the Qt GUI Toolkit. | ||
11 | ** | ||
12 | ** This file may be distributed under the terms of the Q Public License | ||
13 | ** as defined by Trolltech AS of Norway and appearing in the file | ||
14 | ** LICENSE.QPL included in the packaging of this file. | ||
15 | ** | ||
16 | ** This file may be distributed and/or modified under the terms of the | ||
17 | ** GNU General Public License version 2 as published by the Free Software | ||
18 | ** Foundation and appearing in the file LICENSE.GPL included in the | ||
19 | ** packaging of this file. | ||
20 | ** | ||
21 | ** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition | ||
22 | ** licenses may use this file in accordance with the Qt Commercial License | ||
23 | ** Agreement provided with the Software. | ||
24 | ** | ||
25 | ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE | ||
26 | ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. | ||
27 | ** | ||
28 | ** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for | ||
29 | ** information about Qt Commercial License Agreements. | ||
30 | ** See http://www.trolltech.com/qpl/ for QPL licensing information. | ||
31 | ** See http://www.trolltech.com/gpl/ for GPL licensing information. | ||
32 | ** | ||
33 | ** Contact info@trolltech.com if any conditions of this licensing are | ||
34 | ** not clear to you. | ||
35 | ** | ||
36 | **********************************************************************/ | ||
37 | #if QT_VERSION >=300 | ||
38 | #error QRegExp3 is now in QT 3 use QRegExp instead | ||
39 | #endif | ||
40 | |||
41 | #include "qarray.h" | ||
42 | #include "qbitarray.h" | ||
43 | #include "qcache.h" | ||
44 | #include "qintdict.h" | ||
45 | #include "qmap.h" | ||
46 | #if QT_VERSION < 300 | ||
47 | #include "./qregexp3.h" | ||
48 | #else | ||
49 | #include "qregexp.h" | ||
50 | #endif | ||
51 | #include "qstring.h" | ||
52 | #include "qtl.h" | ||
53 | #include "qvector.h" | ||
54 | |||
55 | #include <limits.h> | ||
56 | |||
57 | /* | ||
58 | WARNING! Be sure to read qregexp.tex before modifying this file. | ||
59 | */ | ||
60 | |||
61 | /*! | ||
62 | \class QRegExp3 qregexp.h | ||
63 | |||
64 | \brief The QRegExp class provides pattern matching using regular expressions. | ||
65 | |||
66 | \ingroup tools | ||
67 | \ingroup misc | ||
68 | \ingroup shared | ||
69 | |||
70 | |||
71 | Regular expressions, "regexps", provide a way to find patterns | ||
72 | within text. This is useful in many contexts, for example: | ||
73 | |||
74 | <ol> | ||
75 | <li>\e Validation. A regexp can be used to check whether a piece of | ||
76 | text meets some criteria, e.g. is an integer or contains no | ||
77 | whitespace. | ||
78 | <li>\e Searching. Regexps provide a much more powerful means of | ||
79 | searching text than simple string matching does. For example we can | ||
80 | create a regexp which says "find one of the words 'mail', 'letter' | ||
81 | or 'correspondence' but not any of the words 'email', 'mailman' | ||
82 | 'mailer', 'letterbox' etc." | ||
83 | <li><em>Search and Replace.</em> A regexp can be used to replace a | ||
84 | pattern with a piece of text, for example replace all occurrences of | ||
85 | '&' with '\&' except where the '&' is already followed by | ||
86 | 'amp;'. | ||
87 | <li><em>String Splitting.</em> A regexp can be used to identify | ||
88 | where a string should be split into its component fields, e.g. | ||
89 | splitting tab delimited strings. | ||
90 | </ol> | ||
91 | |||
92 | We present a very brief introduction to regexps, a description of | ||
93 | Qt's regexp language, some code examples, and finally the function | ||
94 | documentation. QRegExp is modelled on Perl's regexp engine and fully | ||
95 | supports Unicode. QRegExp may also be used in the weaker 'wildcard' | ||
96 | (globbing) mode which works in a similar way to command shells. A | ||
97 | good text on regexps is <i>Mastering Regular Expressions: Powerful | ||
98 | Techniques for Perl and Other Tools</i> by Jeffrey E. Friedl, ISBN | ||
99 | 1565922573. | ||
100 | |||
101 | Experienced regexp users may prefer to skip the introduction and | ||
102 | go directly to the relevant information: | ||
103 | |||
104 | <ul> | ||
105 | <li><a href="#characters-and-abbreviations-for-sets-of-characters"> | ||
106 | Characters and Abbreviations for Sets of Characters</a> | ||
107 | <li><a href="#sets-of-characters">Sets of Characters</a> | ||
108 | <li><a href="#quantifiers">Quantifiers</a> | ||
109 | <li><a href="#capturing-text">Capturing Text</a> | ||
110 | <li><a href="#assertions">Assertions</a> | ||
111 | <li><a href="#wildcard-matching">Wildcard Matching (globbing)</a> | ||
112 | <li><a href="#perl-users">Notes for Perl Users</a> | ||
113 | <li><a href="#code-examples">Code Examples</a> | ||
114 | <li><a href="#member-function-documentation">Member Function Documentation</a> | ||
115 | </ul> | ||
116 | |||
117 | <b>Introduction</b> | ||
118 | |||
119 | Regexps are built up from expressions, quantifiers and assertions. | ||
120 | The simplest form of expression is simply a character, e.g. <b>x</b> | ||
121 | or <b>5</b>. An expression can also be a set of characters. For | ||
122 | example, <b>[ABCD]</b>, will match an <b>A</b> or a <b>B</b> or a | ||
123 | <b>C</b> or a <b>D</b>. As a shorthand we could write this as | ||
124 | <b>[A-D]</b>. If we want to match any of the captital letters in the | ||
125 | English alphabet we can write <b>[A-Z]</b>. A quantifier tells the | ||
126 | regexp engine how many occurrences of the expression we want, e.g. | ||
127 | <b>x{1,1}</b> means match an <b>x</b> which occurs at least once and | ||
128 | at most once. We'll look at assertions and more complex expressions | ||
129 | later. Note that regexps cannot be used to check for balanced | ||
130 | brackets or tags (unless you know the maximum level of nesting). | ||
131 | |||
132 | |||
133 | We'll start by writing a regexp to match integers in the range 0 to | ||
134 | 99. We will require at least one digit so we will start with | ||
135 | <b>[0-9]{1,1}</b> which means match a digit exactly once. This | ||
136 | regexp alone will match integers in the range 0 to 9. To match one | ||
137 | or two digits we can increase the maximum number of occurrences so | ||
138 | the regexp becomes <b>[0-9]{1,2}</b> meaning match a digit at least | ||
139 | once and at most twice. However, this regexp as it stands will not | ||
140 | match correctly. This regexp will match one or two digits \e within | ||
141 | a string. To ensure that we match against the whole string we must | ||
142 | use the anchor assertions. We need <b>^</b> (caret) which when it is | ||
143 | the first character in the regexp means that the regexp must match | ||
144 | from the beginning of the string. And we also need <b>$</b> (dollar) | ||
145 | which when it is the last character in the regexp means that the | ||
146 | regexp must match until the end of the string. So now our regexp is | ||
147 | <b>^[0-9]{1,2}$</b>. Note that assertions do not match any | ||
148 | characters. | ||
149 | |||
150 | If you've seen regexps elsewhere they may have looked different from | ||
151 | the one above. This is because some sets of characters and some | ||
152 | quantifiers are so common that they have special symbols to | ||
153 | represent them. <b>[0-9]</b> can be replaced with the symbol | ||
154 | <b>\d</b>. The quantifier to match exactly one occurrence, | ||
155 | <b>{1,1}</b>, can be replaced with the expression itself. This means | ||
156 | that <b>x{1,1}</b> is exactly the same as <b>x</b> alone. So our 0 | ||
157 | to 99 matcher could be written: <b>^\d{1,2}$</b>, although most | ||
158 | people would write it <b>^\d\d?$</b>. The <b>?</b> is the same as | ||
159 | the quantifier <b>{0,1}</b>, i.e. a minimum of no occurrences a | ||
160 | maximum of one occurrence. This is used to make an expression | ||
161 | optional. The regexp <b>^\d\d?$</b> means "from the beginning of the | ||
162 | string match one digit followed by zero or one digits and then the | ||
163 | end of the string". | ||
164 | |||
165 | Our second example is matching the words 'mail', 'letter' or | ||
166 | 'correspondence' but without matching 'email', 'mailman', 'mailer', | ||
167 | 'letterbox' etc. We'll start by just matching 'mail'. In full the | ||
168 | regexp is, <b>m{1,1}a{1,1}i{1,1}l{1,1}</b>, but since an expression | ||
169 | itself is automatically quantified by <b>{1,1}</b> we can simply | ||
170 | write this as <b>mail</b>; an 'm' followed by an 'a' followed by an | ||
171 | 'i' followed by an 'l'. The symbol '|' (bar) is used for \e | ||
172 | alternation, so our regexp now becomes | ||
173 | <b>mail|letter|correspondence</b> which means match 'mail' \e or | ||
174 | 'letter' \e or 'correspondence'. Whilst this regexp will find the | ||
175 | words we want it will also find words we don't want such as 'email'. | ||
176 | We will start by putting our regexp in parenthesis | ||
177 | <b>(mail|letter|correspondence)</b>. Parenthesis have two effects, | ||
178 | firstly they group expressions together and secondly they identify | ||
179 | parts of the regexp that we wish to <a href="#capturing-text">capture</a>. | ||
180 | Our regexp still matches any of the three words but now they are | ||
181 | grouped together as a unit. This is useful for building up more | ||
182 | complex regexps. It is also useful because it allows us to examine | ||
183 | which of the words actually matched. We need to use another | ||
184 | assertion, this time <b>\b</b> "word boundary": | ||
185 | <b>\b(mail|letter|correspondence)\b</b>. This regexp means "match a | ||
186 | word boundary followed by the expression in parenthesis followed by | ||
187 | another word boundary". The <b>\b</b> assertion matches at a \e | ||
188 | position in the regexp not a \e character in the regexp. A word | ||
189 | boundary is any non-word character such as a space a newline or the | ||
190 | beginning or end of the string. | ||
191 | |||
192 | For our third example we want to replace ampersands with the HTML | ||
193 | entity '\&'. The regexp to match is simple: <b>\&</b>, i.e. | ||
194 | match one ampersand. Unfortunately this will mess up our text if | ||
195 | some of the ampersands have already been turned into HTML entities. | ||
196 | So what we really want to say is replace an ampersand providing it | ||
197 | is not followed by 'amp;'. For this we need the negative lookahead | ||
198 | assertion and our regexp becomes: <b>\&(?!amp;)</b>. The negative | ||
199 | lookahead assertion is introduced with '(?!' and finishes at the | ||
200 | ')'. It means that the text it contains, 'amp;' in our example, must | ||
201 | \e not follow the expression that preceeds it. | ||
202 | |||
203 | Regexps provide a rich language that can be used in a variety of | ||
204 | ways. For example suppose we want to count all the occurrences of | ||
205 | 'Eric' and 'Eirik' in a string. Two valid regexps to match these are | ||
206 | <b>\\</b><b>b(Eric|Eirik)</b><b>\\</b><b>b</b> and | ||
207 | <b>\\</b><b>bEi?ri[ck]</b><b>\\</b><b>b</b>. We need the word boundary | ||
208 | '\b' so we don't get 'Ericsson' etc. The second regexp actually | ||
209 | matches more than we want, 'Eric', 'Erik', 'Eiric' and 'Eirik'. | ||
210 | |||
211 | We will implement some the examples above in the | ||
212 | <a href="#code-examples">code examples</a> section. | ||
213 | |||
214 | <a name="characters-and-abbreviations-for-sets-of-characters"> | ||
215 | <b>Characters and Abbreviations for Sets of Characters</b></a> | ||
216 | |||
217 | <ul> | ||
218 | |||
219 | <li><b>c</b> Any character represents itself unless it has a special regexp | ||
220 | meaning. Thus <b>c</b> matches the character \e c. | ||
221 | |||
222 | <li><b>\\</b><b>c</b> A character that follows a backslash matches the | ||
223 | character itself except where mentioned below. For example if you | ||
224 | wished to match a literal caret at the beginning of a string you | ||
225 | would write <b>\^</b>. | ||
226 | |||
227 | <li><b>\\</b><b>a</b> This matches the ASCII bell character (BEL, 0x07). | ||
228 | <li><b>\\</b><b>f</b> This matches the ASCII form feed character (FF, 0x0C). | ||
229 | <li><b>\\</b><b>n</b> This matches the ASCII line feed character (LF, 0x0A), (Unix newline). | ||
230 | <li><b>\\</b><b>r</b> This matches the ASCII carriage return character (CR, 0x0D). | ||
231 | <li><b>\\</b><b>t</b> This matches the ASCII horizontal tab character (HT, 0x09). | ||
232 | <li><b>\\</b><b>v</b> This matches the ASCII vertical tab character (VT, 0x0B). | ||
233 | <li><b>\\</b><b>xhhhh</b> This matches the Unicode character corresponding | ||
234 | to the hexadecimal number hhhh (between 0x0000 and 0xFFFF). \0ooo | ||
235 | (i.e., \zero ooo) matches the ASCII/Latin-1 character corresponding | ||
236 | to the octal number ooo (between 0 and 0377). | ||
237 | <li><b>. (dot)</b> This matches any character (including newline). | ||
238 | <li><b>\\</b><b>d</b> This matches a digit (see QChar::isDigit()). | ||
239 | <li><b>\\</b><b>D</b> This matches a non-digit. | ||
240 | <li><b>\\</b><b>s</b> This matches a whitespace (see QChar::isSpace()). | ||
241 | <li><b>\\</b><b>S</b> This matches a non-whitespace. | ||
242 | <li><b>\\</b><b>w</b> This matches a word character (see QChar::isLetterOrNumber()). | ||
243 | <li><b>\\</b><b>W</b> This matches a non-word character. | ||
244 | <li><b>\\</b><b>n</b> The n<sup>th</sup> | ||
245 | <a href="#capturing-text">backreference</a>, e.g. \1, \2, etc. | ||
246 | </ul> | ||
247 | |||
248 | <em>Note that the C++ compiler transforms backslashes in strings so | ||
249 | to include a <b>\\</b> in a regexp you will need to enter it twice, | ||
250 | i.e. <b>\\</b><b>\\</b>.</em> | ||
251 | |||
252 | <a name="sets-of-characters"><b>Sets of Characters</b></a> | ||
253 | |||
254 | Square brackets are used to match any character in the set of | ||
255 | characters contained within the square brackets. All the character | ||
256 | set abbreviations described above can be used within square | ||
257 | brackets. Apart from the character set abbreviations and the | ||
258 | following two exceptions no characters have special meanings in | ||
259 | square brackets. | ||
260 | |||
261 | <ul> | ||
262 | |||
263 | <li><b>^</b> The caret negates the character set if it occurs as the | ||
264 | first character, i.e. immediately after the opening square bracket. | ||
265 | For example, <b>[abc]</b> matches 'a' or 'b' or 'c', but | ||
266 | <b>[^abc]</b> matches anything \e except 'a' or 'b' or 'c'. | ||
267 | |||
268 | <li><b>-</b> The dash is used to indicate a range of characters, for | ||
269 | example <b>[W-Z]</b> matches 'W' or 'X' or 'Y' or 'Z'. | ||
270 | |||
271 | </ul> | ||
272 | |||
273 | Using the predefined character set abbreviations is more portable | ||
274 | than using character ranges across platforms and languages. For | ||
275 | example, <b>[0-9]</b> matches a digit in Western alphabets but | ||
276 | <b>\d</b> matches a digit in \e any alphabet. | ||
277 | |||
278 | Note that in most regexp literature sets of characters are called | ||
279 | "character classes". | ||
280 | |||
281 | <a name="quantifiers"><b>Quantifiers</b></a> | ||
282 | |||
283 | By default an expression is automatically quantified by | ||
284 | <b>{1,1}</b>, i.e. it should occur exactly once. In the following | ||
285 | list <b><i>E</i></b> stands for any expression. An expression is a | ||
286 | character or an abbreviation for a set of characters or a set of | ||
287 | characters in square brackets or any parenthesised expression. | ||
288 | |||
289 | <ul> | ||
290 | |||
291 | <li><b><i>E</i>?</b> Matches zero or one occurrence of <i>E</i>. | ||
292 | This quantifier means "the previous expression is optional" since it | ||
293 | will match whether or not the expression occurs in the string. | ||
294 | It is the same as <b><i>E</i>{0,1}</b>. For example <b>dents?</b> | ||
295 | will match 'dent' and 'dents'. | ||
296 | |||
297 | <li><b><i>E</i>+</b> Matches one or more occurrences of <i>E</i>. | ||
298 | This is the same as <b><i>E</i>{1,MAXINT}</b>. For example, | ||
299 | <b>0+</b> will match '0', '00', '000', etc. | ||
300 | |||
301 | <li><b><i>E</i>*</b> Matches zero or more occurrences of <i>E</i>. | ||
302 | This is the same as <b><i>E</i>{0,MAXINT}</b>. The <b>*</b> | ||
303 | quantifier is often used by a mistake. Since it matches \e zero or | ||
304 | more occurrences it will match no occurrences at all. For example if | ||
305 | we want to match strings that end in whitespace and use the regexp | ||
306 | <b>\s*$</b> we would get a match on every string. This is because we | ||
307 | have said find zero or more whitespace followed by the end of string, | ||
308 | so even strings that don't end in whitespace will match. The regexp | ||
309 | we want in this case is <b>\s+$</b> to match strings that have at | ||
310 | least one whitespace at the end. | ||
311 | |||
312 | <li><b><i>E</i>{n}</b> Matches exactly \e n occurrences of the | ||
313 | expression. This is the same as repeating the expression \e n times. | ||
314 | For example, <b>x{5}</b> is the same as <b>xxxxx</b>. It is also the | ||
315 | same as <b><i>E</i>{n,n}</b>, e.g. <b>x{5,5}</b>. | ||
316 | |||
317 | <li><b><i>E</i>{n,}</b> Matches at least \e n occurrences of the | ||
318 | expression. This is the same as <b><i>E</i>{n,MAXINT}</b>. | ||
319 | |||
320 | <li><b><i>E</i>{,m}</b> Matches at most \e m occurrences of the | ||
321 | expression. This is the same as <b><i>E</i>{0,m}</b>. | ||
322 | |||
323 | <li><b><i>E</i>{n,m}</b> Matches at least \e n occurrences of the | ||
324 | expression and at most \e m occurrences of the expression. | ||
325 | |||
326 | </ul> | ||
327 | |||
328 | (MAXINT is implementation dependent but will not be smaller than | ||
329 | 16384.) | ||
330 | |||
331 | If we wish to apply a quantifier to more than just the preceeding | ||
332 | character we can use parenthesis to group characters together in an | ||
333 | expression. For example, <b>tag+</b> matches a 't' followed by an | ||
334 | 'a' followed by at least one 'g', whereas <b>(tag)+</b> matches at | ||
335 | least one occurrence of 'tag'. | ||
336 | |||
337 | Note that quantifiers are "greedy", they will match as much text as | ||
338 | they can. For example, <b>0+</b> will match as many zeros as it can | ||
339 | from the first zero it finds, e.g. '2.<u>000</u>5'. Quantifiers can | ||
340 | be made non-greedy, see setMinimal(). | ||
341 | |||
342 | <a name="capturing-text"><b>Capturing Text</b></a> | ||
343 | |||
344 | Parenthesis allow us to group elements together so that we can | ||
345 | quantify and capture them. For example if we have the expression | ||
346 | <b>mail|letter|correspondence</b> that matches a string we know that | ||
347 | \e one of the words matched but not which one. Using parenthesis | ||
348 | allows us to "capture" whatever is matched within their bounds, so | ||
349 | if we used <b>(mail|letter|correspondence)</b> and matched this | ||
350 | regexp against the string "I sent you some email" we can use the | ||
351 | cap() or capturedTexts() functions to extract the matched | ||
352 | characters, in this case 'mail'. | ||
353 | |||
354 | We can use captured text within the regexp itself. To refer to the | ||
355 | captured text we use \e backreferences which are indexed from 1 the | ||
356 | same as for cap(). For example we could search for duplicate words | ||
357 | in a string using <b>\b(\w+)\W+\1\b</b> which means match a word | ||
358 | boundary followed by one or more word characters followed by one or | ||
359 | more non-word characters followed by the same text as the first | ||
360 | parenthesised expression followed by a word boundary. | ||
361 | |||
362 | If we want to use parenthesis purely for grouping and not for | ||
363 | capturing we use the non-capturing syntax, e.g. | ||
364 | <b>(?:green|blue)</b>. Non-capturing parenthesis begin '(?:' and end | ||
365 | ')'. In this example we match either 'green' or 'blue' but we do not | ||
366 | capture the match so we can only know whether or not we matched but | ||
367 | not which color we actually found. Using non-capturing parenthesis | ||
368 | is more efficient than using capturing parenthesis since the regexp | ||
369 | engine has to do less book-keeping. | ||
370 | |||
371 | Both capturing and non-capturing parenthesis may be nested. | ||
372 | |||
373 | <a name="assertions"><b>Assertions</b></a> | ||
374 | |||
375 | Assertions make some statement about the text at the point where | ||
376 | they occur in the regexp but they do not match any characters. | ||
377 | In the following list <b><i>E</i></b> stands for any expression. | ||
378 | |||
379 | <ul> | ||
380 | <li><b>^</b> If the caret is the first character in the regexp | ||
381 | (apart from opening parenthesis) it signifies the beginning of the | ||
382 | string. It has no special meaning elsewhere (except as the first | ||
383 | character of a set of characters in square brackets). For example, | ||
384 | <b>^#include</b> will only match strings which \e begin with the | ||
385 | characters '#include'. | ||
386 | |||
387 | <li><b>$</b> If the dollar is the last character in the regexp | ||
388 | (apart from closing parenthesis) it signifies the end of the string. | ||
389 | It has no special meaning elsewhere. For example, <b>\d\s*$</b> | ||
390 | will match strings which end with a digit optionally followed by | ||
391 | whitespace. | ||
392 | |||
393 | <li><b>\\</b><b>b</b> A word boundary. For example the regexp | ||
394 | <b>\\</b><b>bOK</b>\\</b><b>b</b> means match immediately after a | ||
395 | word boundary (e.g. start of string or whitespace) the letter 'O' | ||
396 | then the letter 'K' immediately before another word boundary (e.g. | ||
397 | end of string or whitespace). But note that the assertion does not | ||
398 | actually match any whitespace so if we write | ||
399 | <b>(</b><b>\\</b><b>bOK</b>\\</b><b>b)</b> and we have a match it | ||
400 | will only contain 'OK' even if the string is "Its <u>OK</u> now". | ||
401 | |||
402 | <li><b>\\</b><b>B</b> A non-word boundary. This assertion is true | ||
403 | wherever <b>\\</b><b>b</b> is false. For example if we searched for | ||
404 | <b>\\</b><b>Bon</b>\\</b><b>B</b> in "Left on" the match would fail | ||
405 | (space and end of string aren't non-word boundaries), but it would | ||
406 | match in "t<u>on</u>ne". | ||
407 | |||
408 | <li><b>(?=<i>E</i>)</b> Positive lookahead. This assertion is true | ||
409 | if the expression matches at this point in the regex. This assertion | ||
410 | does not match any characters. For example, | ||
411 | <b>^#define\s+(\w+)(?=MAX)</b> will match strings which begin with | ||
412 | '#define' followed by at least one whitespace followed by at least | ||
413 | one word character followed by 'MAX'. The first set of parenthesis | ||
414 | will capture the word character(s) matched. This regexp will not | ||
415 | match '#define DEBUG' but will match '#define <u>INT</u>MAX | ||
416 | 32767'. | ||
417 | |||
418 | <li><b>(?!<i>E</i>)</b> Negative lookahead. This assertion is true | ||
419 | if the expression does not match at this point in the regex. This | ||
420 | assertion does not match any characters. For example, | ||
421 | <b>^#define\s+(\w+)\s*$</b> will match strings which begin with | ||
422 | '#define' followed by at least one whitespace followed by at least | ||
423 | one word character optionally followed by whitespace. This regexp | ||
424 | will match define's that exist but have no value, i.e. it will not | ||
425 | match '#define INTMAX 32767' but it will match '#define <u>DEBUG</u>'. | ||
426 | |||
427 | </ul> | ||
428 | |||
429 | <a name="wildcard-matching"><b>Wildcard Matching (globbing)</b></a> | ||
430 | |||
431 | Most command shells such as \e bash or \e cmd support "file | ||
432 | globbing", the ability to identify a group of files by using | ||
433 | wildcards. Wildcard matching is much simpler than full regexps and | ||
434 | has only four features: | ||
435 | |||
436 | <ul> | ||
437 | |||
438 | <li><b>c</b> Any character represents itself apart from those | ||
439 | mentioned below. Thus <b>c</b> matches the character \e c. | ||
440 | |||
441 | <li><b>?</b> This matches any single character. It is the same as | ||
442 | <b>.</b> in full regexps. | ||
443 | |||
444 | <li><b>*</b> This matches zero or more of any characters. It is the | ||
445 | same as <b>.*</b> in full regexps. | ||
446 | |||
447 | <li><b>[...]</b> Sets of characters can be represented in square | ||
448 | brackets the same as for full regexps. | ||
449 | |||
450 | <!-- JASMIN: Are the character classes, \w, etc supported in | ||
451 | wildcards? --> | ||
452 | |||
453 | </ul> | ||
454 | |||
455 | For example if we are in wildcard mode and have strings which | ||
456 | contain filenames we could identify HTML files with <b>*.html</b>. | ||
457 | This will match zero or more characters followed by a dot followed | ||
458 | by 'h', 't', 'm' and 'l'. | ||
459 | |||
460 | <a name="perl-users"><b>Notes for Perl Users</b></a> | ||
461 | |||
462 | Most of the character class abbreviations supported by Perl are | ||
463 | supported by QRegExp, see | ||
464 | <a href="#characters-and-abbreviations-for-sets-of-characters"> | ||
465 | characters and abbreviations for sets of characters</a>. | ||
466 | |||
467 | QRegExp's quantifiers are the same as Perl's greedy quantifiers. | ||
468 | Non-greedy matching cannot be applied to individual quantifiers, but | ||
469 | can be applied to all the quantifiers in the pattern. For example, | ||
470 | to match the Perl regex <b>ro+?m</b> requires: | ||
471 | \code | ||
472 | QRegExp rx( "ro+m" ); | ||
473 | rx.setMinimal( TRUE ); | ||
474 | \endcode | ||
475 | |||
476 | The equivalent of Perl's <tt>/i</tt> option is | ||
477 | setCaseSensitive(FALSE). | ||
478 | |||
479 | Perl's <tt>/g</tt> option can be emulated using a | ||
480 | <a href="#cap_in_a_loop">loop</a>. | ||
481 | |||
482 | In QRegExp <b>.</b> matches any character, therefore all QRegExp | ||
483 | regexps have the equivalent of Perl's <tt>/s</tt> option. QRegExp | ||
484 | does not have an equivalent to Perl's <tt>/m</tt> option, but this | ||
485 | can be emulated in various ways for example by splitting the input | ||
486 | into lines or by looping with a regexp that searches for newlines. | ||
487 | |||
488 | Because QRegExp is string oriented there are no \A, \Z or \z | ||
489 | assertions. The \G assertion is not supported but can be emulated in | ||
490 | a loop. | ||
491 | |||
492 | Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp | ||
493 | equivalents for $`, $' or $+. $1, $2 etc correspond to | ||
494 | cap(1) or capturedTexts()[1], cap(2) or capturedTexts()[2], etc. | ||
495 | |||
496 | To substitute a pattern use QString::replace(). | ||
497 | |||
498 | Perl's extended <tt>/x</tt> syntax is not supported, nor are regexp | ||
499 | comments (?#comment) or directives, e.g. (?i). | ||
500 | |||
501 | Both zero-width positive and zero-width negative lookahead | ||
502 | assertions (?=pattern) and (?!pattern) are supported with the same | ||
503 | syntax as Perl. Perl's lookbehind assertions, "independent" | ||
504 | subexpressions and conditional expressions are not supported. | ||
505 | |||
506 | Non-capturing parenthesis are also supported, with the same | ||
507 | (?:pattern) syntax. | ||
508 | |||
509 | See QStringList::split() and QStringList::join() for equivalents to | ||
510 | Perl's split and join functions. | ||
511 | |||
512 | Note: because C++ transforms \\'s they must be written \e twice in | ||
513 | code, e.g. <b>\\</b><b>b</b> must be written <b>\\</b><b>\\</b><b>b</b>. | ||
514 | |||
515 | <a name="code-examples"><b>Code Examples</b></a> | ||
516 | |||
517 | \code | ||
518 | QRegExp rx( "^\\d\\d?$" );// Match integers 0 to 99 | ||
519 | rx.search( "123" ); // Returns -1 (no match) | ||
520 | rx.search( "-6" ); // Returns -1 (no match) | ||
521 | rx.search( "6" ); // Returns 0 (matched as position 0) | ||
522 | \endcode | ||
523 | |||
524 | The third string matches '<u>6</u>'. This is a simple validation | ||
525 | regexp for integers in the range 0 to 99. | ||
526 | |||
527 | \code | ||
528 | QRegExp rx( "^\\S+$" );// Match strings which have no whitespace | ||
529 | rx.search( "Hello world" );// Returns -1 (no match) | ||
530 | rx.search( "This_is-OK" );// Returns 0 (matched at position 0) | ||
531 | \endcode | ||
532 | |||
533 | The second string matches '<u>This_is-OK</u>'. We've used the | ||
534 | character set abbreviation '\S' (non-whitespace) and the anchors to | ||
535 | match strings which contain no whitespace. | ||
536 | |||
537 | In the following example we match strings containing 'mail' or | ||
538 | 'letter' or 'correspondence' but only match whole words i.e. not | ||
539 | 'email' | ||
540 | |||
541 | \code | ||
542 | QRegExp rx( "\\b(mail|letter|correspondence)\\b" ); | ||
543 | rx.search( "I sent you an email" ); // Returns -1 (no match) | ||
544 | rx.search( "Please write the letter" ); // Returns 17 (matched at position 17) | ||
545 | \endcode | ||
546 | |||
547 | The second string matches "Please write the <u>letter</u>". The word | ||
548 | 'letter' is also captured (because of the parenthesis). We can see | ||
549 | what text we've captured like this: | ||
550 | |||
551 | \code | ||
552 | QString captured = rx.cap( 1 ); // captured contains "letter" | ||
553 | \endcode | ||
554 | |||
555 | This will capture the text from the first set of capturing | ||
556 | parenthesis (counting capturing left parenthesis from left to | ||
557 | right). The parenthesis are counted from 1 since cap( 0 ) is the | ||
558 | whole matched regexp (equivalent to '&' in most regexp engines). | ||
559 | |||
560 | \code | ||
561 | QRegExp rx( "&(?!amp;)" ); // Match ampersands but not & | ||
562 | QString line1 = "This & that"; | ||
563 | line1.replace( rx, "&" ); | ||
564 | // line1 == "This & that" | ||
565 | QString line2 = "His & hers & theirs"; | ||
566 | line2.replace( rx, "&" ); | ||
567 | // line2 == "His & hers & theirs" | ||
568 | \endcode | ||
569 | |||
570 | Here we've passed the QRegExp to QString's replace() function to | ||
571 | replace the matched text with new text. | ||
572 | |||
573 | \code | ||
574 | QString str = "One Eric another Eirik, and an Ericsson. How many Eiriks, Eric?"; | ||
575 | QRegExp rx( "\\b(Eric|Eirik)\\b" );// Match Eric or Eirik | ||
576 | int pos = 0; // Where we are in the string | ||
577 | int count = 0; // How many Eric and Eirik's we've counted | ||
578 | while ( pos >= 0 ) { | ||
579 | pos = rx.search( str, pos ); | ||
580 | if ( pos >= 0 ) { | ||
581 | pos++;// Move along in str | ||
582 | count++;// Count our Eric or Eirik | ||
583 | } | ||
584 | } | ||
585 | \endcode | ||
586 | |||
587 | We've used the search() function to repeatedly match the regexp in | ||
588 | the string. Note that instead of moving forward by one character at | ||
589 | a time <tt>pos++</tt> we could have written <tt>pos += | ||
590 | rx.matchedLength()</tt> to skip over the already matched string. The | ||
591 | count will equal 3, matching 'One <u>Eric</u> another <u>Eirik</u>, | ||
592 | and an Ericsson. How many Eiriks, <u>Eric</u>?'; it doesn't match | ||
593 | 'Ericsson' or 'Eiriks' because they are not bounded by non-word | ||
594 | boundaries. | ||
595 | |||
596 | One common use of regexps is to split lines of delimited data into | ||
597 | their component fields. | ||
598 | |||
599 | \code | ||
600 | str = "Trolltech AS\twww.trolltech.com\tNorway"; | ||
601 | QString company, web, country; | ||
602 | rx.setPattern( "^([^\t]+)\t([^\t]+)\t([^\t]+)$" ); | ||
603 | if ( rx.search( str ) != -1 ) { | ||
604 | company = rx.cap( 1 ); | ||
605 | web= rx.cap( 2 ); | ||
606 | country = rx.cap( 3 ); | ||
607 | } | ||
608 | \endcode | ||
609 | |||
610 | In this example our input lines have the format company name, web | ||
611 | address and country. Unfortunately the regexp is rather long and not | ||
612 | very versatile -- the code will break if we add any more fields. A | ||
613 | simpler and better solution is to look for the separator, '\t' in | ||
614 | this case, and take the surrounding text. The QStringList split() | ||
615 | function can take a separator string or regexp as an argument and | ||
616 | split a string accordingly. | ||
617 | |||
618 | \code | ||
619 | QStringList field = QStringList::split( "\t", str ); | ||
620 | \endcode | ||
621 | |||
622 | Here field[0] is the company, field[1] the web address and so on. | ||
623 | |||
624 | To immitate the matching of a shell we can use wildcard mode. | ||
625 | |||
626 | \code | ||
627 | QRegExp rx( "*.html" );// Invalid regexp: * doesn't quantify anything | ||
628 | rx.setWildcard( TRUE );// Now its a valid wildcard regexp | ||
629 | rx.search( "index.html" );// Returns 0 (matched at position 0) | ||
630 | rx.search( "default.htm" );// Returns -1 (no match) | ||
631 | rx.search( "readme.txt" );// Returns -1 (no match) | ||
632 | \endcode | ||
633 | |||
634 | Wildcard matching can be convenient because of its simplicity, but | ||
635 | any wildcard regex can be defined using full regexps, e.g. | ||
636 | <b>.*\.html$</b>. Notice that we can't match both \c .html and \c | ||
637 | .htm files with a wildcard unless we use <b>*.htm*</b> which will | ||
638 | also match 'test.html.bak'. A full regexp gives us the precision we | ||
639 | need, <b>.*\.html?$</b>. | ||
640 | |||
641 | QRegExp can match case insensitively using setCaseSensitive(), and | ||
642 | can use non-greedy matching, see setMinimal(). By default QRegExp | ||
643 | uses full regexps but this can be changed with setWildcard(). | ||
644 | Searching can be forward with search() or backward with searchRev(). | ||
645 | Captured text can be accessed using capturedTexts() which returns a | ||
646 | string list of all captured strings, or using cap() which returns | ||
647 | the captured string for the given index. The pos() function takes a | ||
648 | match index and returns the position in the string where the match | ||
649 | was made (or -1 if there was no match). | ||
650 | |||
651 | \sa QRegExpValidator QString QStringList | ||
652 | |||
653 | <a name="member-function-documentation"/> | ||
654 | */ | ||
655 | |||
656 | static const int NumBadChars = 128; | ||
657 | #define BadChar( ch ) ( (ch).cell() % NumBadChars ) | ||
658 | |||
659 | static const int NoOccurrence = INT_MAX; | ||
660 | static const int EmptyCapture = INT_MAX; | ||
661 | static const int InftyLen = INT_MAX; | ||
662 | static const int InftyRep = 1000; | ||
663 | static const int EOS = -1; | ||
664 | |||
665 | #ifndef QT_NO_REGEXP_OPTIM | ||
666 | static int engCount = 0; | ||
667 | static QArray<int> *noOccurrences = 0; | ||
668 | static QArray<int> *firstOccurrenceAtZero = 0; | ||
669 | #endif | ||
670 | |||
671 | /* | ||
672 | Merges two QArrays of ints and puts the result into the first one. | ||
673 | */ | ||
674 | static void mergeInto( QArray<int> *a, const QArray<int>& b ) | ||
675 | { | ||
676 | int asize = a->size(); | ||
677 | int bsize = b.size(); | ||
678 | if ( asize == 0 ) { | ||
679 | *a = b.copy(); | ||
680 | #ifndef QT_NO_REGEXP_OPTIM | ||
681 | } else if ( bsize == 1 && (*a)[asize - 1] < b[0] ) { | ||
682 | a->resize( asize + 1 ); | ||
683 | (*a)[asize] = b[0]; | ||
684 | #endif | ||
685 | } else if ( bsize >= 1 ) { | ||
686 | int csize = asize + bsize; | ||
687 | QArray<int> c( csize ); | ||
688 | int i = 0, j = 0, k = 0; | ||
689 | while ( i < asize ) { | ||
690 | if ( j < bsize ) { | ||
691 | if ( (*a)[i] == b[j] ) { | ||
692 | i++; | ||
693 | csize--; | ||
694 | } else if ( (*a)[i] < b[j] ) { | ||
695 | c[k++] = (*a)[i++]; | ||
696 | } else { | ||
697 | c[k++] = b[j++]; | ||
698 | } | ||
699 | } else { | ||
700 | memcpy( c.data() + k, (*a).data() + i, | ||
701 | (asize - i) * sizeof(int) ); | ||
702 | break; | ||
703 | } | ||
704 | } | ||
705 | c.resize( csize ); | ||
706 | if ( j < bsize ) | ||
707 | memcpy( c.data() + k, b.data() + j, (bsize - j) * sizeof(int) ); | ||
708 | *a = c; | ||
709 | } | ||
710 | } | ||
711 | |||
712 | /* | ||
713 | Merges two disjoint QMaps of (int, int) pairs and puts the result into the | ||
714 | first one. | ||
715 | */ | ||
716 | static void mergeInto( QMap<int, int> *a, const QMap<int, int>& b ) | ||
717 | { | ||
718 | QMap<int, int>::ConstIterator it; | ||
719 | for ( it = b.begin(); it != b.end(); ++it ) | ||
720 | a->insert( it.key(), *it ); | ||
721 | } | ||
722 | |||
723 | /* | ||
724 | Returns the value associated to key k in QMap m of (int, int) pairs, or 0 if | ||
725 | no such value is explicitly present. | ||
726 | */ | ||
727 | static int at( const QMap<int, int>& m, int k ) | ||
728 | { | ||
729 | QMap<int, int>::ConstIterator it = m.find( k ); | ||
730 | if ( it == m.end() ) | ||
731 | return 0; | ||
732 | else | ||
733 | return *it; | ||
734 | } | ||
735 | |||
736 | #ifndef QT_NO_REGEXP_WILDCARD | ||
737 | /* | ||
738 | Translates a wildcard pattern to an equivalent regular expression pattern | ||
739 | (e.g., *.cpp to .*\.cpp). | ||
740 | */ | ||
741 | static QString wc2rx( const QString& wc ) | ||
742 | { | ||
743 | int wclen = wc.length(); | ||
744 | QString rx = QString::fromLatin1( "" ); | ||
745 | int i = 0; | ||
746 | while ( i < wclen ) { | ||
747 | QChar c = wc[i++]; | ||
748 | switch ( c.unicode() ) { | ||
749 | case '*': | ||
750 | rx += QString::fromLatin1( ".*" ); | ||
751 | break; | ||
752 | case '?': | ||
753 | rx += QChar( '.' ); | ||
754 | break; | ||
755 | case '$': | ||
756 | case '(': | ||
757 | case ')': | ||
758 | case '+': | ||
759 | case '.': | ||
760 | case '\\': | ||
761 | case '^': | ||
762 | case '{': | ||
763 | case '|': | ||
764 | case '}': | ||
765 | rx += QChar( '\\' ); | ||
766 | rx += c; | ||
767 | break; | ||
768 | case '[': | ||
769 | rx += c; | ||
770 | if ( wc[i] == QChar('^') ) | ||
771 | rx += wc[i++]; | ||
772 | if ( i < wclen ) { | ||
773 | if ( rx[i] == ']' ) | ||
774 | rx += wc[i++]; | ||
775 | while ( i < wclen && wc[i] != QChar(']') ) { | ||
776 | if ( wc[i] == '\\' ) | ||
777 | rx += QChar( '\\' ); | ||
778 | rx += wc[i++]; | ||
779 | } | ||
780 | } | ||
781 | break; | ||
782 | default: | ||
783 | rx += c; | ||
784 | } | ||
785 | } | ||
786 | return rx; | ||
787 | } | ||
788 | #endif | ||
789 | |||
790 | /* | ||
791 | The class QRegExpEngine encapsulates a modified nondeterministic finite | ||
792 | automaton (NFA). | ||
793 | */ | ||
794 | class QRegExpEngine : public QShared | ||
795 | { | ||
796 | public: | ||
797 | #ifndef QT_NO_REGEXP_CCLASS | ||
798 | /* | ||
799 | The class CharClass represents a set of characters, such as can be found | ||
800 | in regular expressions (e.g., [a-z] denotes the set {a, b, ..., z}). | ||
801 | */ | ||
802 | class CharClass | ||
803 | { | ||
804 | public: | ||
805 | CharClass(); | ||
806 | CharClass( const CharClass& cc ) { operator=( cc ); } | ||
807 | |||
808 | CharClass& operator=( const CharClass& cc ); | ||
809 | |||
810 | void clear(); | ||
811 | bool negative() const { return n; } | ||
812 | void setNegative( bool negative ); | ||
813 | void addCategories( int cats ); | ||
814 | void addRange( ushort from, ushort to ); | ||
815 | void addSingleton( ushort ch ) { addRange( ch, ch ); } | ||
816 | |||
817 | bool in( QChar ch ) const; | ||
818 | #ifndef QT_NO_REGEXP_OPTIM | ||
819 | const QArray<int>& firstOccurrence() const { return occ1; } | ||
820 | #endif | ||
821 | |||
822 | #if defined(QT_DEBUG) | ||
823 | void dump() const; | ||
824 | #endif | ||
825 | |||
826 | private: | ||
827 | /* | ||
828 | The struct Range represents a range of characters (e.g., [0-9] denotes | ||
829 | range 48 to 57). | ||
830 | */ | ||
831 | struct Range | ||
832 | { | ||
833 | ushort from; // 48 | ||
834 | ushort to; // 57 | ||
835 | }; | ||
836 | |||
837 | int c; // character classes | ||
838 | QArray<Range> r; // character ranges | ||
839 | bool n; // negative? | ||
840 | #ifndef QT_NO_REGEXP_OPTIM | ||
841 | QArray<int> occ1; // first-occurrence array | ||
842 | #endif | ||
843 | }; | ||
844 | #else | ||
845 | struct CharClass | ||
846 | { | ||
847 | int x; // dummy | ||
848 | |||
849 | #ifndef QT_NO_REGEXP_OPTIM | ||
850 | const QArray<int>& firstOccurrence() const { | ||
851 | return *firstOccurrenceAtZero; | ||
852 | } | ||
853 | #endif | ||
854 | }; | ||
855 | #endif | ||
856 | |||
857 | QRegExpEngine( bool caseSensitive ) { setup( caseSensitive ); } | ||
858 | QRegExpEngine( const QString& rx, bool caseSensitive ); | ||
859 | #ifndef QT_NO_REGEXP_OPTIM | ||
860 | ~QRegExpEngine(); | ||
861 | #endif | ||
862 | |||
863 | bool isValid() const { return valid; } | ||
864 | bool caseSensitive() const { return cs; } | ||
865 | int numCaptures() const { return realncap; } | ||
866 | QArray<int> match( const QString& str, int pos, bool minimal, | ||
867 | bool oneTest ); | ||
868 | int matchedLength() const { return mmMatchedLen; } | ||
869 | |||
870 | int createState( QChar ch ); | ||
871 | int createState( const CharClass& cc ); | ||
872 | #ifndef QT_NO_REGEXP_BACKREF | ||
873 | int createState( int bref ); | ||
874 | #endif | ||
875 | |||
876 | void addCatTransitions( const QArray<int>& from, const QArray<int>& to ); | ||
877 | #ifndef QT_NO_REGEXP_CAPTURE | ||
878 | void addPlusTransitions( const QArray<int>& from, const QArray<int>& to, | ||
879 | int atom ); | ||
880 | #endif | ||
881 | |||
882 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | ||
883 | int anchorAlternation( int a, int b ); | ||
884 | int anchorConcatenation( int a, int b ); | ||
885 | #else | ||
886 | int anchorAlternation( int a, int b ) { return a & b; } | ||
887 | int anchorConcatenation( int a, int b ) { return a | b; } | ||
888 | #endif | ||
889 | void addAnchors( int from, int to, int a ); | ||
890 | |||
891 | #ifndef QT_NO_REGEXP_OPTIM | ||
892 | void setupGoodStringHeuristic( int earlyStart, int lateStart, | ||
893 | const QString& str ); | ||
894 | void setupBadCharHeuristic( int minLen, const QArray<int>& firstOcc ); | ||
895 | void heuristicallyChooseHeuristic(); | ||
896 | #endif | ||
897 | |||
898 | #if defined(QT_DEBUG) | ||
899 | void dump() const; | ||
900 | #endif | ||
901 | |||
902 | private: | ||
903 | enum { CharClassBit = 0x10000, BackRefBit = 0x20000 }; | ||
904 | |||
905 | /* | ||
906 | The struct State represents one state in a modified NFA. The input | ||
907 | characters matched are stored in the state instead of on the transitions, | ||
908 | something possible for an automaton constructed from a regular expression. | ||
909 | */ | ||
910 | struct State | ||
911 | { | ||
912 | #ifndef QT_NO_REGEXP_CAPTURE | ||
913 | int atom; // which atom does this state belong to? | ||
914 | #endif | ||
915 | int match; // what does it match? (see CharClassBit and BackRefBit) | ||
916 | QArray<int> outs; // out-transitions | ||
917 | QMap<int, int> *reenter; // atoms reentered when transiting out | ||
918 | QMap<int, int> *anchors; // anchors met when transiting out | ||
919 | |||
920 | #ifndef QT_NO_REGEXP_CAPTURE | ||
921 | State( int a, int m ) | ||
922 | : atom( a ), match( m ), reenter( 0 ), anchors( 0 ) { } | ||
923 | #else | ||
924 | State( int m ) | ||
925 | : match( m ), reenter( 0 ), anchors( 0 ) { } | ||
926 | #endif | ||
927 | ~State() { delete reenter; delete anchors; } | ||
928 | }; | ||
929 | |||
930 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
931 | /* | ||
932 | The struct Lookahead represents a lookahead a la Perl (e.g., (?=foo) and | ||
933 | (?!bar)). | ||
934 | */ | ||
935 | struct Lookahead | ||
936 | { | ||
937 | QRegExpEngine *eng; // NFA representing the embedded regular expression | ||
938 | bool neg; // negative lookahead? | ||
939 | |||
940 | Lookahead( QRegExpEngine *eng0, bool neg0 ) | ||
941 | : eng( eng0 ), neg( neg0 ) { } | ||
942 | ~Lookahead() { delete eng; } | ||
943 | }; | ||
944 | #endif | ||
945 | |||
946 | #ifndef QT_NO_REGEXP_CAPTURE | ||
947 | /* | ||
948 | The struct Atom represents one node in the hierarchy of regular expression | ||
949 | atoms. | ||
950 | */ | ||
951 | struct Atom | ||
952 | { | ||
953 | int parent; // index of parent in array of atoms | ||
954 | int capture; // index of capture, from 1 to ncap | ||
955 | }; | ||
956 | #endif | ||
957 | |||
958 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | ||
959 | /* | ||
960 | The struct AnchorAlternation represents a pair of anchors with OR | ||
961 | semantics. | ||
962 | */ | ||
963 | struct AnchorAlternation | ||
964 | { | ||
965 | int a; // this anchor... | ||
966 | int b; // ...or this one | ||
967 | }; | ||
968 | #endif | ||
969 | |||
970 | enum { InitialState = 0, FinalState = 1 }; | ||
971 | void setup( bool caseSensitive ); | ||
972 | int setupState( int match ); | ||
973 | |||
974 | /* | ||
975 | Let's hope that 13 lookaheads and 14 back-references are enough. | ||
976 | */ | ||
977 | enum { MaxLookaheads = 13, MaxBackRefs = 14 }; | ||
978 | enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, | ||
979 | Anchor_Word = 0x00000004, Anchor_NonWord = 0x00000008, | ||
980 | Anchor_FirstLookahead = 0x00000010, | ||
981 | Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads, | ||
982 | Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1, | ||
983 | Anchor_Alternation = Anchor_BackRef1Empty << MaxBackRefs, | ||
984 | |||
985 | Anchor_LookaheadMask = ( Anchor_FirstLookahead - 1 ) ^ | ||
986 | ( (Anchor_FirstLookahead << MaxLookaheads) - 1 ) }; | ||
987 | #ifndef QT_NO_REGEXP_CAPTURE | ||
988 | int startAtom( bool capture ); | ||
989 | void finishAtom( int atom ) { cf = f[atom].parent; } | ||
990 | #endif | ||
991 | |||
992 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
993 | int addLookahead( QRegExpEngine *eng, bool negative ); | ||
994 | #endif | ||
995 | |||
996 | #ifndef QT_NO_REGEXP_CAPTURE | ||
997 | bool isBetterCapture( const int *begin1, const int *end1, const int *begin2, | ||
998 | const int *end2 ); | ||
999 | #endif | ||
1000 | bool testAnchor( int i, int a, const int *capBegin ); | ||
1001 | |||
1002 | #ifndef QT_NO_REGEXP_OPTIM | ||
1003 | bool goodStringMatch(); | ||
1004 | bool badCharMatch(); | ||
1005 | #else | ||
1006 | bool bruteMatch(); | ||
1007 | #endif | ||
1008 | bool matchHere(); | ||
1009 | |||
1010 | QVector<State> s; // array of states | ||
1011 | int ns; // number of states | ||
1012 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1013 | QArray<Atom> f; // atom hierarchy | ||
1014 | int nf; // number of atoms | ||
1015 | int cf; // current atom | ||
1016 | #endif | ||
1017 | int realncap; // number of captures, seen from the outside | ||
1018 | int ncap; // number of captures, seen from the inside | ||
1019 | #ifndef QT_NO_REGEXP_CCLASS | ||
1020 | QVector<CharClass> cl; // array of character classes | ||
1021 | #endif | ||
1022 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
1023 | QVector<Lookahead> ahead; // array of lookaheads | ||
1024 | #endif | ||
1025 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | ||
1026 | QArray<AnchorAlternation> aa; // array of (a, b) pairs of anchors | ||
1027 | #endif | ||
1028 | #ifndef QT_NO_REGEXP_OPTIM | ||
1029 | bool caretAnchored; // does the regexp start with ^? | ||
1030 | #endif | ||
1031 | bool valid; // is the regular expression valid? | ||
1032 | bool cs; // case sensitive? | ||
1033 | #ifndef QT_NO_REGEXP_BACKREF | ||
1034 | int nbrefs; // number of back-references | ||
1035 | #endif | ||
1036 | |||
1037 | #ifndef QT_NO_REGEXP_OPTIM | ||
1038 | bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch | ||
1039 | |||
1040 | int goodEarlyStart; // the index where goodStr can first occur in a match | ||
1041 | int goodLateStart; // the index where goodStr can last occur in a match | ||
1042 | QString goodStr; // the string that any match has to contain | ||
1043 | |||
1044 | int minl; // the minimum length of a match | ||
1045 | QArray<int> occ1; // first-occurrence array | ||
1046 | #endif | ||
1047 | |||
1048 | /* | ||
1049 | The class Box is an abstraction for a regular expression fragment. It can | ||
1050 | also be seen as one node in the syntax tree of a regular expression with | ||
1051 | synthetized attributes. | ||
1052 | |||
1053 | It's interface is ugly for performance reasons. | ||
1054 | */ | ||
1055 | class Box | ||
1056 | { | ||
1057 | public: | ||
1058 | Box( QRegExpEngine *engine ); | ||
1059 | Box( const Box& b ) { operator=( b ); } | ||
1060 | |||
1061 | Box& operator=( const Box& b ); | ||
1062 | |||
1063 | void clear() { operator=(Box(eng)); } | ||
1064 | void set( QChar ch ); | ||
1065 | void set( const CharClass& cc ); | ||
1066 | #ifndef QT_NO_REGEXP_BACKREF | ||
1067 | void set( int bref ); | ||
1068 | #endif | ||
1069 | |||
1070 | void cat( const Box& b ); | ||
1071 | void orx( const Box& b ); | ||
1072 | void plus( int atom ); | ||
1073 | void opt(); | ||
1074 | void catAnchor( int a ); | ||
1075 | #ifndef QT_NO_REGEXP_OPTIM | ||
1076 | void setupHeuristics(); | ||
1077 | #endif | ||
1078 | |||
1079 | #if defined(QT_DEBUG) | ||
1080 | void dump() const; | ||
1081 | #endif | ||
1082 | |||
1083 | private: | ||
1084 | void addAnchorsToEngine( const Box& to ) const; | ||
1085 | |||
1086 | QRegExpEngine *eng; // the automaton under construction | ||
1087 | QArray<int> ls; // the left states (firstpos) | ||
1088 | QArray<int> rs; // the right states (lastpos) | ||
1089 | QMap<int, int> lanchors; // the left anchors | ||
1090 | QMap<int, int> ranchors; // the right anchors | ||
1091 | int skipanchors; // the anchors to match if the box is skipped | ||
1092 | |||
1093 | #ifndef QT_NO_REGEXP_OPTIM | ||
1094 | int earlyStart; // the index where str can first occur | ||
1095 | int lateStart; // the index where str can last occur | ||
1096 | QString str; // a string that has to occur in any match | ||
1097 | QString leftStr; // a string occurring at the left of this box | ||
1098 | QString rightStr; // a string occurring at the right of this box | ||
1099 | int maxl; // the maximum length of this box (possibly InftyLen) | ||
1100 | #endif | ||
1101 | |||
1102 | int minl; // the minimum length of this box | ||
1103 | #ifndef QT_NO_REGEXP_OPTIM | ||
1104 | QArray<int> occ1; // first-occurrence array | ||
1105 | #endif | ||
1106 | }; | ||
1107 | friend class Box; | ||
1108 | |||
1109 | /* | ||
1110 | This is the lexical analyzer for regular expressions. | ||
1111 | */ | ||
1112 | enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, | ||
1113 | Tok_PosLookahead, Tok_NegLookahead, Tok_RightParen, Tok_CharClass, | ||
1114 | Tok_Caret, Tok_Quantifier, Tok_Bar, Tok_Word, Tok_NonWord, | ||
1115 | Tok_Char = 0x10000, Tok_BackRef = 0x20000 }; | ||
1116 | int getChar(); | ||
1117 | int getEscape(); | ||
1118 | #ifndef QT_NO_REGEXP_INTERVAL | ||
1119 | int getRep( int def ); | ||
1120 | #endif | ||
1121 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
1122 | void skipChars( int n ); | ||
1123 | #endif | ||
1124 | void startTokenizer( const QChar *rx, int len ); | ||
1125 | int getToken(); | ||
1126 | |||
1127 | const QChar *yyIn; // a pointer to the input regular expression pattern | ||
1128 | int yyPos0; // the position of yyTok in the input pattern | ||
1129 | int yyPos; // the position of the next character to read | ||
1130 | int yyLen; // the length of yyIn | ||
1131 | int yyCh; // the last character read | ||
1132 | CharClass *yyCharClass; // attribute for Tok_CharClass tokens | ||
1133 | int yyMinRep; // attribute for Tok_Quantifier | ||
1134 | int yyMaxRep; // ditto | ||
1135 | bool yyError; // syntax error or overflow during parsing? | ||
1136 | |||
1137 | /* | ||
1138 | This is the syntactic analyzer for regular expressions. | ||
1139 | */ | ||
1140 | int parse( const QChar *rx, int len ); | ||
1141 | void parseAtom( Box *box ); | ||
1142 | void parseFactor( Box *box ); | ||
1143 | void parseTerm( Box *box ); | ||
1144 | void parseExpression( Box *box ); | ||
1145 | |||
1146 | int yyTok; // the last token read | ||
1147 | bool yyMayCapture; // set this to FALSE to disable capturing | ||
1148 | |||
1149 | /* | ||
1150 | This is the engine state during matching. | ||
1151 | */ | ||
1152 | const QString *mmStr; // a pointer to the input QString | ||
1153 | const QChar *mmIn; // a pointer to the input string data | ||
1154 | int mmPos; // the current position in the string | ||
1155 | int mmLen; // the length of the input string | ||
1156 | bool mmMinimal; // minimal matching? | ||
1157 | QArray<int> mmCaptured; // an array of pairs (start, len) | ||
1158 | QArray<int> mmCapturedNoMatch; // an array of pairs (-1, -1) | ||
1159 | QArray<int> mmBigArray; // big QArray<int> array | ||
1160 | int *mmInNextStack; // is state is mmNextStack? | ||
1161 | int *mmCurStack; // stack of current states | ||
1162 | int *mmNextStack; // stack of next states | ||
1163 | int *mmCurCapBegin; // start of current states' captures | ||
1164 | int *mmNextCapBegin; // start of next states' captures | ||
1165 | int *mmCurCapEnd; // end of current states' captures | ||
1166 | int *mmNextCapEnd; // end of next states' captures | ||
1167 | int *mmTempCapBegin; // start of temporary captures | ||
1168 | int *mmTempCapEnd; // end of temporary captures | ||
1169 | int *mmCapBegin; // start of captures for a next state | ||
1170 | int *mmCapEnd; // end of captures for a next state | ||
1171 | int *mmSlideTab; // bump-along slide table for bad-character heuristic | ||
1172 | int mmSlideTabSize; // size of slide table | ||
1173 | #ifndef QT_NO_REGEXP_BACKREF | ||
1174 | QIntDict<int> mmSleeping; // dictionary of back-reference sleepers | ||
1175 | #endif | ||
1176 | int mmMatchedLen; // length of match or of matched string for partial match | ||
1177 | }; | ||
1178 | |||
1179 | QRegExpEngine::QRegExpEngine( const QString& rx, bool caseSensitive ) | ||
1180 | #ifndef QT_NO_REGEXP_BACKREF | ||
1181 | : mmSleeping( 101 ) | ||
1182 | #endif | ||
1183 | { | ||
1184 | setup( caseSensitive ); | ||
1185 | valid = ( parse(rx.unicode(), rx.length()) == (int) rx.length() ); | ||
1186 | } | ||
1187 | |||
1188 | #ifndef QT_NO_REGEXP_OPTIM | ||
1189 | QRegExpEngine::~QRegExpEngine() | ||
1190 | { | ||
1191 | if ( --engCount == 0 ) { | ||
1192 | delete noOccurrences; | ||
1193 | noOccurrences = 0; | ||
1194 | delete firstOccurrenceAtZero; | ||
1195 | firstOccurrenceAtZero = 0; | ||
1196 | } | ||
1197 | } | ||
1198 | #endif | ||
1199 | |||
1200 | /* | ||
1201 | Tries to match in str and returns an array of (begin, length) pairs for | ||
1202 | captured text. If there is no match, all pairs are (-1, -1). | ||
1203 | */ | ||
1204 | QArray<int> QRegExpEngine::match( const QString& str, int pos, bool minimal, | ||
1205 | bool oneTest ) | ||
1206 | { | ||
1207 | mmStr = &str; | ||
1208 | mmIn = str.unicode(); | ||
1209 | if ( mmIn == 0 ) | ||
1210 | mmIn = &QChar::null; | ||
1211 | mmPos = pos; | ||
1212 | mmLen = str.length(); | ||
1213 | mmMinimal = minimal; | ||
1214 | mmMatchedLen = 0; | ||
1215 | |||
1216 | bool matched = FALSE; | ||
1217 | if ( valid && mmPos >= 0 && mmPos <= mmLen ) { | ||
1218 | #ifndef QT_NO_REGEXP_OPTIM | ||
1219 | if ( mmPos <= mmLen - minl ) { | ||
1220 | if ( caretAnchored || oneTest ) | ||
1221 | matched = matchHere(); | ||
1222 | else if ( useGoodStringHeuristic ) | ||
1223 | matched = goodStringMatch(); | ||
1224 | else | ||
1225 | matched = badCharMatch(); | ||
1226 | } | ||
1227 | #else | ||
1228 | matched = oneTest ? matchHere() : bruteMatch(); | ||
1229 | #endif | ||
1230 | } | ||
1231 | |||
1232 | if ( matched ) { | ||
1233 | mmCaptured.detach(); | ||
1234 | mmCaptured[0] = mmPos; | ||
1235 | mmCaptured[1] = mmMatchedLen; | ||
1236 | for ( int j = 0; j < realncap; j++ ) { | ||
1237 | int len = mmCapEnd[j] - mmCapBegin[j]; | ||
1238 | mmCaptured[2 + 2 * j] = len > 0 ? mmPos + mmCapBegin[j] : 0; | ||
1239 | mmCaptured[2 + 2 * j + 1] = len; | ||
1240 | } | ||
1241 | return mmCaptured; | ||
1242 | } else { | ||
1243 | return mmCapturedNoMatch; | ||
1244 | } | ||
1245 | } | ||
1246 | |||
1247 | /* | ||
1248 | The three following functions add one state to the automaton and return the | ||
1249 | number of the state. | ||
1250 | */ | ||
1251 | |||
1252 | int QRegExpEngine::createState( QChar ch ) | ||
1253 | { | ||
1254 | return setupState( ch.unicode() ); | ||
1255 | } | ||
1256 | |||
1257 | int QRegExpEngine::createState( const CharClass& cc ) | ||
1258 | { | ||
1259 | #ifndef QT_NO_REGEXP_CCLASS | ||
1260 | int n = cl.size(); | ||
1261 | cl.resize( n + 1 ); | ||
1262 | cl.insert( n, new CharClass(cc) ); | ||
1263 | return setupState( CharClassBit | n ); | ||
1264 | #else | ||
1265 | Q_UNUSED( cc ); | ||
1266 | return setupState( CharClassBit ); | ||
1267 | #endif | ||
1268 | } | ||
1269 | |||
1270 | #ifndef QT_NO_REGEXP_BACKREF | ||
1271 | int QRegExpEngine::createState( int bref ) | ||
1272 | { | ||
1273 | if ( bref > nbrefs ) { | ||
1274 | nbrefs = bref; | ||
1275 | if ( nbrefs > MaxBackRefs ) { | ||
1276 | yyError = TRUE; | ||
1277 | return 0; | ||
1278 | } | ||
1279 | } | ||
1280 | return setupState( BackRefBit | bref ); | ||
1281 | } | ||
1282 | #endif | ||
1283 | |||
1284 | /* | ||
1285 | The two following functions add a transition between all pairs of states | ||
1286 | (i, j) where i is fond in from, and j is found in to. | ||
1287 | |||
1288 | Cat-transitions are distinguished from plus-transitions for capturing. | ||
1289 | */ | ||
1290 | |||
1291 | void QRegExpEngine::addCatTransitions( const QArray<int>& from, | ||
1292 | const QArray<int>& to ) | ||
1293 | { | ||
1294 | for ( int i = 0; i < (int) from.size(); i++ ) { | ||
1295 | State *st = s[from[i]]; | ||
1296 | mergeInto( &st->outs, to ); | ||
1297 | } | ||
1298 | } | ||
1299 | |||
1300 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1301 | void QRegExpEngine::addPlusTransitions( const QArray<int>& from, | ||
1302 | const QArray<int>& to, int atom ) | ||
1303 | { | ||
1304 | for ( int i = 0; i < (int) from.size(); i++ ) { | ||
1305 | State *st = s[from[i]]; | ||
1306 | QArray<int> oldOuts = st->outs.copy(); | ||
1307 | mergeInto( &st->outs, to ); | ||
1308 | if ( f[atom].capture >= 0 ) { | ||
1309 | if ( st->reenter == 0 ) | ||
1310 | st->reenter = new QMap<int, int>; | ||
1311 | for ( int j = 0; j < (int) to.size(); j++ ) { | ||
1312 | if ( !st->reenter->contains(to[j]) && | ||
1313 | oldOuts.bsearch(to[j]) < 0 ) | ||
1314 | st->reenter->insert( to[j], atom ); | ||
1315 | } | ||
1316 | } | ||
1317 | } | ||
1318 | } | ||
1319 | #endif | ||
1320 | |||
1321 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | ||
1322 | /* | ||
1323 | Returns an anchor that means a OR b. | ||
1324 | */ | ||
1325 | int QRegExpEngine::anchorAlternation( int a, int b ) | ||
1326 | { | ||
1327 | if ( ((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0 ) | ||
1328 | return a & b; | ||
1329 | |||
1330 | int n = aa.size(); | ||
1331 | aa.resize( n + 1 ); | ||
1332 | aa[n].a = a; | ||
1333 | aa[n].b = b; | ||
1334 | return Anchor_Alternation | n; | ||
1335 | } | ||
1336 | |||
1337 | /* | ||
1338 | Returns an anchor that means a AND b. | ||
1339 | */ | ||
1340 | int QRegExpEngine::anchorConcatenation( int a, int b ) | ||
1341 | { | ||
1342 | if ( ((a | b) & Anchor_Alternation) == 0 ) | ||
1343 | return a | b; | ||
1344 | if ( (b & Anchor_Alternation) != 0 ) | ||
1345 | qSwap( a, b ); | ||
1346 | int aprime = anchorConcatenation( aa[a ^ Anchor_Alternation].a, b ); | ||
1347 | int bprime = anchorConcatenation( aa[a ^ Anchor_Alternation].b, b ); | ||
1348 | return anchorAlternation( aprime, bprime ); | ||
1349 | } | ||
1350 | #endif | ||
1351 | |||
1352 | /* | ||
1353 | Adds anchor a on a transition caracterised by its from state and its to state. | ||
1354 | */ | ||
1355 | void QRegExpEngine::addAnchors( int from, int to, int a ) | ||
1356 | { | ||
1357 | State *st = s[from]; | ||
1358 | if ( st->anchors == 0 ) | ||
1359 | st->anchors = new QMap<int, int>; | ||
1360 | if ( st->anchors->contains(to) ) | ||
1361 | a = anchorAlternation( (*st->anchors)[to], a ); | ||
1362 | st->anchors->insert( to, a ); | ||
1363 | } | ||
1364 | |||
1365 | #ifndef QT_NO_REGEXP_OPTIM | ||
1366 | /* | ||
1367 | The two following functions provide the engine with the information needed by | ||
1368 | its matching heuristics. | ||
1369 | */ | ||
1370 | |||
1371 | void QRegExpEngine::setupGoodStringHeuristic( int earlyStart, int lateStart, | ||
1372 | const QString& str ) | ||
1373 | { | ||
1374 | goodEarlyStart = earlyStart; | ||
1375 | goodLateStart = lateStart; | ||
1376 | goodStr = cs ? str : str.lower(); | ||
1377 | } | ||
1378 | |||
1379 | void QRegExpEngine::setupBadCharHeuristic( int minLen, | ||
1380 | const QArray<int>& firstOcc ) | ||
1381 | { | ||
1382 | minl = minLen; | ||
1383 | occ1 = cs ? firstOcc : *firstOccurrenceAtZero; | ||
1384 | } | ||
1385 | |||
1386 | /* | ||
1387 | This function chooses between the good-string and the bad-character | ||
1388 | heuristics. It computes two scores and chooses the heuristic with the highest | ||
1389 | score. | ||
1390 | |||
1391 | Here are some common-sense constraints on the scores that should be respected | ||
1392 | if the formulas are ever modified: (1) If goodStr is empty, the good-string | ||
1393 | heuristic scores 0. (2) If the search is case insensitive, the good-string | ||
1394 | heuristic should be used, unless it scores 0. (Case insensitivity | ||
1395 | turns all entries of occ1 to 0.) (3) If (goodLateStart - goodEarlyStart) is | ||
1396 | big, the good-string heuristic should score less. | ||
1397 | */ | ||
1398 | void QRegExpEngine::heuristicallyChooseHeuristic() | ||
1399 | { | ||
1400 | int i; | ||
1401 | |||
1402 | if ( minl == 0 ) | ||
1403 | return; | ||
1404 | |||
1405 | /* | ||
1406 | Magic formula: The good string has to constitute a good proportion of the | ||
1407 | minimum-length string, and appear at a more-or-less known index. | ||
1408 | */ | ||
1409 | int goodStringScore = ( 64 * goodStr.length() / minl ) - | ||
1410 | ( goodLateStart - goodEarlyStart ); | ||
1411 | |||
1412 | /* | ||
1413 | Less magic formula: We pick a couple of characters at random, and check | ||
1414 | whether they are good or bad. | ||
1415 | */ | ||
1416 | int badCharScore = 0; | ||
1417 | int step = QMAX( 1, NumBadChars / 32 ); | ||
1418 | for ( i = 1; i < NumBadChars; i += step ) { | ||
1419 | if ( occ1[i] == NoOccurrence ) | ||
1420 | badCharScore += minl; | ||
1421 | else | ||
1422 | badCharScore += occ1[i]; | ||
1423 | } | ||
1424 | badCharScore /= minl; | ||
1425 | |||
1426 | useGoodStringHeuristic = ( goodStringScore > badCharScore ); | ||
1427 | } | ||
1428 | #endif | ||
1429 | |||
1430 | #if defined(QT_DEBUG) | ||
1431 | void QRegExpEngine::dump() const | ||
1432 | { | ||
1433 | int i, j; | ||
1434 | qDebug( "Case %ssensitive engine", cs ? "" : "in" ); | ||
1435 | qDebug( " States" ); | ||
1436 | for ( i = 0; i < ns; i++ ) { | ||
1437 | qDebug( " %d%s", i, | ||
1438 | i == InitialState ? " (initial)" : | ||
1439 | i == FinalState ? " (final)" : "" ); | ||
1440 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1441 | qDebug( " in atom %d", s[i]->atom ); | ||
1442 | #endif | ||
1443 | int m = s[i]->match; | ||
1444 | if ( (m & CharClassBit) != 0 ) { | ||
1445 | qDebug( " match character class %d", m ^ CharClassBit ); | ||
1446 | #ifndef QT_NO_REGEXP_CCLASS | ||
1447 | cl[m ^ CharClassBit]->dump(); | ||
1448 | #else | ||
1449 | qDebug( " negative character class" ); | ||
1450 | #endif | ||
1451 | } else if ( (m & BackRefBit) != 0 ) { | ||
1452 | qDebug( " match back-reference %d", m ^ BackRefBit ); | ||
1453 | } else if ( m >= 0x20 && m <= 0x7e ) { | ||
1454 | qDebug( " match 0x%.4x (%c)", m, m ); | ||
1455 | } else { | ||
1456 | qDebug( " match 0x%.4x", m ); | ||
1457 | } | ||
1458 | for ( j = 0; j < (int) s[i]->outs.size(); j++ ) { | ||
1459 | int next = s[i]->outs[j]; | ||
1460 | qDebug( " -> %d", next ); | ||
1461 | if ( s[i]->reenter != 0 && s[i]->reenter->contains(next) ) | ||
1462 | qDebug( " [reenter %d]", (*s[i]->reenter)[next] ); | ||
1463 | if ( s[i]->anchors != 0 && at(*s[i]->anchors, next) != 0 ) | ||
1464 | qDebug( " [anchors 0x%.8x]", (*s[i]->anchors)[next] ); | ||
1465 | } | ||
1466 | } | ||
1467 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1468 | if ( nf > 0 ) { | ||
1469 | qDebug( " Atom Parent Capture" ); | ||
1470 | for ( i = 0; i < nf; i++ ) | ||
1471 | qDebug( " %6d %6d %6d", i, f[i].parent, f[i].capture ); | ||
1472 | } | ||
1473 | #endif | ||
1474 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | ||
1475 | for ( i = 0; i < (int) aa.size(); i++ ) | ||
1476 | qDebug( " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, | ||
1477 | aa[i].b ); | ||
1478 | #endif | ||
1479 | } | ||
1480 | #endif | ||
1481 | |||
1482 | void QRegExpEngine::setup( bool caseSensitive ) | ||
1483 | { | ||
1484 | #ifndef QT_NO_REGEXP_OPTIM | ||
1485 | if ( engCount++ == 0 ) { | ||
1486 | noOccurrences = new QArray<int>( NumBadChars ); | ||
1487 | firstOccurrenceAtZero = new QArray<int>( NumBadChars ); | ||
1488 | noOccurrences->fill( NoOccurrence ); | ||
1489 | firstOccurrenceAtZero->fill( 0 ); | ||
1490 | } | ||
1491 | #endif | ||
1492 | s.setAutoDelete( TRUE ); | ||
1493 | s.resize( 32 ); | ||
1494 | ns = 0; | ||
1495 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1496 | f.resize( 32 ); | ||
1497 | nf = 0; | ||
1498 | cf = -1; | ||
1499 | #endif | ||
1500 | realncap = 0; | ||
1501 | ncap = 0; | ||
1502 | #ifndef QT_NO_REGEXP_CCLASS | ||
1503 | cl.setAutoDelete( TRUE ); | ||
1504 | #endif | ||
1505 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
1506 | ahead.setAutoDelete( TRUE ); | ||
1507 | #endif | ||
1508 | #ifndef QT_NO_REGEXP_OPTIM | ||
1509 | caretAnchored = TRUE; | ||
1510 | #endif | ||
1511 | valid = FALSE; | ||
1512 | cs = caseSensitive; | ||
1513 | #ifndef QT_NO_REGEXP_BACKREF | ||
1514 | nbrefs = 0; | ||
1515 | #endif | ||
1516 | #ifndef QT_NO_REGEXP_OPTIM | ||
1517 | useGoodStringHeuristic = FALSE; | ||
1518 | minl = 0; | ||
1519 | occ1 = *firstOccurrenceAtZero; | ||
1520 | #endif | ||
1521 | mmCapturedNoMatch.fill( -1, 2 ); | ||
1522 | } | ||
1523 | |||
1524 | int QRegExpEngine::setupState( int match ) | ||
1525 | { | ||
1526 | if ( (ns & (ns + 1)) == 0 && ns + 1 >= (int) s.size() ) | ||
1527 | s.resize( (ns + 1) << 1 ); | ||
1528 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1529 | s.insert( ns, new State(cf, match) ); | ||
1530 | #else | ||
1531 | s.insert( ns, new State(match) ); | ||
1532 | #endif | ||
1533 | return ns++; | ||
1534 | } | ||
1535 | |||
1536 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1537 | /* | ||
1538 | Functions startAtom() and finishAtom() should be called to delimit atoms. | ||
1539 | When a state is created, it is assigned to the current atom. The information | ||
1540 | is later used for capturing. | ||
1541 | */ | ||
1542 | int QRegExpEngine::startAtom( bool capture ) | ||
1543 | { | ||
1544 | if ( (nf & (nf + 1)) == 0 && nf + 1 >= (int) f.size() ) | ||
1545 | f.resize( (nf + 1) << 1 ); | ||
1546 | f[nf].parent = cf; | ||
1547 | cf = nf++; | ||
1548 | f[cf].capture = capture ? ncap++ : -1; | ||
1549 | return cf; | ||
1550 | } | ||
1551 | #endif | ||
1552 | |||
1553 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
1554 | /* | ||
1555 | Creates a lookahead anchor. | ||
1556 | */ | ||
1557 | int QRegExpEngine::addLookahead( QRegExpEngine *eng, bool negative ) | ||
1558 | { | ||
1559 | int n = ahead.size(); | ||
1560 | if ( n == MaxLookaheads ) { | ||
1561 | yyError = TRUE; | ||
1562 | return 0; | ||
1563 | } | ||
1564 | ahead.resize( n + 1 ); | ||
1565 | ahead.insert( n, new Lookahead(eng, negative) ); | ||
1566 | return Anchor_FirstLookahead << n; | ||
1567 | } | ||
1568 | #endif | ||
1569 | |||
1570 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1571 | /* | ||
1572 | We want the longest leftmost captures. | ||
1573 | */ | ||
1574 | bool QRegExpEngine::isBetterCapture( const int *begin1, const int *end1, | ||
1575 | const int *begin2, const int *end2 ) | ||
1576 | { | ||
1577 | for ( int i = 0; i < ncap; i++ ) { | ||
1578 | int delta = begin2[i] - begin1[i]; // it has to start early... | ||
1579 | if ( delta == 0 ) | ||
1580 | delta = end1[i] - end2[i]; // ...and end late (like a party) | ||
1581 | |||
1582 | if ( delta != 0 ) | ||
1583 | return delta > 0; | ||
1584 | } | ||
1585 | return FALSE; | ||
1586 | } | ||
1587 | #endif | ||
1588 | |||
1589 | /* | ||
1590 | Returns TRUE if anchor a matches at position mmPos + i in the input string, | ||
1591 | otherwise FALSE. | ||
1592 | */ | ||
1593 | bool QRegExpEngine::testAnchor( int i, int a, const int *capBegin ) | ||
1594 | { | ||
1595 | int j; | ||
1596 | |||
1597 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | ||
1598 | if ( (a & Anchor_Alternation) != 0 ) { | ||
1599 | return testAnchor( i, aa[a ^ Anchor_Alternation].a, capBegin ) || | ||
1600 | testAnchor( i, aa[a ^ Anchor_Alternation].b, capBegin ); | ||
1601 | } | ||
1602 | #endif | ||
1603 | |||
1604 | if ( (a & Anchor_Caret) != 0 ) { | ||
1605 | if ( mmPos + i != 0 ) | ||
1606 | return FALSE; | ||
1607 | } | ||
1608 | if ( (a & Anchor_Dollar) != 0 ) { | ||
1609 | if ( mmPos + i != mmLen ) | ||
1610 | return FALSE; | ||
1611 | } | ||
1612 | #ifndef QT_NO_REGEXP_ESCAPE | ||
1613 | if ( (a & (Anchor_Word | Anchor_NonWord)) != 0 ) { | ||
1614 | bool before = FALSE, after = FALSE; | ||
1615 | if ( mmPos + i != 0 ) | ||
1616 | before = mmIn[mmPos + i - 1].isLetterOrNumber(); | ||
1617 | if ( mmPos + i != mmLen ) | ||
1618 | after = mmIn[mmPos + i].isLetterOrNumber(); | ||
1619 | if ( (a & Anchor_Word) != 0 && (before == after) ) | ||
1620 | return FALSE; | ||
1621 | if ( (a & Anchor_NonWord) != 0 && (before != after) ) | ||
1622 | return FALSE; | ||
1623 | } | ||
1624 | #endif | ||
1625 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
1626 | bool catchx = TRUE; | ||
1627 | |||
1628 | if ( (a & Anchor_LookaheadMask) != 0 ) { | ||
1629 | QConstString cstr = QConstString( (QChar *) mmIn + mmPos + i, | ||
1630 | mmLen - mmPos - i ); | ||
1631 | for ( j = 0; j < (int) ahead.size(); j++ ) { | ||
1632 | if ( (a & (Anchor_FirstLookahead << j)) != 0 ) { | ||
1633 | catchx = ( ahead[j]->eng->match(cstr.string(), 0, TRUE, | ||
1634 | TRUE)[0] == 0 ); | ||
1635 | if ( catchx == ahead[j]->neg ) | ||
1636 | return FALSE; | ||
1637 | } | ||
1638 | } | ||
1639 | } | ||
1640 | #endif | ||
1641 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1642 | #ifndef QT_NO_REGEXP_BACKREF | ||
1643 | for ( j = 0; j < nbrefs; j++ ) { | ||
1644 | if ( (a & (Anchor_BackRef1Empty << j)) != 0 ) { | ||
1645 | if ( capBegin[j] != EmptyCapture ) | ||
1646 | return FALSE; | ||
1647 | } | ||
1648 | } | ||
1649 | #endif | ||
1650 | #endif | ||
1651 | return TRUE; | ||
1652 | } | ||
1653 | |||
1654 | #ifndef QT_NO_REGEXP_OPTIM | ||
1655 | /* | ||
1656 | The three following functions are what Jeffrey Friedl would call transmissions | ||
1657 | (or bump-alongs). Using one or the other should make no difference, except | ||
1658 | in performance. | ||
1659 | */ | ||
1660 | |||
1661 | bool QRegExpEngine::goodStringMatch() | ||
1662 | { | ||
1663 | int k = mmPos + goodEarlyStart; | ||
1664 | |||
1665 | while ( (k = mmStr->find(goodStr, k, cs)) != -1 ) { | ||
1666 | int from = k - goodLateStart; | ||
1667 | int to = k - goodEarlyStart; | ||
1668 | if ( from > mmPos ) | ||
1669 | mmPos = from; | ||
1670 | |||
1671 | while ( mmPos <= to ) { | ||
1672 | if ( matchHere() ) | ||
1673 | return TRUE; | ||
1674 | mmPos++; | ||
1675 | } | ||
1676 | k++; | ||
1677 | } | ||
1678 | return FALSE; | ||
1679 | } | ||
1680 | |||
1681 | bool QRegExpEngine::badCharMatch() | ||
1682 | { | ||
1683 | int slideHead = 0; | ||
1684 | int slideNext = 0; | ||
1685 | int i; | ||
1686 | int lastPos = mmLen - minl; | ||
1687 | memset( mmSlideTab, 0, mmSlideTabSize * sizeof(int) ); | ||
1688 | |||
1689 | /* | ||
1690 | Set up the slide table, used for the bad-character heuristic, using | ||
1691 | the table of first occurrence of each character. | ||
1692 | */ | ||
1693 | for ( i = 0; i < minl; i++ ) { | ||
1694 | int sk = occ1[BadChar(mmIn[mmPos + i])]; | ||
1695 | if ( sk == NoOccurrence ) | ||
1696 | sk = i + 1; | ||
1697 | if ( sk > 0 ) { | ||
1698 | int k = i + 1 - sk; | ||
1699 | if ( k < 0 ) { | ||
1700 | sk = i + 1; | ||
1701 | k = 0; | ||
1702 | } | ||
1703 | if ( sk > mmSlideTab[k] ) | ||
1704 | mmSlideTab[k] = sk; | ||
1705 | } | ||
1706 | } | ||
1707 | |||
1708 | if ( mmPos > lastPos ) | ||
1709 | return FALSE; | ||
1710 | |||
1711 | while ( TRUE ) { | ||
1712 | if ( ++slideNext >= mmSlideTabSize ) | ||
1713 | slideNext = 0; | ||
1714 | if ( mmSlideTab[slideHead] > 0 ) { | ||
1715 | if ( mmSlideTab[slideHead] - 1 > mmSlideTab[slideNext] ) | ||
1716 | mmSlideTab[slideNext] = mmSlideTab[slideHead] - 1; | ||
1717 | mmSlideTab[slideHead] = 0; | ||
1718 | } else { | ||
1719 | if ( matchHere() ) | ||
1720 | return TRUE; | ||
1721 | } | ||
1722 | |||
1723 | if ( mmPos == lastPos ) | ||
1724 | break; | ||
1725 | |||
1726 | /* | ||
1727 | Update the slide table. This code has much in common with the | ||
1728 | initialization code. | ||
1729 | */ | ||
1730 | int sk = occ1[BadChar(mmIn[mmPos + minl])]; | ||
1731 | if ( sk == NoOccurrence ) { | ||
1732 | mmSlideTab[slideNext] = minl; | ||
1733 | } else if ( sk > 0 ) { | ||
1734 | int k = slideNext + minl - sk; | ||
1735 | if ( k >= mmSlideTabSize ) | ||
1736 | k -= mmSlideTabSize; | ||
1737 | if ( sk > mmSlideTab[k] ) | ||
1738 | mmSlideTab[k] = sk; | ||
1739 | } | ||
1740 | slideHead = slideNext; | ||
1741 | mmPos++; | ||
1742 | } | ||
1743 | return FALSE; | ||
1744 | } | ||
1745 | #else | ||
1746 | bool QRegExpEngine::bruteMatch() | ||
1747 | { | ||
1748 | while ( mmPos <= mmLen ) { | ||
1749 | if ( matchHere() ) | ||
1750 | return TRUE; | ||
1751 | mmPos++; | ||
1752 | } | ||
1753 | return FALSE; | ||
1754 | } | ||
1755 | #endif | ||
1756 | |||
1757 | /* | ||
1758 | Here's the core of the engine. It tries to do a match here and now. | ||
1759 | */ | ||
1760 | bool QRegExpEngine::matchHere() | ||
1761 | { | ||
1762 | int ncur = 1, nnext = 0; | ||
1763 | int i = 0, j, k, m; | ||
1764 | bool match = FALSE; | ||
1765 | |||
1766 | mmMatchedLen = -1; | ||
1767 | mmCurStack[0] = InitialState; | ||
1768 | |||
1769 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1770 | if ( ncap > 0 ) { | ||
1771 | for ( j = 0; j < ncap; j++ ) { | ||
1772 | mmCurCapBegin[j] = EmptyCapture; | ||
1773 | mmCurCapEnd[j] = EmptyCapture; | ||
1774 | } | ||
1775 | } | ||
1776 | #endif | ||
1777 | |||
1778 | #ifndef QT_NO_REGEXP_BACKREF | ||
1779 | int *zzZ = 0; | ||
1780 | |||
1781 | while ( (ncur > 0 || mmSleeping.count() > 0) && i <= mmLen - mmPos && | ||
1782 | !match ) | ||
1783 | #else | ||
1784 | while ( ncur > 0 && i <= mmLen - mmPos && !match ) | ||
1785 | #endif | ||
1786 | { | ||
1787 | int ch = ( i < mmLen - mmPos ) ? mmIn[mmPos + i].unicode() : 0; | ||
1788 | for ( j = 0; j < ncur; j++ ) { | ||
1789 | int cur = mmCurStack[j]; | ||
1790 | State *scur = s[cur]; | ||
1791 | QArray<int>& outs = scur->outs; | ||
1792 | for ( k = 0; k < (int) outs.size(); k++ ) { | ||
1793 | int next = outs[k]; | ||
1794 | State *snext = s[next]; | ||
1795 | bool in = TRUE; | ||
1796 | #ifndef QT_NO_REGEXP_BACKREF | ||
1797 | int needSomeSleep = 0; | ||
1798 | #endif | ||
1799 | |||
1800 | /* | ||
1801 | First, check if the anchors are anchored properly. | ||
1802 | */ | ||
1803 | if ( scur->anchors != 0 ) { | ||
1804 | int a = at( *scur->anchors, next ); | ||
1805 | if ( a != 0 && !testAnchor(i, a, mmCurCapBegin + j * ncap) ) | ||
1806 | in = FALSE; | ||
1807 | } | ||
1808 | /* | ||
1809 | If indeed they are, check if the input character is correct | ||
1810 | for this transition. | ||
1811 | */ | ||
1812 | if ( in ) { | ||
1813 | m = snext->match; | ||
1814 | if ( (m & (CharClassBit | BackRefBit)) == 0 ) { | ||
1815 | if ( cs ) | ||
1816 | in = ( m == ch ); | ||
1817 | else | ||
1818 | in = ( QChar(m).lower() == QChar(ch).lower() ); | ||
1819 | } else if ( next == FinalState ) { | ||
1820 | mmMatchedLen = i; | ||
1821 | match = mmMinimal; | ||
1822 | in = TRUE; | ||
1823 | } else if ( (m & CharClassBit) != 0 ) { | ||
1824 | #ifndef QT_NO_REGEXP_CCLASS | ||
1825 | const CharClass *cc = cl[m ^ CharClassBit]; | ||
1826 | if ( cs ) | ||
1827 | in = cc->in( ch ); | ||
1828 | else if ( cc->negative() ) | ||
1829 | in = cc->in( QChar(ch).lower() ) && | ||
1830 | cc->in( QChar(ch).upper() ); | ||
1831 | else | ||
1832 | in = cc->in( QChar(ch).lower() ) || | ||
1833 | cc->in( QChar(ch).upper() ); | ||
1834 | #endif | ||
1835 | #ifndef QT_NO_REGEXP_BACKREF | ||
1836 | } else { /* ( (m & BackRefBit) != 0 ) */ | ||
1837 | int bref = m ^ BackRefBit; | ||
1838 | int ell = j * ncap + ( bref - 1 ); | ||
1839 | |||
1840 | in = bref <= ncap && mmCurCapBegin[ell] != EmptyCapture; | ||
1841 | if ( in ) { | ||
1842 | if ( cs ) | ||
1843 | in = ( mmIn[mmPos + mmCurCapBegin[ell]] | ||
1844 | == QChar(ch) ); | ||
1845 | else | ||
1846 | in = ( mmIn[mmPos + mmCurCapBegin[ell]].lower() | ||
1847 | == QChar(ch).lower() ); | ||
1848 | } | ||
1849 | |||
1850 | if ( in ) { | ||
1851 | int delta; | ||
1852 | if ( mmCurCapEnd[ell] == EmptyCapture ) | ||
1853 | delta = i - mmCurCapBegin[ell]; | ||
1854 | else | ||
1855 | delta = mmCurCapEnd[ell] - mmCurCapBegin[ell]; | ||
1856 | |||
1857 | in = ( delta <= mmLen - mmPos ); | ||
1858 | if ( in && delta > 1 ) { | ||
1859 | int n; | ||
1860 | if ( cs ) { | ||
1861 | for ( n = 1; n < delta; n++ ) { | ||
1862 | if ( mmIn[mmPos + | ||
1863 | mmCurCapBegin[ell] + n] != | ||
1864 | mmIn[mmPos + i + n] ) | ||
1865 | break; | ||
1866 | } | ||
1867 | } else { | ||
1868 | for ( n = 1; n < delta; n++ ) { | ||
1869 | QChar a = mmIn[mmPos + | ||
1870 | mmCurCapBegin[ell] + n]; | ||
1871 | QChar b = mmIn[mmPos + i + n]; | ||
1872 | if ( a.lower() != b.lower() ) | ||
1873 | break; | ||
1874 | } | ||
1875 | } | ||
1876 | in = ( n == delta ); | ||
1877 | if ( in ) | ||
1878 | needSomeSleep = delta - 1; | ||
1879 | } | ||
1880 | } | ||
1881 | #endif | ||
1882 | } | ||
1883 | } | ||
1884 | |||
1885 | /* | ||
1886 | All is right. We must now update our data structures. | ||
1887 | */ | ||
1888 | if ( in ) { | ||
1889 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1890 | int *capBegin, *capEnd; | ||
1891 | #endif | ||
1892 | /* | ||
1893 | If the next state was not encountered yet, all is fine. | ||
1894 | */ | ||
1895 | if ( (m = mmInNextStack[next]) == -1 ) { | ||
1896 | m = nnext++; | ||
1897 | mmNextStack[m] = next; | ||
1898 | mmInNextStack[next] = m; | ||
1899 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1900 | capBegin = mmNextCapBegin + m * ncap; | ||
1901 | capEnd = mmNextCapEnd + m * ncap; | ||
1902 | |||
1903 | /* | ||
1904 | Otherwise, we'll first maintain captures in temporary | ||
1905 | arrays, and decide at the end whether it's best to keep | ||
1906 | the previous capture zones or the new ones. | ||
1907 | */ | ||
1908 | } else { | ||
1909 | capBegin = mmTempCapBegin; | ||
1910 | capEnd = mmTempCapEnd; | ||
1911 | #endif | ||
1912 | } | ||
1913 | |||
1914 | #ifndef QT_NO_REGEXP_CAPTURE | ||
1915 | /* | ||
1916 | Updating the capture zones is much of a task. | ||
1917 | */ | ||
1918 | if ( ncap > 0 ) { | ||
1919 | memcpy( capBegin, mmCurCapBegin + j * ncap, | ||
1920 | ncap * sizeof(int) ); | ||
1921 | memcpy( capEnd, mmCurCapEnd + j * ncap, | ||
1922 | ncap * sizeof(int) ); | ||
1923 | int c = scur->atom, n = snext->atom; | ||
1924 | int p = -1, q = -1; | ||
1925 | int cap; | ||
1926 | |||
1927 | /* | ||
1928 | Lemma 1. For any x in the range [0..nf), we have | ||
1929 | f[x].parent < x. | ||
1930 | |||
1931 | Proof. By looking at startAtom(), it is clear that | ||
1932 | cf < nf holds all the time, and thus that | ||
1933 | f[nf].parent < nf. | ||
1934 | */ | ||
1935 | |||
1936 | /* | ||
1937 | If we are reentering an atom, we empty all capture | ||
1938 | zones inside it. | ||
1939 | */ | ||
1940 | if ( scur->reenter != 0 && | ||
1941 | (q = at(*scur->reenter, next)) != 0 ) { | ||
1942 | QBitArray b; | ||
1943 | b.fill( FALSE, nf ); | ||
1944 | b.setBit( q, TRUE ); | ||
1945 | for ( int ell = q + 1; ell < nf; ell++ ) { | ||
1946 | if ( b.testBit(f[ell].parent) ) { | ||
1947 | b.setBit( ell, TRUE ); | ||
1948 | cap = f[ell].capture; | ||
1949 | if ( cap >= 0 ) { | ||
1950 | capBegin[cap] = EmptyCapture; | ||
1951 | capEnd[cap] = EmptyCapture; | ||
1952 | } | ||
1953 | } | ||
1954 | } | ||
1955 | p = f[q].parent; | ||
1956 | |||
1957 | /* | ||
1958 | Otherwise, close the capture zones we are leaving. | ||
1959 | We are leaving f[c].capture, f[f[c].parent].capture, | ||
1960 | f[f[f[c].parent].parent].capture, ..., until | ||
1961 | f[x].capture, with x such that f[x].parent is the | ||
1962 | youngest common ancestor for c and n. | ||
1963 | |||
1964 | We go up along c's and n's ancestry until we find x. | ||
1965 | */ | ||
1966 | } else { | ||
1967 | p = c; | ||
1968 | q = n; | ||
1969 | while ( p != q ) { | ||
1970 | if ( p > q ) { | ||
1971 | cap = f[p].capture; | ||
1972 | if ( cap >= 0 ) { | ||
1973 | if ( capBegin[cap] == i ) { | ||
1974 | capBegin[cap] = EmptyCapture; | ||
1975 | capEnd[cap] = EmptyCapture; | ||
1976 | } else { | ||
1977 | capEnd[cap] = i; | ||
1978 | } | ||
1979 | } | ||
1980 | p = f[p].parent; | ||
1981 | } else { | ||
1982 | q = f[q].parent; | ||
1983 | } | ||
1984 | } | ||
1985 | } | ||
1986 | |||
1987 | /* | ||
1988 | In any case, we now open the capture zones we are | ||
1989 | entering. We work upwards from n until we reach p | ||
1990 | (the parent of the atom we reenter or the youngest | ||
1991 | common ancestor). | ||
1992 | */ | ||
1993 | while ( n > p ) { | ||
1994 | cap = f[n].capture; | ||
1995 | if ( cap >= 0 ) { | ||
1996 | capBegin[cap] = i; | ||
1997 | capEnd[cap] = EmptyCapture; | ||
1998 | } | ||
1999 | n = f[n].parent; | ||
2000 | } | ||
2001 | /* | ||
2002 | If the next state was already in mmNextStack, we must | ||
2003 | choose carefully which capture zones we want to keep. | ||
2004 | */ | ||
2005 | if ( capBegin == mmTempCapBegin && | ||
2006 | isBetterCapture(capBegin, capEnd, | ||
2007 | mmNextCapBegin + m * ncap, | ||
2008 | mmNextCapEnd + m * ncap) ) { | ||
2009 | memcpy( mmNextCapBegin + m * ncap, capBegin, | ||
2010 | ncap * sizeof(int) ); | ||
2011 | memcpy( mmNextCapEnd + m * ncap, capEnd, | ||
2012 | ncap * sizeof(int) ); | ||
2013 | } | ||
2014 | } | ||
2015 | #ifndef QT_NO_REGEXP_BACKREF | ||
2016 | /* | ||
2017 | We are done with updating the capture zones. It's now | ||
2018 | time to put the next state to sleep, if it needs to, and | ||
2019 | to remove it from mmNextStack. | ||
2020 | */ | ||
2021 | if ( needSomeSleep > 0 ) { | ||
2022 | zzZ = new int[1 + 2 * ncap]; | ||
2023 | zzZ[0] = next; | ||
2024 | if ( ncap > 0 ) { | ||
2025 | memcpy( zzZ + 1, capBegin, ncap * sizeof(int) ); | ||
2026 | memcpy( zzZ + 1 + ncap, capEnd, | ||
2027 | ncap * sizeof(int) ); | ||
2028 | } | ||
2029 | mmInNextStack[mmNextStack[--nnext]] = -1; | ||
2030 | mmSleeping.insert( i + needSomeSleep, zzZ ); | ||
2031 | } | ||
2032 | #endif | ||
2033 | #endif | ||
2034 | } | ||
2035 | } | ||
2036 | } | ||
2037 | #ifndef QT_NO_REGEXP_CAPTURE | ||
2038 | /* | ||
2039 | If we reached the final state, hurray! Copy the captured zone. | ||
2040 | */ | ||
2041 | if ( ncap > 0 && (m = mmInNextStack[FinalState]) != -1 ) { | ||
2042 | memcpy( mmCapBegin, mmNextCapBegin + m * ncap, ncap * sizeof(int) ); | ||
2043 | memcpy( mmCapEnd, mmNextCapEnd + m * ncap, ncap * sizeof(int) ); | ||
2044 | } | ||
2045 | #ifndef QT_NO_REGEXP_BACKREF | ||
2046 | /* | ||
2047 | It's time to wake up the sleepers. | ||
2048 | */ | ||
2049 | if ( mmSleeping.count() > 0 ) { | ||
2050 | while ( (zzZ = mmSleeping.take(i)) != 0 ) { | ||
2051 | int next = zzZ[0]; | ||
2052 | int *capBegin = zzZ + 1; | ||
2053 | int *capEnd = zzZ + 1 + ncap; | ||
2054 | bool copyOver = TRUE; | ||
2055 | |||
2056 | if ( (m = mmInNextStack[zzZ[0]]) == -1 ) { | ||
2057 | m = nnext++; | ||
2058 | mmNextStack[m] = next; | ||
2059 | mmInNextStack[next] = m; | ||
2060 | } else { | ||
2061 | copyOver = isBetterCapture( mmNextCapBegin + m * ncap, | ||
2062 | mmNextCapEnd + m * ncap, | ||
2063 | capBegin, capEnd ); | ||
2064 | } | ||
2065 | if ( copyOver ) { | ||
2066 | memcpy( mmNextCapBegin + m * ncap, capBegin, | ||
2067 | ncap * sizeof(int) ); | ||
2068 | memcpy( mmNextCapEnd + m * ncap, capEnd, | ||
2069 | ncap * sizeof(int) ); | ||
2070 | } | ||
2071 | delete[] zzZ; | ||
2072 | } | ||
2073 | } | ||
2074 | #endif | ||
2075 | #endif | ||
2076 | for ( j = 0; j < nnext; j++ ) | ||
2077 | mmInNextStack[mmNextStack[j]] = -1; | ||
2078 | |||
2079 | qSwap( mmCurStack, mmNextStack ); | ||
2080 | #ifndef QT_NO_REGEXP_CAPTURE | ||
2081 | qSwap( mmCurCapBegin, mmNextCapBegin ); | ||
2082 | qSwap( mmCurCapEnd, mmNextCapEnd ); | ||
2083 | #endif | ||
2084 | ncur = nnext; | ||
2085 | nnext = 0; | ||
2086 | i++; | ||
2087 | } | ||
2088 | |||
2089 | #ifndef QT_NO_REGEXP_BACKREF | ||
2090 | /* | ||
2091 | If minimal matching is enabled, we might have some sleepers left. | ||
2092 | */ | ||
2093 | while ( !mmSleeping.isEmpty() ) { | ||
2094 | zzZ = mmSleeping.take( *QIntDictIterator<int>(mmSleeping) ); | ||
2095 | delete[] zzZ; | ||
2096 | } | ||
2097 | #endif | ||
2098 | |||
2099 | match = ( mmMatchedLen >= 0 ); | ||
2100 | if ( !match ) | ||
2101 | mmMatchedLen = i - 1; | ||
2102 | return match; | ||
2103 | } | ||
2104 | |||
2105 | #ifndef QT_NO_REGEXP_CCLASS | ||
2106 | |||
2107 | QRegExpEngine::CharClass::CharClass() | ||
2108 | : c( 0 ), n( FALSE ) | ||
2109 | #ifndef QT_NO_REGEXP_OPTIM | ||
2110 | , occ1( *noOccurrences ) | ||
2111 | #endif | ||
2112 | { | ||
2113 | } | ||
2114 | |||
2115 | QRegExpEngine::CharClass& QRegExpEngine::CharClass::operator=( | ||
2116 | const CharClass& cc ) | ||
2117 | { | ||
2118 | c = cc.c; | ||
2119 | r = cc.r.copy(); | ||
2120 | n = cc.n; | ||
2121 | #ifndef QT_NO_REGEXP_OPTIM | ||
2122 | occ1 = cc.occ1; | ||
2123 | #endif | ||
2124 | return *this; | ||
2125 | } | ||
2126 | |||
2127 | void QRegExpEngine::CharClass::clear() | ||
2128 | { | ||
2129 | c = 0; | ||
2130 | r.resize( 0 ); | ||
2131 | n = FALSE; | ||
2132 | } | ||
2133 | |||
2134 | void QRegExpEngine::CharClass::setNegative( bool negative ) | ||
2135 | { | ||
2136 | n = negative; | ||
2137 | #ifndef QT_NO_REGEXP_OPTIM | ||
2138 | occ1 = *firstOccurrenceAtZero; | ||
2139 | #endif | ||
2140 | } | ||
2141 | |||
2142 | void QRegExpEngine::CharClass::addCategories( int cats ) | ||
2143 | { | ||
2144 | c |= cats; | ||
2145 | #ifndef QT_NO_REGEXP_OPTIM | ||
2146 | occ1 = *firstOccurrenceAtZero; | ||
2147 | #endif | ||
2148 | } | ||
2149 | |||
2150 | void QRegExpEngine::CharClass::addRange( ushort from, ushort to ) | ||
2151 | { | ||
2152 | if ( from > to ) | ||
2153 | qSwap( from, to ); | ||
2154 | int n = r.size(); | ||
2155 | r.resize( n + 1 ); | ||
2156 | r[n].from = from; | ||
2157 | r[n].to = to; | ||
2158 | |||
2159 | #ifndef QT_NO_REGEXP_OPTIM | ||
2160 | int i; | ||
2161 | |||
2162 | if ( to - from < NumBadChars ) { | ||
2163 | occ1.detach(); | ||
2164 | if ( from % NumBadChars <= to % NumBadChars ) { | ||
2165 | for ( i = from % NumBadChars; i <= to % NumBadChars; i++ ) | ||
2166 | occ1[i] = 0; | ||
2167 | } else { | ||
2168 | for ( i = 0; i <= to % NumBadChars; i++ ) | ||
2169 | occ1[i] = 0; | ||
2170 | for ( i = from % NumBadChars; i < NumBadChars; i++ ) | ||
2171 | occ1[i] = 0; | ||
2172 | } | ||
2173 | } else { | ||
2174 | occ1 = *firstOccurrenceAtZero; | ||
2175 | } | ||
2176 | #endif | ||
2177 | } | ||
2178 | |||
2179 | bool QRegExpEngine::CharClass::in( QChar ch ) const | ||
2180 | { | ||
2181 | #ifndef QT_NO_REGEXP_OPTIM | ||
2182 | if ( occ1[BadChar(ch)] == NoOccurrence ) | ||
2183 | return n; | ||
2184 | #endif | ||
2185 | |||
2186 | if ( c != 0 && (c & (1 << (int) ch.category())) != 0 ) | ||
2187 | return !n; | ||
2188 | for ( int i = 0; i < (int) r.size(); i++ ) { | ||
2189 | if ( ch.unicode() >= r[i].from && ch.unicode() <= r[i].to ) | ||
2190 | return !n; | ||
2191 | } | ||
2192 | return n; | ||
2193 | } | ||
2194 | |||
2195 | #if defined(QT_DEBUG) | ||
2196 | void QRegExpEngine::CharClass::dump() const | ||
2197 | { | ||
2198 | int i; | ||
2199 | qDebug( " %stive character class", n ? "nega" : "posi" ); | ||
2200 | #ifndef QT_NO_REGEXP_CCLASS | ||
2201 | if ( c != 0 ) | ||
2202 | qDebug( " categories 0x%.8x", c ); | ||
2203 | #endif | ||
2204 | for ( i = 0; i < (int) r.size(); i++ ) | ||
2205 | qDebug( " 0x%.4x through 0x%.4x", r[i].from, r[i].to ); | ||
2206 | } | ||
2207 | #endif | ||
2208 | #endif | ||
2209 | |||
2210 | QRegExpEngine::Box::Box( QRegExpEngine *engine ) | ||
2211 | : eng( engine ), skipanchors( 0 ) | ||
2212 | #ifndef QT_NO_REGEXP_OPTIM | ||
2213 | , earlyStart( 0 ), lateStart( 0 ), maxl( 0 ), occ1( *noOccurrences ) | ||
2214 | #endif | ||
2215 | { | ||
2216 | minl = 0; | ||
2217 | } | ||
2218 | |||
2219 | QRegExpEngine::Box& QRegExpEngine::Box::operator=( const Box& b ) | ||
2220 | { | ||
2221 | eng = b.eng; | ||
2222 | ls = b.ls; | ||
2223 | rs = b.rs; | ||
2224 | lanchors = b.lanchors; | ||
2225 | ranchors = b.ranchors; | ||
2226 | skipanchors = b.skipanchors; | ||
2227 | #ifndef QT_NO_REGEXP_OPTIM | ||
2228 | earlyStart = b.earlyStart; | ||
2229 | lateStart = b.lateStart; | ||
2230 | str = b.str; | ||
2231 | leftStr = b.leftStr; | ||
2232 | rightStr = b.rightStr; | ||
2233 | maxl = b.maxl; | ||
2234 | occ1 = b.occ1; | ||
2235 | #endif | ||
2236 | minl = b.minl; | ||
2237 | return *this; | ||
2238 | } | ||
2239 | |||
2240 | void QRegExpEngine::Box::set( QChar ch ) | ||
2241 | { | ||
2242 | ls.resize( 1 ); | ||
2243 | ls[0] = eng->createState( ch ); | ||
2244 | rs = ls; | ||
2245 | rs.detach(); | ||
2246 | #ifndef QT_NO_REGEXP_OPTIM | ||
2247 | str = ch; | ||
2248 | leftStr = ch; | ||
2249 | rightStr = ch; | ||
2250 | maxl = 1; | ||
2251 | occ1.detach(); | ||
2252 | occ1[BadChar(ch)] = 0; | ||
2253 | #endif | ||
2254 | minl = 1; | ||
2255 | } | ||
2256 | |||
2257 | void QRegExpEngine::Box::set( const CharClass& cc ) | ||
2258 | { | ||
2259 | ls.resize( 1 ); | ||
2260 | ls[0] = eng->createState( cc ); | ||
2261 | rs = ls; | ||
2262 | rs.detach(); | ||
2263 | #ifndef QT_NO_REGEXP_OPTIM | ||
2264 | maxl = 1; | ||
2265 | occ1 = cc.firstOccurrence(); | ||
2266 | #endif | ||
2267 | minl = 1; | ||
2268 | } | ||
2269 | |||
2270 | #ifndef QT_NO_REGEXP_BACKREF | ||
2271 | void QRegExpEngine::Box::set( int bref ) | ||
2272 | { | ||
2273 | ls.resize( 1 ); | ||
2274 | ls[0] = eng->createState( bref ); | ||
2275 | rs = ls; | ||
2276 | rs.detach(); | ||
2277 | if ( bref >= 1 && bref <= MaxBackRefs ) | ||
2278 | skipanchors = Anchor_BackRef0Empty << bref; | ||
2279 | #ifndef QT_NO_REGEXP_OPTIM | ||
2280 | maxl = InftyLen; | ||
2281 | #endif | ||
2282 | minl = 0; | ||
2283 | } | ||
2284 | #endif | ||
2285 | |||
2286 | void QRegExpEngine::Box::cat( const Box& b ) | ||
2287 | { | ||
2288 | eng->addCatTransitions( rs, b.ls ); | ||
2289 | addAnchorsToEngine( b ); | ||
2290 | if ( minl == 0 ) { | ||
2291 | mergeInto( &lanchors, b.lanchors ); | ||
2292 | if ( skipanchors != 0 ) { | ||
2293 | for ( int i = 0; i < (int) b.ls.size(); i++ ) { | ||
2294 | int a = eng->anchorConcatenation( at(lanchors, b.ls[i]), | ||
2295 | skipanchors ); | ||
2296 | lanchors.insert( b.ls[i], a ); | ||
2297 | } | ||
2298 | } | ||
2299 | mergeInto( &ls, b.ls ); | ||
2300 | } | ||
2301 | if ( b.minl == 0 ) { | ||
2302 | mergeInto( &ranchors, b.ranchors ); | ||
2303 | if ( b.skipanchors != 0 ) { | ||
2304 | for ( int i = 0; i < (int) rs.size(); i++ ) { | ||
2305 | int a = eng->anchorConcatenation( at(ranchors, rs[i]), | ||
2306 | b.skipanchors ); | ||
2307 | ranchors.insert( rs[i], a ); | ||
2308 | } | ||
2309 | } | ||
2310 | mergeInto( &rs, b.rs ); | ||
2311 | } else { | ||
2312 | ranchors = b.ranchors; | ||
2313 | rs = b.rs; | ||
2314 | } | ||
2315 | |||
2316 | #ifndef QT_NO_REGEXP_OPTIM | ||
2317 | if ( maxl != InftyLen ) { | ||
2318 | if ( rightStr.length() + b.leftStr.length() > | ||
2319 | QMAX(str.length(), b.str.length()) ) { | ||
2320 | earlyStart = minl - rightStr.length(); | ||
2321 | lateStart = maxl - rightStr.length(); | ||
2322 | str = rightStr + b.leftStr; | ||
2323 | } else if ( b.str.length() > str.length() ) { | ||
2324 | earlyStart = minl + b.earlyStart; | ||
2325 | lateStart = maxl + b.lateStart; | ||
2326 | str = b.str; | ||
2327 | } | ||
2328 | } | ||
2329 | |||
2330 | if ( (int) leftStr.length() == maxl ) | ||
2331 | leftStr += b.leftStr; | ||
2332 | if ( (int) b.rightStr.length() == b.maxl ) | ||
2333 | rightStr += b.rightStr; | ||
2334 | else | ||
2335 | rightStr = b.rightStr; | ||
2336 | |||
2337 | if ( maxl == InftyLen || b.maxl == InftyLen ) | ||
2338 | maxl = InftyLen; | ||
2339 | else | ||
2340 | maxl += b.maxl; | ||
2341 | |||
2342 | occ1.detach(); | ||
2343 | for ( int i = 0; i < NumBadChars; i++ ) { | ||
2344 | if ( b.occ1[i] != NoOccurrence && minl + b.occ1[i] < occ1[i] ) | ||
2345 | occ1[i] = minl + b.occ1[i]; | ||
2346 | } | ||
2347 | #endif | ||
2348 | |||
2349 | minl += b.minl; | ||
2350 | if ( minl == 0 ) | ||
2351 | skipanchors = eng->anchorConcatenation( skipanchors, b.skipanchors ); | ||
2352 | else | ||
2353 | skipanchors = 0; | ||
2354 | } | ||
2355 | |||
2356 | void QRegExpEngine::Box::orx( const Box& b ) | ||
2357 | { | ||
2358 | mergeInto( &ls, b.ls ); | ||
2359 | mergeInto( &lanchors, b.lanchors ); | ||
2360 | mergeInto( &rs, b.rs ); | ||
2361 | mergeInto( &ranchors, b.ranchors ); | ||
2362 | skipanchors = eng->anchorAlternation( skipanchors, b.skipanchors ); | ||
2363 | |||
2364 | #ifndef QT_NO_REGEXP_OPTIM | ||
2365 | occ1.detach(); | ||
2366 | for ( int i = 0; i < NumBadChars; i++ ) { | ||
2367 | if ( occ1[i] > b.occ1[i] ) | ||
2368 | occ1[i] = b.occ1[i]; | ||
2369 | } | ||
2370 | earlyStart = 0; | ||
2371 | lateStart = 0; | ||
2372 | str = QString::null; | ||
2373 | leftStr = QString::null; | ||
2374 | rightStr = QString::null; | ||
2375 | if ( b.maxl > maxl ) | ||
2376 | maxl = b.maxl; | ||
2377 | #endif | ||
2378 | if ( b.minl < minl ) | ||
2379 | minl = b.minl; | ||
2380 | } | ||
2381 | |||
2382 | void QRegExpEngine::Box::plus( int atom ) | ||
2383 | { | ||
2384 | #ifndef QT_NO_REGEXP_CAPTURE | ||
2385 | eng->addPlusTransitions( rs, ls, atom ); | ||
2386 | #else | ||
2387 | Q_UNUSED( atom ); | ||
2388 | eng->addCatTransitions( rs, ls ); | ||
2389 | #endif | ||
2390 | addAnchorsToEngine( *this ); | ||
2391 | #ifndef QT_NO_REGEXP_OPTIM | ||
2392 | maxl = InftyLen; | ||
2393 | #endif | ||
2394 | } | ||
2395 | |||
2396 | void QRegExpEngine::Box::opt() | ||
2397 | { | ||
2398 | #ifndef QT_NO_REGEXP_OPTIM | ||
2399 | earlyStart = 0; | ||
2400 | lateStart = 0; | ||
2401 | str = QString::null; | ||
2402 | leftStr = QString::null; | ||
2403 | rightStr = QString::null; | ||
2404 | #endif | ||
2405 | skipanchors = 0; | ||
2406 | minl = 0; | ||
2407 | } | ||
2408 | |||
2409 | void QRegExpEngine::Box::catAnchor( int a ) | ||
2410 | { | ||
2411 | if ( a != 0 ) { | ||
2412 | for ( int i = 0; i < (int) rs.size(); i++ ) { | ||
2413 | a = eng->anchorConcatenation( at(ranchors, rs[i]), a ); | ||
2414 | ranchors.insert( rs[i], a ); | ||
2415 | } | ||
2416 | if ( minl == 0 ) | ||
2417 | skipanchors = eng->anchorConcatenation( skipanchors, a ); | ||
2418 | } | ||
2419 | } | ||
2420 | |||
2421 | #ifndef QT_NO_REGEXP_OPTIM | ||
2422 | void QRegExpEngine::Box::setupHeuristics() | ||
2423 | { | ||
2424 | eng->setupGoodStringHeuristic( earlyStart, lateStart, str ); | ||
2425 | |||
2426 | /* | ||
2427 | A regular expression such as 112|1 has occ1['2'] = 2 and minl = 1 at this | ||
2428 | point. An entry of occ1 has to be at most minl or infinity for the rest | ||
2429 | of the algorithm to go well. | ||
2430 | |||
2431 | We waited until here before normalizing these cases (instead of doing it | ||
2432 | in Box::orx()) because sometimes things improve by themselves; consider | ||
2433 | (112|1)34. | ||
2434 | */ | ||
2435 | for ( int i = 0; i < NumBadChars; i++ ) { | ||
2436 | if ( occ1[i] != NoOccurrence && occ1[i] >= minl ) | ||
2437 | occ1[i] = minl; | ||
2438 | } | ||
2439 | eng->setupBadCharHeuristic( minl, occ1 ); | ||
2440 | |||
2441 | eng->heuristicallyChooseHeuristic(); | ||
2442 | } | ||
2443 | #endif | ||
2444 | |||
2445 | #if defined(QT_DEBUG) | ||
2446 | void QRegExpEngine::Box::dump() const | ||
2447 | { | ||
2448 | int i; | ||
2449 | qDebug( "Box of at least %d character%s", minl, minl == 1 ? "" : "s" ); | ||
2450 | qDebug( " Left states:" ); | ||
2451 | for ( i = 0; i < (int) ls.size(); i++ ) { | ||
2452 | if ( at(lanchors, ls[i]) == 0 ) | ||
2453 | qDebug( " %d", ls[i] ); | ||
2454 | else | ||
2455 | qDebug( " %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]] ); | ||
2456 | } | ||
2457 | qDebug( " Right states:" ); | ||
2458 | for ( i = 0; i < (int) rs.size(); i++ ) { | ||
2459 | if ( at(ranchors, ls[i]) == 0 ) | ||
2460 | qDebug( " %d", rs[i] ); | ||
2461 | else | ||
2462 | qDebug( " %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]] ); | ||
2463 | } | ||
2464 | qDebug( " Skip anchors: 0x%.8x", skipanchors ); | ||
2465 | } | ||
2466 | #endif | ||
2467 | |||
2468 | void QRegExpEngine::Box::addAnchorsToEngine( const Box& to ) const | ||
2469 | { | ||
2470 | for ( int i = 0; i < (int) to.ls.size(); i++ ) { | ||
2471 | for ( int j = 0; j < (int) rs.size(); j++ ) { | ||
2472 | int a = eng->anchorConcatenation( at(ranchors, rs[j]), | ||
2473 | at(to.lanchors, to.ls[i]) ); | ||
2474 | eng->addAnchors( rs[j], to.ls[i], a ); | ||
2475 | } | ||
2476 | } | ||
2477 | } | ||
2478 | |||
2479 | int QRegExpEngine::getChar() | ||
2480 | { | ||
2481 | return ( yyPos == yyLen ) ? EOS : yyIn[yyPos++].unicode(); | ||
2482 | } | ||
2483 | |||
2484 | int QRegExpEngine::getEscape() | ||
2485 | { | ||
2486 | #ifndef QT_NO_REGEXP_ESCAPE | ||
2487 | const char tab[] = "afnrtv"; // no b, as \b means word boundary | ||
2488 | const char backTab[] = "\a\f\n\r\t\v"; | ||
2489 | ushort low; | ||
2490 | int i; | ||
2491 | #endif | ||
2492 | ushort val; | ||
2493 | int prevCh = yyCh; | ||
2494 | |||
2495 | if ( prevCh == EOS ) { | ||
2496 | yyError = TRUE; | ||
2497 | return Tok_Char | '\\'; | ||
2498 | } | ||
2499 | yyCh = getChar(); | ||
2500 | #ifndef QT_NO_REGEXP_ESCAPE | ||
2501 | if ( (prevCh & ~0xff) == 0 ) { | ||
2502 | const char *p = strchr( tab, prevCh ); | ||
2503 | if ( p != 0 ) | ||
2504 | return Tok_Char | backTab[p - tab]; | ||
2505 | } | ||
2506 | #endif | ||
2507 | |||
2508 | switch ( prevCh ) { | ||
2509 | #ifndef QT_NO_REGEXP_ESCAPE | ||
2510 | case '0': | ||
2511 | val = 0; | ||
2512 | for ( i = 0; i < 3; i++ ) { | ||
2513 | if ( yyCh >= '0' && yyCh <= '7' ) | ||
2514 | val = ( val << 3 ) | ( yyCh - '0' ); | ||
2515 | else | ||
2516 | break; | ||
2517 | yyCh = getChar(); | ||
2518 | } | ||
2519 | if ( (val & ~0377) != 0 ) | ||
2520 | yyError = TRUE; | ||
2521 | return Tok_Char | val; | ||
2522 | #endif | ||
2523 | #ifndef QT_NO_REGEXP_ESCAPE | ||
2524 | case 'B': | ||
2525 | return Tok_NonWord; | ||
2526 | #endif | ||
2527 | #ifndef QT_NO_REGEXP_CCLASS | ||
2528 | case 'D': | ||
2529 | // see QChar::isDigit() | ||
2530 | yyCharClass->addCategories( 0x7fffffef ); | ||
2531 | return Tok_CharClass; | ||
2532 | case 'S': | ||
2533 | // see QChar::isSpace() | ||
2534 | yyCharClass->addCategories( 0x7ffff87f ); | ||
2535 | yyCharClass->addRange( 0x0000, 0x0008 ); | ||
2536 | yyCharClass->addRange( 0x000e, 0x001f ); | ||
2537 | yyCharClass->addRange( 0x007f, 0x009f ); | ||
2538 | return Tok_CharClass; | ||
2539 | case 'W': | ||
2540 | // see QChar::isLetterOrNumber() | ||
2541 | yyCharClass->addCategories( 0x7ff07f8f ); | ||
2542 | return Tok_CharClass; | ||
2543 | #endif | ||
2544 | #ifndef QT_NO_REGEXP_ESCAPE | ||
2545 | case 'b': | ||
2546 | return Tok_Word; | ||
2547 | #endif | ||
2548 | #ifndef QT_NO_REGEXP_CCLASS | ||
2549 | case 'd': | ||
2550 | // see QChar::isDigit() | ||
2551 | yyCharClass->addCategories( 0x00000010 ); | ||
2552 | return Tok_CharClass; | ||
2553 | case 's': | ||
2554 | // see QChar::isSpace() | ||
2555 | yyCharClass->addCategories( 0x00000380 ); | ||
2556 | yyCharClass->addRange( 0x0009, 0x000d ); | ||
2557 | return Tok_CharClass; | ||
2558 | case 'w': | ||
2559 | // see QChar::isLetterOrNumber() | ||
2560 | yyCharClass->addCategories( 0x000f8070 ); | ||
2561 | return Tok_CharClass; | ||
2562 | #endif | ||
2563 | #ifndef QT_NO_REGEXP_ESCAPE | ||
2564 | case 'x': | ||
2565 | val = 0; | ||
2566 | for ( i = 0; i < 4; i++ ) { | ||
2567 | low = QChar( yyCh ).lower(); | ||
2568 | if ( low >= '0' && low <= '9' ) | ||
2569 | val = ( val << 4 ) | ( low - '0' ); | ||
2570 | else if ( low >= 'a' && low <= 'f' ) | ||
2571 | val = ( val << 4 ) | ( low - 'a' + 10 ); | ||
2572 | else | ||
2573 | break; | ||
2574 | yyCh = getChar(); | ||
2575 | } | ||
2576 | return Tok_Char | val; | ||
2577 | #endif | ||
2578 | default: | ||
2579 | if ( prevCh >= '1' && prevCh <= '9' ) { | ||
2580 | #ifndef QT_NO_REGEXP_BACKREF | ||
2581 | val = prevCh - '0'; | ||
2582 | while ( yyCh >= '0' && yyCh <= '9' ) { | ||
2583 | val = ( val *= 10 ) | ( yyCh - '0' ); | ||
2584 | yyCh = getChar(); | ||
2585 | } | ||
2586 | return Tok_BackRef | val; | ||
2587 | #else | ||
2588 | yyError = TRUE; | ||
2589 | #endif | ||
2590 | } | ||
2591 | return Tok_Char | prevCh; | ||
2592 | } | ||
2593 | } | ||
2594 | |||
2595 | #ifndef QT_NO_REGEXP_INTERVAL | ||
2596 | int QRegExpEngine::getRep( int def ) | ||
2597 | { | ||
2598 | if ( yyCh >= '0' && yyCh <= '9' ) { | ||
2599 | int rep = 0; | ||
2600 | do { | ||
2601 | rep = 10 * rep + yyCh - '0'; | ||
2602 | if ( rep >= InftyRep ) { | ||
2603 | yyError = TRUE; | ||
2604 | rep = def; | ||
2605 | } | ||
2606 | yyCh = getChar(); | ||
2607 | } while ( yyCh >= '0' && yyCh <= '9' ); | ||
2608 | return rep; | ||
2609 | } else { | ||
2610 | return def; | ||
2611 | } | ||
2612 | } | ||
2613 | #endif | ||
2614 | |||
2615 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
2616 | void QRegExpEngine::skipChars( int n ) | ||
2617 | { | ||
2618 | if ( n > 0 ) { | ||
2619 | yyPos += n - 1; | ||
2620 | yyCh = getChar(); | ||
2621 | } | ||
2622 | } | ||
2623 | #endif | ||
2624 | |||
2625 | void QRegExpEngine::startTokenizer( const QChar *rx, int len ) | ||
2626 | { | ||
2627 | yyIn = rx; | ||
2628 | yyPos0 = 0; | ||
2629 | yyPos = 0; | ||
2630 | yyLen = len; | ||
2631 | yyCh = getChar(); | ||
2632 | yyCharClass = new CharClass; | ||
2633 | yyMinRep = 0; | ||
2634 | yyMaxRep = 0; | ||
2635 | yyError = FALSE; | ||
2636 | } | ||
2637 | |||
2638 | int QRegExpEngine::getToken() | ||
2639 | { | ||
2640 | #ifndef QT_NO_REGEXP_CCLASS | ||
2641 | ushort pendingCh = 0; | ||
2642 | bool charPending; | ||
2643 | bool rangePending; | ||
2644 | int tok; | ||
2645 | #endif | ||
2646 | int prevCh = yyCh; | ||
2647 | |||
2648 | yyPos0 = yyPos - 1; | ||
2649 | #ifndef QT_NO_REGEXP_CCLASS | ||
2650 | yyCharClass->clear(); | ||
2651 | #endif | ||
2652 | yyMinRep = 0; | ||
2653 | yyMaxRep = 0; | ||
2654 | yyCh = getChar(); | ||
2655 | switch ( prevCh ) { | ||
2656 | case EOS: | ||
2657 | yyPos0 = yyPos; | ||
2658 | return Tok_Eos; | ||
2659 | case '$': | ||
2660 | return Tok_Dollar; | ||
2661 | case '(': | ||
2662 | if ( yyCh == '?' ) { | ||
2663 | prevCh = getChar(); | ||
2664 | yyCh = getChar(); | ||
2665 | switch ( prevCh ) { | ||
2666 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
2667 | case '!': | ||
2668 | return Tok_NegLookahead; | ||
2669 | case '=': | ||
2670 | return Tok_PosLookahead; | ||
2671 | #endif | ||
2672 | case ':': | ||
2673 | return Tok_MagicLeftParen; | ||
2674 | default: | ||
2675 | yyError = TRUE; | ||
2676 | return Tok_MagicLeftParen; | ||
2677 | } | ||
2678 | } else { | ||
2679 | return Tok_LeftParen; | ||
2680 | } | ||
2681 | case ')': | ||
2682 | return Tok_RightParen; | ||
2683 | case '*': | ||
2684 | yyMinRep = 0; | ||
2685 | yyMaxRep = InftyRep; | ||
2686 | return Tok_Quantifier; | ||
2687 | case '+': | ||
2688 | yyMinRep = 1; | ||
2689 | yyMaxRep = InftyRep; | ||
2690 | return Tok_Quantifier; | ||
2691 | case '.': | ||
2692 | #ifndef QT_NO_REGEXP_CCLASS | ||
2693 | yyCharClass->setNegative( TRUE ); | ||
2694 | #endif | ||
2695 | return Tok_CharClass; | ||
2696 | case '?': | ||
2697 | yyMinRep = 0; | ||
2698 | yyMaxRep = 1; | ||
2699 | return Tok_Quantifier; | ||
2700 | case '[': | ||
2701 | #ifndef QT_NO_REGEXP_CCLASS | ||
2702 | if ( yyCh == '^' ) { | ||
2703 | yyCharClass->setNegative( TRUE ); | ||
2704 | yyCh = getChar(); | ||
2705 | } | ||
2706 | charPending = FALSE; | ||
2707 | rangePending = FALSE; | ||
2708 | do { | ||
2709 | if ( yyCh == '-' && charPending && !rangePending ) { | ||
2710 | rangePending = TRUE; | ||
2711 | yyCh = getChar(); | ||
2712 | } else { | ||
2713 | if ( charPending && !rangePending ) { | ||
2714 | yyCharClass->addSingleton( pendingCh ); | ||
2715 | charPending = FALSE; | ||
2716 | } | ||
2717 | if ( yyCh == '\\' ) { | ||
2718 | yyCh = getChar(); | ||
2719 | tok = getEscape(); | ||
2720 | if ( tok == Tok_Word ) | ||
2721 | tok = '\b'; | ||
2722 | } else { | ||
2723 | tok = Tok_Char | yyCh; | ||
2724 | yyCh = getChar(); | ||
2725 | } | ||
2726 | if ( tok == Tok_CharClass ) { | ||
2727 | if ( rangePending ) { | ||
2728 | yyCharClass->addSingleton( '-' ); | ||
2729 | yyCharClass->addSingleton( pendingCh ); | ||
2730 | charPending = FALSE; | ||
2731 | rangePending = FALSE; | ||
2732 | } | ||
2733 | } else if ( (tok & Tok_Char) != 0 ) { | ||
2734 | if ( rangePending ) { | ||
2735 | yyCharClass->addRange( pendingCh, tok ^ Tok_Char ); | ||
2736 | charPending = FALSE; | ||
2737 | rangePending = FALSE; | ||
2738 | } else { | ||
2739 | pendingCh = tok ^ Tok_Char; | ||
2740 | charPending = TRUE; | ||
2741 | } | ||
2742 | } else { | ||
2743 | yyError = TRUE; | ||
2744 | } | ||
2745 | } | ||
2746 | } while ( yyCh != ']' && yyCh != EOS ); | ||
2747 | if ( rangePending ) | ||
2748 | yyCharClass->addSingleton( '-' ); | ||
2749 | if ( charPending ) | ||
2750 | yyCharClass->addSingleton( pendingCh ); | ||
2751 | if ( yyCh == EOS ) | ||
2752 | yyError = TRUE; | ||
2753 | else | ||
2754 | yyCh = getChar(); | ||
2755 | return Tok_CharClass; | ||
2756 | #else | ||
2757 | yyError = TRUE; | ||
2758 | return Tok_Char | '['; | ||
2759 | #endif | ||
2760 | case '\\': | ||
2761 | return getEscape(); | ||
2762 | case ']': | ||
2763 | yyError = TRUE; | ||
2764 | return Tok_Char | ']'; | ||
2765 | case '^': | ||
2766 | return Tok_Caret; | ||
2767 | #ifndef QT_NO_REGEXP_INTERVAL | ||
2768 | case '{': | ||
2769 | yyMinRep = getRep( 0 ); | ||
2770 | yyMaxRep = yyMinRep; | ||
2771 | if ( yyCh == ',' ) { | ||
2772 | yyCh = getChar(); | ||
2773 | yyMaxRep = getRep( InftyRep ); | ||
2774 | } | ||
2775 | if ( yyMaxRep < yyMinRep ) | ||
2776 | qSwap( yyMinRep, yyMaxRep ); | ||
2777 | if ( yyCh != '}' ) | ||
2778 | yyError = TRUE; | ||
2779 | yyCh = getChar(); | ||
2780 | return Tok_Quantifier; | ||
2781 | #else | ||
2782 | yyError = TRUE; | ||
2783 | return Tok_Char | '{'; | ||
2784 | #endif | ||
2785 | case '|': | ||
2786 | return Tok_Bar; | ||
2787 | case '}': | ||
2788 | yyError = TRUE; | ||
2789 | return Tok_Char | '}'; | ||
2790 | default: | ||
2791 | return Tok_Char | prevCh; | ||
2792 | } | ||
2793 | } | ||
2794 | |||
2795 | int QRegExpEngine::parse( const QChar *pattern, int len ) | ||
2796 | { | ||
2797 | valid = TRUE; | ||
2798 | startTokenizer( pattern, len ); | ||
2799 | yyTok = getToken(); | ||
2800 | #ifndef QT_NO_REGEXP_CAPTURE | ||
2801 | yyMayCapture = TRUE; | ||
2802 | #else | ||
2803 | yyMayCapture = FALSE; | ||
2804 | #endif | ||
2805 | |||
2806 | #ifndef QT_NO_REGEXP_CAPTURE | ||
2807 | int atom = startAtom( FALSE ); | ||
2808 | #endif | ||
2809 | CharClass anything; | ||
2810 | Box box( this ); // create InitialState | ||
2811 | box.set( anything ); | ||
2812 | Box rightBox( this ); // create FinalState | ||
2813 | rightBox.set( anything ); | ||
2814 | |||
2815 | Box middleBox( this ); | ||
2816 | parseExpression( &middleBox ); | ||
2817 | #ifndef QT_NO_REGEXP_CAPTURE | ||
2818 | finishAtom( atom ); | ||
2819 | #endif | ||
2820 | #ifndef QT_NO_REGEXP_OPTIM | ||
2821 | middleBox.setupHeuristics(); | ||
2822 | #endif | ||
2823 | box.cat( middleBox ); | ||
2824 | box.cat( rightBox ); | ||
2825 | delete yyCharClass; | ||
2826 | yyCharClass = 0; | ||
2827 | |||
2828 | realncap = ncap; | ||
2829 | #ifndef QT_NO_REGEXP_BACKREF | ||
2830 | if ( nbrefs > ncap ) | ||
2831 | ncap = nbrefs; | ||
2832 | #endif | ||
2833 | |||
2834 | mmCaptured.resize( 2 + 2 * realncap ); | ||
2835 | mmCapturedNoMatch.fill( -1, 2 + 2 * realncap ); | ||
2836 | |||
2837 | /* | ||
2838 | We use one QArray<int> for all the big data used a lot in matchHere() and | ||
2839 | friends. | ||
2840 | */ | ||
2841 | #ifndef QT_NO_REGEXP_OPTIM | ||
2842 | mmSlideTabSize = QMAX( minl + 1, 16 ); | ||
2843 | #else | ||
2844 | mmSlideTabSize = 0; | ||
2845 | #endif | ||
2846 | mmBigArray.resize( (3 + 4 * ncap) * ns + 4 * ncap + mmSlideTabSize ); | ||
2847 | |||
2848 | mmInNextStack = mmBigArray.data(); | ||
2849 | memset( mmInNextStack, -1, ns * sizeof(int) ); | ||
2850 | mmCurStack = mmInNextStack + ns; | ||
2851 | mmNextStack = mmInNextStack + 2 * ns; | ||
2852 | |||
2853 | mmCurCapBegin = mmInNextStack + 3 * ns; | ||
2854 | mmNextCapBegin = mmCurCapBegin + ncap * ns; | ||
2855 | mmCurCapEnd = mmCurCapBegin + 2 * ncap * ns; | ||
2856 | mmNextCapEnd = mmCurCapBegin + 3 * ncap * ns; | ||
2857 | |||
2858 | mmTempCapBegin = mmCurCapBegin + 4 * ncap * ns; | ||
2859 | mmTempCapEnd = mmTempCapBegin + ncap; | ||
2860 | mmCapBegin = mmTempCapBegin + 2 * ncap; | ||
2861 | mmCapEnd = mmTempCapBegin + 3 * ncap; | ||
2862 | |||
2863 | mmSlideTab = mmTempCapBegin + 4 * ncap; | ||
2864 | |||
2865 | if ( yyError ) | ||
2866 | return -1; | ||
2867 | |||
2868 | #ifndef QT_NO_REGEXP_OPTIM | ||
2869 | State *sinit = s[InitialState]; | ||
2870 | caretAnchored = ( sinit->anchors != 0 ); | ||
2871 | if ( caretAnchored ) { | ||
2872 | QMap<int, int>& anchors = *sinit->anchors; | ||
2873 | QMap<int, int>::ConstIterator a; | ||
2874 | for ( a = anchors.begin(); a != anchors.end(); ++a ) { | ||
2875 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | ||
2876 | if ( (*a & Anchor_Alternation) != 0 ) | ||
2877 | break; | ||
2878 | #endif | ||
2879 | if ( (*a & Anchor_Caret) == 0 ) { | ||
2880 | caretAnchored = FALSE; | ||
2881 | break; | ||
2882 | } | ||
2883 | } | ||
2884 | } | ||
2885 | #endif | ||
2886 | return yyPos0; | ||
2887 | } | ||
2888 | |||
2889 | void QRegExpEngine::parseAtom( Box *box ) | ||
2890 | { | ||
2891 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
2892 | QRegExpEngine *eng = 0; | ||
2893 | bool neg; | ||
2894 | int len; | ||
2895 | #endif | ||
2896 | |||
2897 | switch ( yyTok ) { | ||
2898 | case Tok_Dollar: | ||
2899 | box->catAnchor( Anchor_Dollar ); | ||
2900 | break; | ||
2901 | case Tok_Caret: | ||
2902 | box->catAnchor( Anchor_Caret ); | ||
2903 | break; | ||
2904 | #ifndef QT_NO_REGEXP_LOOKAHEAD | ||
2905 | case Tok_PosLookahead: | ||
2906 | case Tok_NegLookahead: | ||
2907 | neg = ( yyTok == Tok_NegLookahead ); | ||
2908 | eng = new QRegExpEngine( cs ); | ||
2909 | len = eng->parse( yyIn + yyPos - 1, yyLen - yyPos + 1 ); | ||
2910 | if ( len >= 0 ) | ||
2911 | skipChars( len ); | ||
2912 | else | ||
2913 | yyError = TRUE; | ||
2914 | box->catAnchor( addLookahead(eng, neg) ); | ||
2915 | yyTok = getToken(); | ||
2916 | if ( yyTok != Tok_RightParen ) | ||
2917 | yyError = TRUE; | ||
2918 | break; | ||
2919 | #endif | ||
2920 | #ifndef QT_NO_REGEXP_ESCAPE | ||
2921 | case Tok_Word: | ||
2922 | box->catAnchor( Anchor_Word ); | ||
2923 | break; | ||
2924 | case Tok_NonWord: | ||
2925 | box->catAnchor( Anchor_NonWord ); | ||
2926 | break; | ||
2927 | #endif | ||
2928 | case Tok_LeftParen: | ||
2929 | case Tok_MagicLeftParen: | ||
2930 | yyTok = getToken(); | ||
2931 | parseExpression( box ); | ||
2932 | if ( yyTok != Tok_RightParen ) | ||
2933 | yyError = TRUE; | ||
2934 | break; | ||
2935 | case Tok_CharClass: | ||
2936 | box->set( *yyCharClass ); | ||
2937 | break; | ||
2938 | default: | ||
2939 | if ( (yyTok & Tok_Char) != 0 ) | ||
2940 | box->set( QChar(yyTok ^ Tok_Char) ); | ||
2941 | #ifndef QT_NO_REGEXP_BACKREF | ||
2942 | else if ( (yyTok & Tok_BackRef) != 0 ) | ||
2943 | box->set( yyTok ^ Tok_BackRef ); | ||
2944 | #endif | ||
2945 | else | ||
2946 | yyError = TRUE; | ||
2947 | } | ||
2948 | yyTok = getToken(); | ||
2949 | } | ||
2950 | |||
2951 | void QRegExpEngine::parseFactor( Box *box ) | ||
2952 | { | ||
2953 | #ifndef QT_NO_REGEXP_CAPTURE | ||
2954 | int atom = startAtom( yyMayCapture && yyTok == Tok_LeftParen ); | ||
2955 | #else | ||
2956 | static const int atom = 0; | ||
2957 | #endif | ||
2958 | |||
2959 | #ifndef QT_NO_REGEXP_INTERVAL | ||
2960 | #define YYREDO() \ | ||
2961 | yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \ | ||
2962 | *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok | ||
2963 | |||
2964 | const QChar *in = yyIn; | ||
2965 | int pos0 = yyPos0; | ||
2966 | int pos = yyPos; | ||
2967 | int len = yyLen; | ||
2968 | int ch = yyCh; | ||
2969 | CharClass charClass; | ||
2970 | if ( yyTok == Tok_CharClass ) | ||
2971 | charClass = *yyCharClass; | ||
2972 | int tok = yyTok; | ||
2973 | bool mayCapture = yyMayCapture; | ||
2974 | #endif | ||
2975 | |||
2976 | parseAtom( box ); | ||
2977 | #ifndef QT_NO_REGEXP_CAPTURE | ||
2978 | finishAtom( atom ); | ||
2979 | #endif | ||
2980 | |||
2981 | if ( yyTok == Tok_Quantifier ) { | ||
2982 | if ( yyMaxRep == InftyRep ) { | ||
2983 | box->plus( atom ); | ||
2984 | #ifndef QT_NO_REGEXP_INTERVAL | ||
2985 | } else if ( yyMaxRep == 0 ) { | ||
2986 | box->clear(); | ||
2987 | #endif | ||
2988 | } | ||
2989 | if ( yyMinRep == 0 ) | ||
2990 | box->opt(); | ||
2991 | |||
2992 | #ifndef QT_NO_REGEXP_INTERVAL | ||
2993 | yyMayCapture = FALSE; | ||
2994 | int alpha = ( yyMinRep == 0 ) ? 0 : yyMinRep - 1; | ||
2995 | int beta = ( yyMaxRep == InftyRep ) ? 0 : yyMaxRep - ( alpha + 1 ); | ||
2996 | |||
2997 | Box rightBox( this ); | ||
2998 | int i; | ||
2999 | |||
3000 | for ( i = 0; i < beta; i++ ) { | ||
3001 | YYREDO(); | ||
3002 | Box leftBox( this ); | ||
3003 | parseAtom( &leftBox ); | ||
3004 | leftBox.cat( rightBox ); | ||
3005 | leftBox.opt(); | ||
3006 | rightBox = leftBox; | ||
3007 | } | ||
3008 | for ( i = 0; i < alpha; i++ ) { | ||
3009 | YYREDO(); | ||
3010 | Box leftBox( this ); | ||
3011 | parseAtom( &leftBox ); | ||
3012 | leftBox.cat( rightBox ); | ||
3013 | rightBox = leftBox; | ||
3014 | } | ||
3015 | rightBox.cat( *box ); | ||
3016 | *box = rightBox; | ||
3017 | #endif | ||
3018 | yyTok = getToken(); | ||
3019 | #ifndef QT_NO_REGEXP_INTERVAL | ||
3020 | yyMayCapture = mayCapture; | ||
3021 | #endif | ||
3022 | } | ||
3023 | #undef YYREDO | ||
3024 | } | ||
3025 | |||
3026 | void QRegExpEngine::parseTerm( Box *box ) | ||
3027 | { | ||
3028 | #ifndef QT_NO_REGEXP_OPTIM | ||
3029 | if ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) | ||
3030 | parseFactor( box ); | ||
3031 | #endif | ||
3032 | while ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) { | ||
3033 | Box rightBox( this ); | ||
3034 | parseFactor( &rightBox ); | ||
3035 | box->cat( rightBox ); | ||
3036 | } | ||
3037 | } | ||
3038 | |||
3039 | void QRegExpEngine::parseExpression( Box *box ) | ||
3040 | { | ||
3041 | parseTerm( box ); | ||
3042 | while ( yyTok == Tok_Bar ) { | ||
3043 | Box rightBox( this ); | ||
3044 | yyTok = getToken(); | ||
3045 | parseTerm( &rightBox ); | ||
3046 | box->orx( rightBox ); | ||
3047 | } | ||
3048 | } | ||
3049 | |||
3050 | /* | ||
3051 | The class QRegExpPrivate contains the private data of a regular expression | ||
3052 | other than the automaton. It makes it possible for many QRegExp objects to | ||
3053 | use the same QRegExpEngine object with different QRegExpPrivate objects. | ||
3054 | */ | ||
3055 | struct QRegExpPrivate | ||
3056 | { | ||
3057 | QString pattern; // regular-expression or wildcard pattern | ||
3058 | QString rxpattern; // regular-expression pattern | ||
3059 | #ifndef QT_NO_REGEXP_WILDCARD | ||
3060 | bool wc; // wildcard mode? | ||
3061 | #endif | ||
3062 | bool min; // minimal matching? (instead of maximal) | ||
3063 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3064 | QString t; // last string passed to QRegExp::search() or searchRev() | ||
3065 | QStringList capturedCache; // what QRegExp::capturedTexts() returned last | ||
3066 | #endif | ||
3067 | QArray<int> captured; // what QRegExpEngine::search() returned last | ||
3068 | |||
3069 | QRegExpPrivate() { captured.fill( -1, 2 ); } | ||
3070 | }; | ||
3071 | |||
3072 | #ifndef QT_NO_REGEXP_OPTIM | ||
3073 | static QCache<QRegExpEngine> *engineCache = 0; | ||
3074 | #endif | ||
3075 | |||
3076 | static QRegExpEngine *newEngine( const QString& pattern, bool caseSensitive ) | ||
3077 | { | ||
3078 | #ifndef QT_NO_REGEXP_OPTIM | ||
3079 | if ( engineCache != 0 ) { | ||
3080 | QRegExpEngine *eng = engineCache->take( pattern ); | ||
3081 | if ( eng == 0 || eng->caseSensitive() != caseSensitive ) { | ||
3082 | delete eng; | ||
3083 | } else { | ||
3084 | eng->ref(); | ||
3085 | return eng; | ||
3086 | } | ||
3087 | } | ||
3088 | #endif | ||
3089 | return new QRegExpEngine( pattern, caseSensitive ); | ||
3090 | } | ||
3091 | |||
3092 | static void derefEngine( QRegExpEngine *eng, const QString& pattern ) | ||
3093 | { | ||
3094 | if ( eng != 0 && eng->deref() ) { | ||
3095 | #ifndef QT_NO_REGEXP_OPTIM | ||
3096 | if ( engineCache == 0 ) { | ||
3097 | engineCache = new QCache<QRegExpEngine>; | ||
3098 | engineCache->setAutoDelete( TRUE ); | ||
3099 | } | ||
3100 | if ( !pattern.isNull() && | ||
3101 | engineCache->insert(pattern, eng, 4 + pattern.length() / 4) ) | ||
3102 | return; | ||
3103 | #else | ||
3104 | Q_UNUSED( pattern ); | ||
3105 | #endif | ||
3106 | delete eng; | ||
3107 | } | ||
3108 | } | ||
3109 | |||
3110 | /*! | ||
3111 | Constructs an empty regexp. | ||
3112 | |||
3113 | \sa isValid() | ||
3114 | */ | ||
3115 | QRegExp3::QRegExp3() | ||
3116 | { | ||
3117 | eng = new QRegExpEngine( TRUE ); | ||
3118 | priv = new QRegExpPrivate; | ||
3119 | priv->pattern = QString::null; | ||
3120 | #ifndef QT_NO_REGEXP_WILDCARD | ||
3121 | priv->wc = FALSE; | ||
3122 | #endif | ||
3123 | priv->min = FALSE; | ||
3124 | compile( TRUE ); | ||
3125 | } | ||
3126 | |||
3127 | /*! | ||
3128 | Constructs a regular expression object for the given \a pattern | ||
3129 | string. The pattern must be given using wildcard notation if \a | ||
3130 | wildcard is TRUE (default is FALSE). The pattern is case sensitive, | ||
3131 | unless \a caseSensitive is FALSE. Matching is greedy (maximal), but | ||
3132 | can be changed by calling setMinimal(). | ||
3133 | |||
3134 | \sa setPattern() setCaseSensitive() setWildcard() setMinimal() | ||
3135 | */ | ||
3136 | QRegExp3::QRegExp3( const QString& pattern, bool caseSensitive, bool wildcard ) | ||
3137 | { | ||
3138 | eng = 0; | ||
3139 | priv = new QRegExpPrivate; | ||
3140 | priv->pattern = pattern; | ||
3141 | #ifndef QT_NO_REGEXP_WILDCARD | ||
3142 | priv->wc = wildcard; | ||
3143 | #endif | ||
3144 | priv->min = FALSE; | ||
3145 | compile( caseSensitive ); | ||
3146 | } | ||
3147 | |||
3148 | /*! | ||
3149 | Constructs a regular expression as a copy of \a rx. | ||
3150 | |||
3151 | \sa operator=() | ||
3152 | */ | ||
3153 | QRegExp3::QRegExp3( const QRegExp3& rx ) | ||
3154 | { | ||
3155 | eng = 0; | ||
3156 | priv = new QRegExpPrivate; | ||
3157 | operator=( rx ); | ||
3158 | } | ||
3159 | |||
3160 | /*! | ||
3161 | Destroys the regular expression and cleans up its internal data. | ||
3162 | */ | ||
3163 | QRegExp3::~QRegExp3() | ||
3164 | { | ||
3165 | derefEngine( eng, priv->rxpattern ); | ||
3166 | delete priv; | ||
3167 | } | ||
3168 | |||
3169 | /*! | ||
3170 | Copies the regular expression \a rx and returns a reference to the copy. | ||
3171 | The case sensitivity, wildcard and minimal matching options are copied as | ||
3172 | well. | ||
3173 | */ | ||
3174 | QRegExp3& QRegExp3::operator=( const QRegExp3& rx ) | ||
3175 | { | ||
3176 | rx.eng->ref(); | ||
3177 | derefEngine( eng, priv->rxpattern ); | ||
3178 | eng = rx.eng; | ||
3179 | priv->pattern = rx.priv->pattern; | ||
3180 | priv->rxpattern = rx.priv->rxpattern; | ||
3181 | #ifndef QT_NO_REGEXP_WILDCARD | ||
3182 | priv->wc = rx.priv->wc; | ||
3183 | #endif | ||
3184 | priv->min = rx.priv->min; | ||
3185 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3186 | priv->t = rx.priv->t; | ||
3187 | priv->capturedCache = rx.priv->capturedCache; | ||
3188 | #endif | ||
3189 | priv->captured = rx.priv->captured; | ||
3190 | return *this; | ||
3191 | } | ||
3192 | |||
3193 | /*! | ||
3194 | Returns TRUE if this regular expression is equal to \a rx, otherwise | ||
3195 | returns FALSE. | ||
3196 | |||
3197 | Two QRegExp3 objects are equal if they have the same pattern strings | ||
3198 | and the same settings for case sensitivity, wildcard and minimal | ||
3199 | matching. | ||
3200 | */ | ||
3201 | bool QRegExp3::operator==( const QRegExp3& rx ) const | ||
3202 | { | ||
3203 | return priv->pattern == rx.priv->pattern && | ||
3204 | eng->caseSensitive() == rx.eng->caseSensitive() && | ||
3205 | #ifndef QT_NO_REGEXP_WILDCARD | ||
3206 | priv->wc == rx.priv->wc && | ||
3207 | #endif | ||
3208 | priv->min == rx.priv->min; | ||
3209 | } | ||
3210 | |||
3211 | /*! \fn bool QRegExp3::operator!=( const QRegExp& rx ) const | ||
3212 | |||
3213 | Returns TRUE if this regular expression is not equal to \a rx, otherwise | ||
3214 | FALSE. | ||
3215 | |||
3216 | \sa operator==() | ||
3217 | */ | ||
3218 | |||
3219 | /*! | ||
3220 | Returns TRUE if the pattern string is empty, otherwise FALSE. | ||
3221 | |||
3222 | If you call match() with an empty pattern on an empty string it will | ||
3223 | return TRUE otherwise it returns FALSE since match() operates over the | ||
3224 | whole string. If you call search() with an empty pattern on \e any | ||
3225 | string it will return the start position (0 by default) since it will | ||
3226 | match at the start position, because the empty pattern matches the | ||
3227 | 'emptiness' at the start of the string, and the length of the match | ||
3228 | returned by matchedLength() will be 0. | ||
3229 | |||
3230 | See QString::isEmpty(). | ||
3231 | */ | ||
3232 | |||
3233 | bool QRegExp3::isEmpty() const | ||
3234 | { | ||
3235 | return priv->pattern.isEmpty(); | ||
3236 | } | ||
3237 | |||
3238 | /*! | ||
3239 | Returns TRUE if the regular expression is valid, or FALSE if it's invalid. An | ||
3240 | invalid regular expression never matches. | ||
3241 | |||
3242 | The pattern <b>[a-z</b> is an example of an invalid pattern, since it lacks | ||
3243 | a closing square bracket. | ||
3244 | |||
3245 | Note that the validity of a regexp may also depend on the setting of | ||
3246 | the wildcard flag, for example <b>*.html</b> is a valid wildcard | ||
3247 | regexp but an invalid full regexp. | ||
3248 | */ | ||
3249 | bool QRegExp3::isValid() const | ||
3250 | { | ||
3251 | return eng->isValid(); | ||
3252 | } | ||
3253 | |||
3254 | /*! | ||
3255 | Returns the pattern string of the regular expression. The pattern has either | ||
3256 | regular expression syntax or wildcard syntax, depending on wildcard(). | ||
3257 | |||
3258 | \sa setPattern() | ||
3259 | */ | ||
3260 | QString QRegExp3::pattern() const | ||
3261 | { | ||
3262 | return priv->pattern; | ||
3263 | } | ||
3264 | |||
3265 | /*! | ||
3266 | Sets the pattern string to \a pattern and returns a reference to this regular | ||
3267 | expression. The case sensitivity, wildcard and minimal matching options are | ||
3268 | not changed. | ||
3269 | |||
3270 | \sa pattern() | ||
3271 | */ | ||
3272 | void QRegExp3::setPattern( const QString& pattern ) | ||
3273 | { | ||
3274 | if ( priv->pattern != pattern ) { | ||
3275 | priv->pattern = pattern; | ||
3276 | compile( caseSensitive() ); | ||
3277 | } | ||
3278 | } | ||
3279 | |||
3280 | /*! | ||
3281 | Returns TRUE if case sensitivity is enabled, otherwise FALSE. The default is | ||
3282 | TRUE. | ||
3283 | |||
3284 | \sa setCaseSensitive() | ||
3285 | */ | ||
3286 | bool QRegExp3::caseSensitive() const | ||
3287 | { | ||
3288 | return eng->caseSensitive(); | ||
3289 | } | ||
3290 | |||
3291 | /*! | ||
3292 | Sets case sensitive matching to \a sensitive. | ||
3293 | |||
3294 | If \a sensitive is TRUE, <b>\\</b><b>.txt$</b> matches | ||
3295 | <tt>readme.txt</tt> but not <tt>README.TXT</tt>. | ||
3296 | |||
3297 | \sa caseSensitive() | ||
3298 | */ | ||
3299 | void QRegExp3::setCaseSensitive( bool sensitive ) | ||
3300 | { | ||
3301 | if ( sensitive != eng->caseSensitive() ) | ||
3302 | compile( sensitive ); | ||
3303 | } | ||
3304 | |||
3305 | #ifndef QT_NO_REGEXP_WILDCARD | ||
3306 | /*! | ||
3307 | Returns TRUE if wildcard mode is enabled, otherwise FALSE. The default is | ||
3308 | FALSE. | ||
3309 | |||
3310 | \sa setWildcard() | ||
3311 | */ | ||
3312 | bool QRegExp3::wildcard() const | ||
3313 | { | ||
3314 | return priv->wc; | ||
3315 | } | ||
3316 | |||
3317 | /*! Sets the wildcard mode for the regular expression. The default is FALSE. | ||
3318 | |||
3319 | Setting \a wildcard to TRUE enables simple shell-like wildcard | ||
3320 | matching. | ||
3321 | (See <a href="#wildcard-matching">wildcard matching (globbing)</a>.) | ||
3322 | |||
3323 | For example, <b>r*.txt</b> matches the string <tt>readme.txt</tt> in wildcard | ||
3324 | mode, but does not match <tt>readme</tt>. | ||
3325 | |||
3326 | \sa wildcard() | ||
3327 | */ | ||
3328 | void QRegExp3::setWildcard( bool wildcard ) | ||
3329 | { | ||
3330 | if ( wildcard != priv->wc ) { | ||
3331 | priv->wc = wildcard; | ||
3332 | compile( caseSensitive() ); | ||
3333 | } | ||
3334 | } | ||
3335 | #endif | ||
3336 | |||
3337 | /*! Returns TRUE if minimal (non-greedy) matching is enabled, otherwise | ||
3338 | returns FALSE. | ||
3339 | |||
3340 | \sa setMinimal() | ||
3341 | */ | ||
3342 | bool QRegExp3::minimal() const | ||
3343 | { | ||
3344 | return priv->min; | ||
3345 | } | ||
3346 | |||
3347 | /*! | ||
3348 | Enables or disables minimal matching. If \a minimal is FALSE, matching is | ||
3349 | greedy (maximal) which is the default. | ||
3350 | |||
3351 | For example, suppose we have the input string "We must be \<b>bold\</b>, | ||
3352 | very \<b>bold\</b>!" and the pattern <b>\<b>.*\</b></b>. With | ||
3353 | the default greedy (maximal) matching, the match is | ||
3354 | "We must be <u>\<b>bold\</b>, very \<b>bold\</b></u>!". But with | ||
3355 | minimal (non-greedy) matching the first match is: | ||
3356 | "We must be <u>\<b>bold\</b></u>, very \<b>bold\</b>!" and the | ||
3357 | second match is | ||
3358 | "We must be \<b>bold\</b>, very <u>\<b>bold\</b></u>!". | ||
3359 | In practice we might use the pattern <b>\<b>[^\<]+\</b></b>, | ||
3360 | although this will still fail for nested tags. | ||
3361 | |||
3362 | \sa minimal() | ||
3363 | */ | ||
3364 | void QRegExp3::setMinimal( bool minimal ) | ||
3365 | { | ||
3366 | priv->min = minimal; | ||
3367 | } | ||
3368 | |||
3369 | /*! | ||
3370 | Returns TRUE if \a str is matched exactly by this regular expression | ||
3371 | otherwise it returns FALSE. You can determine how much of the string was | ||
3372 | matched by calling matchedLength(). | ||
3373 | |||
3374 | For a given regexp string, R, <tt>match("R")</tt> is the equivalent | ||
3375 | of <tt>search("^R$")</tt> since match() effectively encloses the | ||
3376 | regexp in the start of string and end of string anchors. | ||
3377 | |||
3378 | For example, if the regular expression is <b>blue</b>, then match() | ||
3379 | returns TRUE only for input <tt>blue</tt>. For inputs | ||
3380 | <tt>bluebell</tt>, <tt>blutak</tt> and <tt>lightblue</tt>, match() | ||
3381 | returns FALSE and matchedLength() will return 4, 3 and 0 respectively. | ||
3382 | |||
3383 | \sa search() searchRev() QRegExpValidator | ||
3384 | */ | ||
3385 | bool QRegExp3::exactMatch( const QString& str ) | ||
3386 | { | ||
3387 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3388 | priv->t = str; | ||
3389 | priv->capturedCache.clear(); | ||
3390 | #endif | ||
3391 | |||
3392 | priv->captured = eng->match( str, 0, priv->min, TRUE ); | ||
3393 | if ( priv->captured[1] == (int) str.length() ) { | ||
3394 | return TRUE; | ||
3395 | } else { | ||
3396 | priv->captured.detach(); | ||
3397 | priv->captured[0] = 0; | ||
3398 | priv->captured[1] = eng->matchedLength(); | ||
3399 | return FALSE; | ||
3400 | } | ||
3401 | } | ||
3402 | |||
3403 | /*! \overload | ||
3404 | |||
3405 | This version does not set matchedLength(), capturedTexts() and friends. | ||
3406 | */ | ||
3407 | bool QRegExp3::exactMatch( const QString& str ) const | ||
3408 | { | ||
3409 | return eng->match(str, 0, priv->min, TRUE)[0] == 0 && | ||
3410 | eng->matchedLength() == (int) str.length(); | ||
3411 | } | ||
3412 | |||
3413 | /*! \obsolete | ||
3414 | |||
3415 | Attempts to match in \a str, starting from position \a index. Returns the | ||
3416 | position of the match, or -1 if there was no match. | ||
3417 | |||
3418 | The length of the match is stored in \a *len, unless \a len is a null pointer. | ||
3419 | |||
3420 | If \a indexIsStart is TRUE (the default), the position \a index in the string | ||
3421 | will match the start of string anchor, <b>^</b>, in the regexp, if present. | ||
3422 | Otherwise, position 0 in \a str will match. | ||
3423 | |||
3424 | Use search() and matchedLength() instead of this function. | ||
3425 | |||
3426 | If you really need the \a indexIsStart functionality, try this: | ||
3427 | |||
3428 | \code | ||
3429 | QRegExp3 rx( "some pattern" ); | ||
3430 | int pos = rx.search( str.mid( index ) ); | ||
3431 | if ( pos != -1 ) | ||
3432 | pos += index; | ||
3433 | int len = rx.matchedLength(); | ||
3434 | \endcode | ||
3435 | */ | ||
3436 | #ifndef QT_NO_COMPAT | ||
3437 | int QRegExp3::match( const QString& str, int index, int *len, | ||
3438 | bool indexIsStart ) | ||
3439 | { | ||
3440 | int pos; | ||
3441 | if ( indexIsStart ) { | ||
3442 | pos = search( str.mid(index) ); | ||
3443 | if ( pos >= 0 ) { | ||
3444 | pos += index; | ||
3445 | if ( len != 0 ) | ||
3446 | *len = matchedLength(); | ||
3447 | } else { | ||
3448 | if ( len != 0 ) | ||
3449 | *len = 0; | ||
3450 | } | ||
3451 | } else { | ||
3452 | pos = search( str, index ); | ||
3453 | if ( len != 0 ) | ||
3454 | *len = matchedLength(); | ||
3455 | } | ||
3456 | return pos; | ||
3457 | } | ||
3458 | #endif | ||
3459 | |||
3460 | /*! | ||
3461 | Attempts to find a match in \a str from position \a start (0 by default). If | ||
3462 | \a start is -1, the search starts at the last character; if -2, at the next to | ||
3463 | last character; etc. | ||
3464 | |||
3465 | Returns the position of the first match, or -1 if there was no match. | ||
3466 | |||
3467 | You might prefer to use QString::find(), QString::contains() or even | ||
3468 | QStringList::grep(). To replace matches use QString::replace(). | ||
3469 | |||
3470 | Example: | ||
3471 | \code | ||
3472 | QString str = "offsets: 1.23 .50 71.00 6.00"; | ||
3473 | QRegExp3 rx( "\\d*\\.\\d+" ); // very simple floating point matching | ||
3474 | int count = 0; | ||
3475 | int pos = 0; | ||
3476 | while ( pos >= 0 ) { | ||
3477 | pos = rx.search( str, pos ); | ||
3478 | count++; | ||
3479 | } | ||
3480 | // pos will be 9, 14, 18 and finally 24; count will end up as 4. | ||
3481 | \endcode | ||
3482 | |||
3483 | \sa searchRev() match() matchedLength() capturedTexts() | ||
3484 | */ | ||
3485 | // QChar versions | ||
3486 | |||
3487 | #ifdef QCHAR_SUPPORT | ||
3488 | const QString makeString(const QChar *str) | ||
3489 | { | ||
3490 | // A sentinel value checked in case the QChar *ptr is never null terminated | ||
3491 | const uint MAXLENGTH=65535; | ||
3492 | |||
3493 | const QChar *s=str; | ||
3494 | uint i=0; | ||
3495 | while(i < MAXLENGTH && *s != QChar::null) { i++;s++ ;} | ||
3496 | return QString(str,i); | ||
3497 | |||
3498 | } | ||
3499 | int QRegExp3::search(const QChar *str,int start) | ||
3500 | { | ||
3501 | return search(makeString(str),start); | ||
3502 | } | ||
3503 | int QRegExp3::search(const QChar *str,int start) const | ||
3504 | { | ||
3505 | return search(makeString(str),start); | ||
3506 | } | ||
3507 | int QRegExp3::searchRev(const QChar *str,int start) | ||
3508 | { | ||
3509 | return searchRev(makeString(str),start); | ||
3510 | } | ||
3511 | int QRegExp3::searchRev(const QChar *str,int start) const | ||
3512 | { | ||
3513 | return searchRev(makeString(str),start); | ||
3514 | } | ||
3515 | bool QRegExp3::exactMatch(const QChar *str) | ||
3516 | { | ||
3517 | return exactMatch(makeString(str)); | ||
3518 | } | ||
3519 | bool QRegExp3::exactMatch(const QChar *str) const | ||
3520 | { | ||
3521 | return exactMatch(makeString(str)); | ||
3522 | } | ||
3523 | #endif // QCHAR_SUPPORT | ||
3524 | |||
3525 | int QRegExp3::search( const QString& str, int start ) | ||
3526 | { | ||
3527 | if ( start < 0 ) | ||
3528 | start += str.length(); | ||
3529 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3530 | priv->t = str; | ||
3531 | priv->capturedCache.clear(); | ||
3532 | #endif | ||
3533 | priv->captured = eng->match( str, start, priv->min, FALSE ); | ||
3534 | return priv->captured[0]; | ||
3535 | } | ||
3536 | |||
3537 | /*! \overload | ||
3538 | |||
3539 | This version does not set matchedLength(), capturedTexts() and friends. | ||
3540 | */ | ||
3541 | int QRegExp3::search( const QString& str, int start ) const | ||
3542 | { | ||
3543 | if ( start < 0 ) | ||
3544 | start += str.length(); | ||
3545 | return eng->match( str, start, priv->min, FALSE )[0]; | ||
3546 | } | ||
3547 | |||
3548 | /*! | ||
3549 | Attempts to find a match backwards in \a str from position \a start. If | ||
3550 | \a start is -1 (the default), the search starts at the last character; if -2, | ||
3551 | at the next to last character; etc. | ||
3552 | |||
3553 | Returns the position of the first match, or -1 if there was no match. | ||
3554 | |||
3555 | You might prefer to use QString::findRev(). | ||
3556 | |||
3557 | \sa search() matchedLength() capturedTexts() | ||
3558 | */ | ||
3559 | int QRegExp3::searchRev( const QString& str, int start ) | ||
3560 | { | ||
3561 | if ( start < 0 ) | ||
3562 | start += str.length(); | ||
3563 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3564 | priv->t = str; | ||
3565 | priv->capturedCache.clear(); | ||
3566 | #endif | ||
3567 | if ( start < 0 || start > (int) str.length() ) { | ||
3568 | priv->captured.detach(); | ||
3569 | priv->captured.fill( -1 ); | ||
3570 | return -1; | ||
3571 | } | ||
3572 | |||
3573 | while ( start >= 0 ) { | ||
3574 | priv->captured = eng->match( str, start, priv->min, TRUE ); | ||
3575 | if ( priv->captured[0] == start ) | ||
3576 | return start; | ||
3577 | start--; | ||
3578 | } | ||
3579 | return -1; | ||
3580 | } | ||
3581 | |||
3582 | /*! \overload | ||
3583 | |||
3584 | This version does not set matchedLength(), capturedText() and friends. | ||
3585 | */ | ||
3586 | int QRegExp3::searchRev( const QString& str, int start ) const | ||
3587 | { | ||
3588 | if ( start < 0 ) | ||
3589 | start += str.length(); | ||
3590 | if ( start < 0 || start > (int) str.length() ) | ||
3591 | return -1; | ||
3592 | |||
3593 | while ( start >= 0 ) { | ||
3594 | if ( eng->match(str, start, priv->min, TRUE)[0] == start ) | ||
3595 | return start; | ||
3596 | start--; | ||
3597 | } | ||
3598 | return -1; | ||
3599 | } | ||
3600 | |||
3601 | /*! | ||
3602 | Returns the length of the last matched string, or -1 if there was no match. | ||
3603 | |||
3604 | \sa match() search() | ||
3605 | */ | ||
3606 | int QRegExp3::matchedLength() | ||
3607 | { | ||
3608 | return priv->captured[1]; | ||
3609 | } | ||
3610 | |||
3611 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3612 | /*! | ||
3613 | Returns a list of the captured text strings. | ||
3614 | |||
3615 | The first string in the list is the entire matched string. Each | ||
3616 | subsequent list element contains a string that matched a | ||
3617 | (capturing) subexpression of the regexp. | ||
3618 | |||
3619 | For example: | ||
3620 | \code | ||
3621 | QRegExp3 rx( "(\\d+)(\\s*)(cm|inch(es)?)" ); | ||
3622 | int pos = rx.search( "Length: 36 inches" ); | ||
3623 | QStringList list = rx.capturedTexts(); | ||
3624 | // list is now ( "36 inches", "36", " ", "inches", "es" ). | ||
3625 | \endcode | ||
3626 | |||
3627 | The above example also captures elements | ||
3628 | that may be present but which we have no interest in. This problem | ||
3629 | can be solved by using non-capturing parenthesis: | ||
3630 | |||
3631 | \code | ||
3632 | QRegExp3 rx( "(\\d+)(?:\\s*)(cm|inch(?:es)?)" ); | ||
3633 | int pos = rx.search( "Length: 36 inches" ); | ||
3634 | QStringList list = rx.capturedTexts(); | ||
3635 | // list is now ( "36 inches", "36", "inches" ). | ||
3636 | \endcode | ||
3637 | |||
3638 | Some regexps can match an indeterminate number of times. For example | ||
3639 | if the input string is "Offsets: 12 14 99 231 7" and the regexp, | ||
3640 | <tt>rx</tt>, is <b>(</b><b>\\</b><b>d+)+</b>, we would hope to get a | ||
3641 | list of all the numbers matched. However, after calling | ||
3642 | <tt>rx.search(str)</tt>, capturedTexts() will return the list ( "12", | ||
3643 | "12" ), i.e. the entire match was "12" and the first subexpression | ||
3644 | matched was "12". The correct approach is to use cap() in a | ||
3645 | <a href="#cap_in_a_loop">loop</a>. | ||
3646 | |||
3647 | The order of elements in the string list is as follows. The first | ||
3648 | element is the entire matching string. Each subsequent element | ||
3649 | corresponds to the next capturing open left parenthesis. Thus | ||
3650 | capturedTexts()[1] is the text of the first capturing parenthesis, | ||
3651 | capturedTexts()[2] is the text of the second and so on (corresponding | ||
3652 | to $1, $2 etc. in some other regexp languages). | ||
3653 | |||
3654 | \sa cap() pos() | ||
3655 | */ | ||
3656 | QStringList QRegExp3::capturedTexts() | ||
3657 | { | ||
3658 | if ( priv->capturedCache.isEmpty() ) { | ||
3659 | for ( int i = 0; i < (int) priv->captured.size(); i += 2 ) { | ||
3660 | QString m; | ||
3661 | if ( priv->captured[i + 1] == 0 ) | ||
3662 | m = QString::fromLatin1( "" ); | ||
3663 | else if ( priv->captured[i] >= 0 ) | ||
3664 | m = priv->t.mid( priv->captured[i], | ||
3665 | priv->captured[i + 1] ); | ||
3666 | priv->capturedCache.append( m ); | ||
3667 | } | ||
3668 | priv->t = QString::null; | ||
3669 | } | ||
3670 | return priv->capturedCache; | ||
3671 | } | ||
3672 | |||
3673 | /*! Returns the text captured by the \a nth subexpression. The entire match | ||
3674 | has index 0 and the parenthesised subexpressions have indices starting | ||
3675 | from 1 (excluding non-capturing parenthesis). | ||
3676 | |||
3677 | \code | ||
3678 | QRegExp3 rxlen( "(\\d+)(?:\\s*)(cm|inch)" ); | ||
3679 | int pos = rxlen.search( "Length: 189cm" ); | ||
3680 | if ( pos > -1 ) { | ||
3681 | QString value = rxlen.cap( 1 );// "189" | ||
3682 | QString unit = rxlen.cap( 2 ); // "cm" | ||
3683 | // ... | ||
3684 | } | ||
3685 | \endcode | ||
3686 | |||
3687 | <a name="cap_in_a_loop"> | ||
3688 | Some patterns may lead to a number of matches which cannot be | ||
3689 | determined in advance, for example:</a> | ||
3690 | |||
3691 | \code | ||
3692 | QRegExp3 rx( "(\\d+)" ); | ||
3693 | str = "Offsets: 12 14 99 231 7"; | ||
3694 | QStringList list; | ||
3695 | pos = 0; | ||
3696 | while ( pos >= 0 ) { | ||
3697 | pos = rx.search( str, pos ); | ||
3698 | if ( pos > -1 ) { | ||
3699 | list += rx.cap( 1 ); | ||
3700 | pos += rx.matchedLength(); | ||
3701 | } | ||
3702 | } | ||
3703 | // list contains: ( "12", "14", "99", "231", "7" ). | ||
3704 | \endcode | ||
3705 | |||
3706 | The order of elements matched by cap() is as follows. The first | ||
3707 | element, cap( 0 ), is the entire matching string. Each subsequent | ||
3708 | element corresponds to the next capturing open left parenthesis. Thus | ||
3709 | cap( 1 ) is the text of the first capturing parenthesis, cap( 2 ) is | ||
3710 | the text of the second and so on. | ||
3711 | |||
3712 | \sa search() pos() capturedTexts() | ||
3713 | */ | ||
3714 | QString QRegExp3::cap( int nth ) | ||
3715 | { | ||
3716 | if ( nth < 0 || nth >= (int) priv->captured.size() / 2 ) | ||
3717 | return QString::null; | ||
3718 | else | ||
3719 | return capturedTexts()[nth]; | ||
3720 | } | ||
3721 | |||
3722 | /*! Returns the position of the \a nth captured text in the | ||
3723 | searched string. If \a nth is 0 (the default), pos() returns the | ||
3724 | position of the whole match. | ||
3725 | |||
3726 | Example: | ||
3727 | \code | ||
3728 | QRegExp3 rx( "/([a-z]+)/([a-z]+)" ); | ||
3729 | rx.search( "Output /dev/null" );// Returns 7 (position of /dev/null) | ||
3730 | rx.pos( 0 ); // Returns 7 (position of /dev/null) | ||
3731 | rx.pos( 1 ); // Returns 8 (position of dev) | ||
3732 | rx.pos( 2 ); // Returns 12 (position of null) | ||
3733 | \endcode | ||
3734 | |||
3735 | Note that pos() returns -1 for zero-length matches. (For example, if | ||
3736 | cap(4) would return an empty string, pos(4) returns -1.) This is due | ||
3737 | to an implementation tradeoff. | ||
3738 | |||
3739 | \sa capturedTexts() cap() | ||
3740 | */ | ||
3741 | int QRegExp3::pos( int nth ) | ||
3742 | { | ||
3743 | if ( nth < 0 || nth >= (int) priv->captured.size() / 2 ) | ||
3744 | return -1; | ||
3745 | else | ||
3746 | return priv->captured[2 * nth]; | ||
3747 | } | ||
3748 | #endif | ||
3749 | |||
3750 | void QRegExp3::compile( bool caseSensitive ) | ||
3751 | { | ||
3752 | derefEngine( eng, priv->rxpattern ); | ||
3753 | #ifndef QT_NO_REGEXP_WILDCARD | ||
3754 | if ( priv->wc ) | ||
3755 | priv->rxpattern = wc2rx( priv->pattern ); | ||
3756 | else | ||
3757 | #endif | ||
3758 | priv->rxpattern = priv->pattern.isNull() ? QString::fromLatin1( "" ) | ||
3759 | : priv->pattern; | ||
3760 | eng = newEngine( priv->rxpattern, caseSensitive ); | ||
3761 | #ifndef QT_NO_REGEXP_CAPTURE | ||
3762 | priv->t = QString::null; | ||
3763 | priv->capturedCache.clear(); | ||
3764 | #endif | ||
3765 | priv->captured.detach(); | ||
3766 | priv->captured.fill( -1, 2 + 2 * eng->numCaptures() ); | ||
3767 | } | ||