author | Michael Krelin <hacker@klever.net> | 2004-07-05 01:53:09 (UTC) |
---|---|---|
committer | Michael Krelin <hacker@klever.net> | 2004-07-05 01:53:09 (UTC) |
commit | 9938ea772d372ba955b7bc1f1c48a062c3a36f8a (patch) (unidiff) | |
tree | a98185df944e3a763f9557aa74bc2837b343f048 /shared-code/RegEx.cpp | |
download | pumpkin-9938ea772d372ba955b7bc1f1c48a062c3a36f8a.zip pumpkin-9938ea772d372ba955b7bc1f1c48a062c3a36f8a.tar.gz pumpkin-9938ea772d372ba955b7bc1f1c48a062c3a36f8a.tar.bz2 |
initial commit into svn repository
git-svn-id: http://svn.klever.net/kin/pumpkin/trunk@1 fe716a7a-6dde-0310-88d9-d003556173a8
-rw-r--r-- | shared-code/RegEx.cpp | 1697 |
1 files changed, 1697 insertions, 0 deletions
diff --git a/shared-code/RegEx.cpp b/shared-code/RegEx.cpp new file mode 100644 index 0000000..b7bab62 --- a/dev/null +++ b/shared-code/RegEx.cpp | |||
@@ -0,0 +1,1697 @@ | |||
1 | #include "../stdafx.h" | ||
2 | #include "RegEx.h" | ||
3 | |||
4 | #defineisWordableChar(c) (isalnum(c) || (c)=='_') | ||
5 | |||
6 | BOOL CRegEx::Compile(LPCTSTR regex,int flags) | ||
7 | { | ||
8 | ASSERT(!((flags®Extended) && (flags®Literal))); | ||
9 | m_Matches.RemoveAll(); | ||
10 | m_Strip.RemoveAll(); | ||
11 | m_Strip.SetSize(0,15); | ||
12 | m_Pattern=regex; | ||
13 | m_ParsePointer=0; | ||
14 | m_Error=0; | ||
15 | m_Sets.RemoveAll(); | ||
16 | m_Flags=flags; | ||
17 | m_iFlags=0; | ||
18 | m_BOLs=m_EOLs=0; | ||
19 | m_Subexps = 0; | ||
20 | m_Categories=1;// 0 is 'everything else' | ||
21 | m_bBackRefs=FALSE; | ||
22 | memset(m_Category,0,sizeof(m_Category)); | ||
23 | |||
24 | // Go ahead. | ||
25 | m_Error || m_Strip.Add(CSop(CSop::opEnd)); | ||
26 | if(flags®Extended){ | ||
27 | ParseERE(); | ||
28 | }else if(flags®Literal){ | ||
29 | ParseLiteral(); | ||
30 | }else{ | ||
31 | ParseBRE(); | ||
32 | } | ||
33 | m_Error || m_Strip.Add(CSop(CSop::opEnd)); | ||
34 | Categorize(); | ||
35 | m_Strip.FreeExtra(); | ||
36 | FigureMust(); | ||
37 | m_Pluses=CountPluses(); | ||
38 | if(m_iFlags&iflagsBad){ | ||
39 | m_Error || (m_Error=regeAssert); | ||
40 | // ??? point to nuls? ;-) | ||
41 | } | ||
42 | // We may wish to free some memory here if we're erroneous (ie. m_Error..) | ||
43 | m_ParseParens.RemoveAll(); | ||
44 | #ifdef _DEBUG | ||
45 | if(m_Error){ | ||
46 | CString tmp; | ||
47 | tmp.Format("RE: ParseError: %d\n",m_Error); | ||
48 | TRACE0(tmp); | ||
49 | } | ||
50 | //DumpStrip(afxDump); | ||
51 | #endif | ||
52 | return (m_bCompiled=(!m_Error)); | ||
53 | } | ||
54 | |||
55 | BOOL CRegEx::Match(LPCTSTR src,int flags) | ||
56 | { | ||
57 | if(!m_bCompiled) | ||
58 | return FALSE; | ||
59 | if(m_iFlags&iflagsBad) | ||
60 | return FALSE; | ||
61 | m_Input=src; | ||
62 | m_mFlags=flags; | ||
63 | m_mPointer=m_Input; | ||
64 | m_mBegin=m_Input; | ||
65 | m_mEnd=&m_mBegin[m_Input.GetLength()]; | ||
66 | ASSERT(m_mPointer<=m_mEnd); | ||
67 | m_Matches.RemoveAll(); | ||
68 | if(!m_Must.IsEmpty()){ | ||
69 | if(m_Input.Find(m_Must)<0) | ||
70 | return FALSE; | ||
71 | } | ||
72 | // Go ahead.. | ||
73 | int stripLen = m_Strip.GetSize(); | ||
74 | m_mLastPos.SetSize(0); | ||
75 | for(int tmp=0;tmp<stripLen;tmp++) | ||
76 | m_Strip[tmp].m_MatchData=0; | ||
77 | LPCTSTR beginp = m_mBegin; | ||
78 | LPCTSTR endp; | ||
79 | for(;;){ | ||
80 | endp = MatchFast(beginp); | ||
81 | if(!endp) | ||
82 | return FALSE; | ||
83 | if((m_mFlags®NoSubExpressions) && !m_bBackRefs) | ||
84 | break; | ||
85 | ASSERT(m_cOldP); | ||
86 | for(;;){ | ||
87 | endp = MatchSlow(m_cOldP,m_mEnd,1,stripLen-1); | ||
88 | if(endp) | ||
89 | break; | ||
90 | ASSERT(m_cOldP<m_mEnd); | ||
91 | m_cOldP++; | ||
92 | } | ||
93 | if((m_mFlags®OneMatch) && !m_bBackRefs) | ||
94 | break; | ||
95 | // Oh, his, we want the subexpression.. | ||
96 | m_Matches.SetSize(m_Subexps+1); | ||
97 | LPCTSTR dp; | ||
98 | if(!m_bBackRefs && !(m_mFlags®BackRefs)){ | ||
99 | dp = MatchDissect(m_cOldP,endp,1,stripLen-1); | ||
100 | }else{ | ||
101 | if(m_Pluses>0 && !m_mLastPos.GetSize()) | ||
102 | m_mLastPos.SetSize(m_Pluses); | ||
103 | dp = MatchBackRef(m_cOldP,endp,1,stripLen-1,0); | ||
104 | } | ||
105 | if(dp) | ||
106 | break; | ||
107 | // Uh.. oh.. we couldn't find a subexpression-level match | ||
108 | ASSERT(m_bBackRefs); | ||
109 | ASSERT(m_Pluses==0 || m_mLastPos.GetSize()); | ||
110 | for(;;){ | ||
111 | if(dp || endp <= m_cOldP) | ||
112 | break;// defeat.. ? | ||
113 | endp = MatchSlow(m_cOldP,endp-1,1,stripLen-1); | ||
114 | if(!endp) | ||
115 | break;// defeat.. ? | ||
116 | // Try it on a shorter possibility.. | ||
117 | #ifdef _DEBUG | ||
118 | for(tmp=1;tmp<=m_Subexps;tmp++) | ||
119 | ASSERT(m_Matches[tmp].m_Begin<0 && m_Matches[tmp].m_End<0); | ||
120 | #endif | ||
121 | dp = MatchBackRef(m_cOldP,endp,1,stripLen-1,0); | ||
122 | } | ||
123 | ASSERT((!dp) || dp==endp); | ||
124 | if(dp)// Found a shorter one.. | ||
125 | break; | ||
126 | // Despite initial appearances, there is no match here | ||
127 | beginp = m_cOldP+1; | ||
128 | ASSERT(beginp<=m_mEnd); | ||
129 | } | ||
130 | // Fill in the detail if so requested.. | ||
131 | if(!(m_mFlags®NoSubExpressions)){ | ||
132 | if(!m_Matches.GetSize()) | ||
133 | m_Matches.SetSize(1); | ||
134 | m_Matches[0].m_Begin=m_cOldP-m_mBegin; | ||
135 | m_Matches[0].m_End=endp-m_mBegin; | ||
136 | } | ||
137 | m_mLastPos.RemoveAll(); | ||
138 | return TRUE; | ||
139 | } | ||
140 | |||
141 | CString CRegEx::Replace(LPCTSTR src,LPCTSTR rep,int flags) | ||
142 | { | ||
143 | // *** | ||
144 | return CString(); | ||
145 | } | ||
146 | |||
147 | void CRegEx::ParseERE(int stop) | ||
148 | { | ||
149 | UCHAR c; | ||
150 | BOOL first=TRUE; | ||
151 | int prevF, prevB; | ||
152 | for(;;){ | ||
153 | int co = m_Strip.GetSize(); | ||
154 | while(m_ParsePointer < m_Pattern.GetLength() && ((c=m_Pattern[m_ParsePointer])!='|') && c!=stop) | ||
155 | ParseEREexp(); | ||
156 | if(m_Strip.GetSize()==co){ | ||
157 | m_Error || (m_Error=regeEmpty); | ||
158 | // ??? point to nuls? | ||
159 | } | ||
160 | if(m_ParsePointer>=m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!='|') | ||
161 | break; | ||
162 | else | ||
163 | m_ParsePointer++; | ||
164 | if(first){ | ||
165 | StripInsert(co,CSop(CSop::opChoice0,m_Strip.GetSize()-co+1)); | ||
166 | prevF = prevB = co; | ||
167 | first=FALSE; | ||
168 | } | ||
169 | m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-prevB)); | ||
170 | prevB = m_Strip.GetSize()-1; | ||
171 | m_Error || (m_Strip[prevF].m_Operand=m_Strip.GetSize()-prevF); | ||
172 | prevF = m_Strip.GetSize(); | ||
173 | m_Error || m_Strip.Add(CSop(CSop::opOr1,0));// offset isn't really right.. very so.. | ||
174 | } | ||
175 | if(!first){ | ||
176 | m_Error || (m_Strip[prevF].m_Operand=m_Strip.GetSize()-prevF); | ||
177 | m_Error || m_Strip.Add(CSop(CSop::opChoice1,m_Strip.GetSize()-prevB)); | ||
178 | } | ||
179 | ASSERT(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer]==stop); | ||
180 | } | ||
181 | |||
182 | void CRegEx::ParseEREexp() | ||
183 | { | ||
184 | ASSERT(m_ParsePointer < m_Pattern.GetLength()); | ||
185 | UCHAR c = m_Pattern[m_ParsePointer++]; | ||
186 | int pos = m_Strip.GetSize(); | ||
187 | int subno; | ||
188 | int count, count2; | ||
189 | BOOL wascaret=FALSE; | ||
190 | switch(c){ | ||
191 | case '(': | ||
192 | if(!(m_ParsePointer<m_Pattern.GetLength())){ | ||
193 | TRACE0("RE: '(' at the end of the pattern\n"); | ||
194 | if(!m_Error) | ||
195 | m_Error = regeParen; | ||
196 | // ??? point to nuls? | ||
197 | } | ||
198 | m_Subexps++; | ||
199 | subno=m_Subexps; | ||
200 | m_ParseParens.SetAtGrow(m_Subexps,CParenthesis(m_Strip.GetSize())); | ||
201 | m_Error || m_Strip.Add(CSop(CSop::opLeftParen,subno)); | ||
202 | if(m_ParsePointer>=m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!=')') | ||
203 | ParseERE(')'); | ||
204 | VERIFY(m_ParseParens[m_Subexps].m_End = m_Strip.GetSize()); | ||
205 | m_Error || m_Strip.Add(CSop(CSop::opRightParen,subno)); | ||
206 | if(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer++]!=')'){ | ||
207 | TRACE0("RE: No matching ')'\n"); | ||
208 | if(!m_Error) | ||
209 | m_Error = regeParen; | ||
210 | // ??? point to nuls? | ||
211 | } | ||
212 | break; | ||
213 | case '^': | ||
214 | m_Error || m_Strip.Add(CSop(CSop::opBOL)); | ||
215 | m_iFlags|=iflagsUseBOL; | ||
216 | m_BOLs++; | ||
217 | wascaret=TRUE; | ||
218 | break; | ||
219 | case '$': | ||
220 | m_Error || m_Strip.Add(CSop(CSop::opEOL)); | ||
221 | m_iFlags|=iflagsUseEOL; | ||
222 | m_EOLs++; | ||
223 | break; | ||
224 | case '|': | ||
225 | TRACE0("RE: '|' outside of expression\n"); | ||
226 | if(!m_Error) | ||
227 | m_Error = regeEmpty; | ||
228 | // ??? point to nuls? | ||
229 | break; | ||
230 | case '*': | ||
231 | case '+': | ||
232 | case '?': | ||
233 | TRACE0("RE: '*'/'+'/'?' with no previous expression\n"); | ||
234 | if(!m_Error) | ||
235 | m_Error = regeBadRepeat; | ||
236 | // ??? point to nuls? | ||
237 | break; | ||
238 | case '.': | ||
239 | if(m_Flags®NewLine) | ||
240 | EmitNonNewLineAny(); | ||
241 | else | ||
242 | m_Error || m_Strip.Add(CSop(CSop::opAny)); | ||
243 | break; | ||
244 | case '[': | ||
245 | ParseBracket(); | ||
246 | break; | ||
247 | case '\\': | ||
248 | if(m_ParsePointer >= m_Pattern.GetLength()){ | ||
249 | TRACE0("RE: '\\' at the end of the pattern\n"); | ||
250 | if(!m_Error) | ||
251 | m_Error = regeEscape; | ||
252 | // ??? point to nuls? | ||
253 | }else{ | ||
254 | c = m_Pattern[m_ParsePointer++]; | ||
255 | EmitOrdinary(c); | ||
256 | } | ||
257 | break; | ||
258 | case '{': | ||
259 | if(m_ParsePointer >= m_Pattern.GetLength() || !isdigit(m_Pattern[m_ParsePointer])){ | ||
260 | TRACE0("RE: '{' with no repeat count\n"); | ||
261 | if(!m_Error) | ||
262 | m_Error = regeBadRepeat; | ||
263 | // ??? point to nuls? | ||
264 | } | ||
265 | // Fallthrough.. | ||
266 | default: | ||
267 | EmitOrdinary(c); | ||
268 | break; | ||
269 | } | ||
270 | if(m_ParsePointer >= m_Pattern.GetLength()) | ||
271 | return; | ||
272 | c = m_Pattern[m_ParsePointer]; | ||
273 | // Call a '{' repetition if followed by a digit | ||
274 | if (!(c=='*' || c=='+' || c=='?' || ( c=='{' && (m_ParsePointer+1) < m_Pattern.GetLength() && isdigit(m_Pattern[m_ParsePointer+1])) )) | ||
275 | return; // No repetitor - done. | ||
276 | m_ParsePointer++; | ||
277 | if(wascaret){ | ||
278 | TRACE0("RE: repetitive '^' detected\n"); | ||
279 | if(!m_Error) | ||
280 | m_Error = regeBadRepeat; | ||
281 | // ??? point to nuls? | ||
282 | } | ||
283 | switch(c){ | ||
284 | case '*':// Implemeted as +? | ||
285 | // + expression | ||
286 | StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1)); | ||
287 | m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos)); | ||
288 | // ? expression | ||
289 | StripInsert(pos,CSop(CSop::opQuest0,m_Strip.GetSize()-pos+1)); | ||
290 | m_Error || m_Strip.Add(CSop(CSop::opQuest1,m_Strip.GetSize()-pos)); | ||
291 | break; | ||
292 | case '+': | ||
293 | // + expression | ||
294 | StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1)); | ||
295 | m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos)); | ||
296 | break; | ||
297 | case '?': | ||
298 | // Kludge - emit y? as (y|) until subtle bug gets fixed :-) | ||
299 | StripInsert(pos,CSop(CSop::opChoice0,m_Strip.GetSize()-pos+1)); | ||
300 | m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-pos)); | ||
301 | m_Error || (m_Strip[pos].m_Operand=m_Strip.GetSize()-pos); | ||
302 | m_Error || m_Strip.Add(CSop(CSop::opOr1,1)); | ||
303 | m_Error || m_Strip.Add(CSop(CSop::opChoice1,2)); | ||
304 | break; | ||
305 | case '{': | ||
306 | count = ParseCount(); | ||
307 | if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]==','){ | ||
308 | m_ParsePointer++; | ||
309 | if(isdigit(m_Pattern[m_ParsePointer])){ // HHH Personally, I doubt it is always available | ||
310 | count2=ParseCount(); | ||
311 | if(!(count<=count2)){ | ||
312 | TRACE0("RE: Disbalanced counts in '{}' repeater\n"); | ||
313 | m_Error || (m_Error=regeBadBrace); | ||
314 | // ??? point to nuls? | ||
315 | } | ||
316 | }else // Single number with comma | ||
317 | count2=256; // Infinity | ||
318 | }else // Single number | ||
319 | count2=count; | ||
320 | EmitRepeat(pos,count,count2); | ||
321 | if(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!='}'){ | ||
322 | // No '}'.. | ||
323 | TRACE0("RE: No immediately following '}' of '{' expression\n"); | ||
324 | while(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]!='}') | ||
325 | m_ParsePointer++; | ||
326 | if(m_ParsePointer >= m_Pattern.GetLength()){ | ||
327 | TRACE0("RE: No closing '}' found\n"); | ||
328 | m_Error || (m_Error=regeBrace); | ||
329 | }else | ||
330 | m_Error || (m_Error=regeBadBrace); | ||
331 | // ??? point to nuls? | ||
332 | }else | ||
333 | m_ParsePointer++; | ||
334 | break; | ||
335 | } | ||
336 | if(m_ParsePointer >= m_Pattern.GetLength()) | ||
337 | return; | ||
338 | c = m_Pattern[m_ParsePointer]; | ||
339 | if(!(c=='*' || c=='+' || c=='?' || (c=='{' && (m_ParsePointer+1)<m_Pattern.GetLength() && isdigit(m_Pattern[m_ParsePointer+1])))) | ||
340 | return; | ||
341 | TRACE0("RE: Double repeater\n"); | ||
342 | m_Error || (m_Error=regeBadRepeat); | ||
343 | // ??? point to nuls? | ||
344 | } | ||
345 | |||
346 | void CRegEx::StripInsert(int pos,CSop& sop) | ||
347 | { | ||
348 | if(m_Error) | ||
349 | return; | ||
350 | int sn = m_Strip.GetSize(); | ||
351 | m_Strip.InsertAt(pos,sop); | ||
352 | for(int tmp=1;tmp<m_ParseParens.GetSize();tmp++){ | ||
353 | if(m_ParseParens[tmp].m_Begin>=pos) | ||
354 | m_ParseParens[tmp].m_Begin++; | ||
355 | if(m_ParseParens[tmp].m_End>=pos) | ||
356 | m_ParseParens[tmp].m_End++; | ||
357 | } | ||
358 | } | ||
359 | |||
360 | void CRegEx::EmitOrdinary(UCHAR c) | ||
361 | { | ||
362 | if(m_Flags®IgnoreCase && isalpha(c) && (tolower(c) !=toupper(c))){ | ||
363 | // Emit both cases | ||
364 | CString savePattern = m_Pattern; | ||
365 | int savePointer = m_ParsePointer; | ||
366 | m_Pattern=c; | ||
367 | m_Pattern+=']'; | ||
368 | m_ParsePointer=0; | ||
369 | ParseBracket(); | ||
370 | m_Pattern=savePattern; | ||
371 | m_ParsePointer=savePointer; | ||
372 | }else{ | ||
373 | m_Error || m_Strip.Add(CSop(CSop::opChar,c)); | ||
374 | if(!m_Category[(BYTE)c]) | ||
375 | m_Category[(BYTE)c]=m_Categories++; | ||
376 | } | ||
377 | } | ||
378 | |||
379 | void CRegEx::EmitNonNewLineAny() | ||
380 | { | ||
381 | // Kludges're going on and on.. | ||
382 | CString savePattern = m_Pattern; | ||
383 | int savePointer = m_ParsePointer; | ||
384 | m_Pattern="^\n]"; | ||
385 | m_ParsePointer=0; | ||
386 | ParseBracket(); | ||
387 | m_Pattern=savePattern; | ||
388 | m_ParsePointer=savePointer; | ||
389 | } | ||
390 | |||
391 | int CRegEx::ParseCount() | ||
392 | { | ||
393 | BOOL nonEmpty=FALSE; | ||
394 | int rv = 0; | ||
395 | UCHAR c; | ||
396 | while(m_ParsePointer < m_Pattern.GetLength() && isdigit(c=m_Pattern[m_ParsePointer]) && rv <=255){ | ||
397 | rv = rv*10 + c-'0'; | ||
398 | nonEmpty=TRUE; | ||
399 | m_ParsePointer++; | ||
400 | } | ||
401 | if(rv>255 || !nonEmpty){ | ||
402 | m_Error || (m_Error=regeBadBrace); | ||
403 | // ??? point to nuls? | ||
404 | } | ||
405 | return rv; | ||
406 | } | ||
407 | |||
408 | void CRegEx::ParseBracket() | ||
409 | { | ||
410 | // Dept. of truly sickening special case kludges | ||
411 | if((m_ParsePointer+5) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,6).Compare("[:<]]")){ | ||
412 | m_Error || m_Strip.Add(CSop(CSop::opBOW)); | ||
413 | m_ParsePointer+=6; | ||
414 | return; | ||
415 | } | ||
416 | if((m_ParsePointer+5) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,6).Compare("[:>]]")){ | ||
417 | m_Error || m_Strip.Add(CSop(CSop::opEOW)); | ||
418 | m_ParsePointer+=6; | ||
419 | return; | ||
420 | } | ||
421 | BOOL invert=TRUE; | ||
422 | if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='^') | ||
423 | m_ParsePointer++; | ||
424 | else | ||
425 | invert=FALSE; | ||
426 | CSet cset; | ||
427 | if(m_ParsePointer < m_Pattern.GetLength()){ | ||
428 | switch(m_Pattern[m_ParsePointer]){ | ||
429 | case ']': | ||
430 | case '-': | ||
431 | cset.Add(m_Pattern[m_ParsePointer]); | ||
432 | m_ParsePointer++; | ||
433 | break; | ||
434 | } | ||
435 | } | ||
436 | while(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]!=']' && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("-]"))) | ||
437 | ParseBracketTerm(cset); | ||
438 | if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='-'){ | ||
439 | m_ParsePointer++; | ||
440 | cset.Add('-'); | ||
441 | } | ||
442 | if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]==']') | ||
443 | m_ParsePointer++; | ||
444 | else{ | ||
445 | m_Error || (m_Error=regeBracket); | ||
446 | // ??? point to nuls? | ||
447 | return; | ||
448 | } | ||
449 | if(m_Flags®IgnoreCase){ | ||
450 | for(int tmp=CSet::size-1;tmp>=0;tmp--){ | ||
451 | if(cset.IsIn(tmp) && isalpha(tmp) && (toupper(tmp)!=tolower(tmp))) | ||
452 | cset.Add(isupper(tmp)?tolower(tmp):toupper(tmp)); | ||
453 | } | ||
454 | /* | ||
455 | if(!cset->m_Multis.IsEmpty()) | ||
456 | cset.CollatingCase(); | ||
457 | */ | ||
458 | } | ||
459 | if(invert){ | ||
460 | for(int tmp=CSet::size-1;tmp>=0;tmp--) | ||
461 | if(cset.IsIn(tmp)) | ||
462 | cset.Sub(tmp); | ||
463 | else | ||
464 | cset.Add(tmp); | ||
465 | if(m_Flags®NewLine) | ||
466 | cset.Sub('\n'); | ||
467 | /* | ||
468 | if(!cset.m_Multis.IsEmpty()) | ||
469 | cset.CollatingInvert(); | ||
470 | */ | ||
471 | } | ||
472 | UCHAR c = cset.GetOnly(); | ||
473 | if(c){ | ||
474 | EmitOrdinary(c); | ||
475 | }else | ||
476 | m_Error || m_Strip.Add(CSop(CSop::opAnyOf,StoreSet(cset))); | ||
477 | } | ||
478 | |||
479 | void CRegEx::CSet::Add(UCHAR c) | ||
480 | { | ||
481 | m_Set[(BYTE)c]=TRUE; | ||
482 | m_Hash+=c; | ||
483 | } | ||
484 | |||
485 | BOOL CRegEx::CSet::IsIn(UCHAR c) | ||
486 | { | ||
487 | return m_Set[(BYTE)c]; | ||
488 | } | ||
489 | |||
490 | void CRegEx::CSet::Sub(UCHAR c) | ||
491 | { | ||
492 | m_Set[(BYTE)c]=FALSE; | ||
493 | m_Hash-=c; | ||
494 | } | ||
495 | |||
496 | UCHAR CRegEx::CSet::GetOnly() | ||
497 | { | ||
498 | int rv = 0; | ||
499 | UCHAR only = 0; | ||
500 | for(int tmp=0;tmp<size;tmp++){ | ||
501 | rv+=m_Set[tmp]?(only=tmp,1):0; | ||
502 | } | ||
503 | return (rv==1)?only:0; | ||
504 | } | ||
505 | |||
506 | int CRegEx::StoreSet(CSet& cset) | ||
507 | { | ||
508 | for(int tmp=0;tmp<m_Sets.GetSize();tmp++) | ||
509 | if(m_Sets[tmp]==cset) | ||
510 | return tmp; | ||
511 | return m_Sets.Add(cset); | ||
512 | } | ||
513 | |||
514 | void CRegEx::ParseBracketTerm(CSet& cset) | ||
515 | { | ||
516 | UCHAR c; | ||
517 | switch((m_ParsePointer<m_Pattern.GetLength())?m_Pattern[m_ParsePointer]:0){ | ||
518 | case '[': | ||
519 | c = ((m_ParsePointer+1)<m_Pattern.GetLength())?m_Pattern[m_ParsePointer+1]:0; | ||
520 | break; | ||
521 | case '-': | ||
522 | m_Error || (m_Error=regeRange); | ||
523 | // ??? point to nuls? | ||
524 | return; | ||
525 | default: | ||
526 | c = 0; | ||
527 | break; | ||
528 | } | ||
529 | switch(c){ | ||
530 | case ':':// Character class | ||
531 | m_ParsePointer+=2; | ||
532 | if(m_ParsePointer>=m_Pattern.GetLength()){ | ||
533 | m_Error || (m_Error=regeBracket); | ||
534 | // ??? point to nuls? | ||
535 | } | ||
536 | c = m_Pattern[m_ParsePointer]; | ||
537 | if(c== '-' || c==']'){ | ||
538 | m_Error || (m_Error=regeCType); | ||
539 | // ??? point to nuls? | ||
540 | } | ||
541 | ParseBracketCClass(cset); | ||
542 | if(m_ParsePointer>=m_Pattern.GetLength()){ | ||
543 | m_Error || (m_Error=regeBracket); | ||
544 | // ??? point to nuls? | ||
545 | } | ||
546 | if((m_ParsePointer+1)>=m_Pattern.GetLength() || (m_Pattern.Mid(m_ParsePointer,2).Compare(":]"))){ | ||
547 | m_Error || (m_Error=regeCType); | ||
548 | // ??? point to nuls? | ||
549 | }else | ||
550 | m_ParsePointer+=2; | ||
551 | break; | ||
552 | case '=':// Equivalence class | ||
553 | m_ParsePointer+=2; | ||
554 | if(m_ParsePointer >= m_Pattern.GetLength()){ | ||
555 | m_Error || (m_Error=regeBracket); | ||
556 | // ??? point to nuls? | ||
557 | } | ||
558 | c = m_Pattern[m_ParsePointer]; | ||
559 | if(c== '-' || c==']'){ | ||
560 | m_Error || (m_Error=regeCollate); | ||
561 | // ??? point to nuls? | ||
562 | } | ||
563 | ParseBracketEClass(cset); | ||
564 | if((m_ParsePointer+1)>=m_Pattern.GetLength() || (m_Pattern.Mid(m_ParsePointer,2).Compare("=]"))){ | ||
565 | m_Error || (m_Error=regeCollate); | ||
566 | // ??? point to nuls? | ||
567 | }else | ||
568 | m_ParsePointer+=2; | ||
569 | break; | ||
570 | default:// Symbol, character or range | ||
571 | { | ||
572 | UCHAR start, finish; | ||
573 | start = ParseBracketSymbol(); | ||
574 | if((m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='-') /*&& (m_ParsePointer+1)<m_Pattern.GetLength() && m_Pattern[m_ParsePointer+1]==']'*/){ | ||
575 | // I believe the expression above is seetwo.. | ||
576 | // range. | ||
577 | m_ParsePointer++; | ||
578 | if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='-'){ | ||
579 | m_ParsePointer++; | ||
580 | finish='-'; | ||
581 | }else | ||
582 | finish=ParseBracketSymbol(); | ||
583 | }else | ||
584 | finish=start; | ||
585 | if(((BYTE)start)>((BYTE)finish)){ | ||
586 | m_Error || (m_Error=regeRange); | ||
587 | // ??? point to nuls? | ||
588 | } | ||
589 | for(int tmp=start;tmp<=(BYTE)finish;tmp++) | ||
590 | cset.Add(tmp); | ||
591 | } | ||
592 | break; | ||
593 | } | ||
594 | } | ||
595 | |||
596 | void CRegEx::ParseBracketCClass(CSet& cset) | ||
597 | { | ||
598 | static struct{ | ||
599 | char *className; | ||
600 | char *classChars; | ||
601 | }cc[] = { | ||
602 | {"alnum","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"}, | ||
603 | {"alpha","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"}, | ||
604 | {"blank"," \t"}, | ||
605 | {"cntrl","\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\25\26\27\30\31\32\33\34\35\36\37\177"}, | ||
606 | {"digit","0123456789"}, | ||
607 | {"graph","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"}, | ||
608 | {"lower","abcdefghijklmnopqrstuvwxyz"}, | ||
609 | {"print","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ "}, | ||
610 | {"punct","!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"}, | ||
611 | {"space","\t\n\v\f\r "}, | ||
612 | {"upper","ABCDEFGHIJKLMNOPQRSTUVWXYZ"}, | ||
613 | {"xdigit","0123456789ABCDEFabcdef"} | ||
614 | }; | ||
615 | CString cclass; | ||
616 | UCHAR c; | ||
617 | while(m_ParsePointer < m_Pattern.GetLength() && isalpha(c=m_Pattern[m_ParsePointer])){ | ||
618 | cclass+=c; | ||
619 | m_ParsePointer++; | ||
620 | } | ||
621 | char *classChars = NULL; | ||
622 | for(int tmp=0;tmp<(sizeof(cc)/sizeof(cc[0]));tmp++){ | ||
623 | if(!cclass.CompareNoCase(cc[tmp].className)){ | ||
624 | classChars=cc[tmp].classChars; | ||
625 | break; | ||
626 | } | ||
627 | } | ||
628 | if(!classChars){ | ||
629 | m_Error || (m_Error=regeCType); | ||
630 | // ??? point to nuls? | ||
631 | return; | ||
632 | } | ||
633 | while(*classChars) | ||
634 | cset.Add(*(classChars++)); | ||
635 | // --- multis | ||
636 | } | ||
637 | |||
638 | void CRegEx::ParseBracketEClass(CSet& cset) | ||
639 | { | ||
640 | cset.Add(ParseBracketCollatingElement('='));; | ||
641 | } | ||
642 | |||
643 | UCHAR CRegEx::ParseBracketCollatingElement(UCHAR term) | ||
644 | { | ||
645 | static struct{ | ||
646 | char *entityName; | ||
647 | char entity; | ||
648 | }cc[] = { {"NUL",'\0'},{"SOH",'\001'},{"STX",'\002'},{"ETX",'\003'},{"EOT",'\004'},{"ENQ",'\005'},{"ACK",'\006'},{"BEL",'\007'},{"alert",'\007'},{"BS",'\010'},{"backspace",'\b'},{"HT",'\011'},{"tab",'\t'},{"LF",'\012'},{"newline",'\n'},{"VT",'\013'},{"vertical-tab",'\v'},{"FF",'\014'},{"form-feed",'\f'},{"CR",'\015'},{"carriage-return",'\r'},{"SO",'\016'},{"SI",'\017'},{"DLE",'\020'},{"DC1",'\021'},{"DC2",'\022'},{"DC3",'\023'},{"DC4",'\024'},{"NAK",'\025'},{"SYN",'\026'},{"ETB",'\027'},{"CAN",'\030'},{"EM",'\031'},{"SUB",'\032'},{"ESC",'\033'},{"IS4",'\034'},{"FS",'\034'},{"IS3",'\035'},{"GS",'\035'},{"IS2",'\036'},{"RS",'\036'},{"IS1",'\037'},{"US",'\037'},{"space",' '},{"exclamation-mark",'!'},{"quotation-mark",'"'},{"number-sign",'#'},{"dollar-sign",'$'},{"percent-sign",'%'},{"ampersand",'&'},{"apostrophe",'\''},{"left-parenthesis",'('},{"right-parenthesis",')'},{"asterisk",'*'},{"plus-sign",'+'},{"comma",','},{"hyphen",'-'},{"hyphen-minus",'-'},{"period",'.'},{"full-stop",'.'},{"slash",'/'},{"solidus",'/'},{"zero",'0'},{"one",'1'},{"two",'2'},{"three",'3'},{"four",'4'},{"five",'5'},{"six",'6'},{"seven",'7'},{"eight",'8'},{"nine",'9'},{"colon",':'},{"semicolon",';'},{"less-than-sign",'<'},{"equals-sign",'='},{"greater-than-sign",'>'},{"question-mark",'?'},{"commercial-at",'@'},{"left-square-bracket",'['},{"backslash",'\\'},{"reverse-solidus",'\\'},{"right-square-bracket",']'},{"circumflex",'^'},{"circumflex-accent",'^'},{"underscore",'_'},{"low-line",'_'},{"grave-accent",'`'},{"left-brace",'{'},{"left-curly-bracket",'{'},{"vertical-line",'|'},{"right-brace",'}'},{"right-curly-bracket",'}'},{"tilde",'~'},{"DEL",'\177'} }; | ||
649 | CString seeTwo; | ||
650 | seeTwo=term; | ||
651 | seeTwo+=']'; | ||
652 | CString entityName; | ||
653 | while(m_ParsePointer<m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare(seeTwo))) | ||
654 | entityName+=m_Pattern[m_ParsePointer++]; | ||
655 | if(m_ParsePointer>=m_Pattern.GetLength()){ | ||
656 | m_Error || (m_Error=regeBracket); | ||
657 | // ??? point to nuls? | ||
658 | return 0; | ||
659 | } | ||
660 | for(int tmp=0;tmp<(sizeof(cc)/sizeof(cc[0]));tmp++) | ||
661 | if(!entityName.CompareNoCase(cc[tmp].entityName)) | ||
662 | return cc[tmp].entity; | ||
663 | if(entityName.GetLength()==1) | ||
664 | return entityName[0]; | ||
665 | m_Error || (m_Error=regeCollate); | ||
666 | // ??? point to nuls? | ||
667 | return 0; | ||
668 | } | ||
669 | |||
670 | UCHAR CRegEx::ParseBracketSymbol() | ||
671 | { | ||
672 | if(m_ParsePointer>=m_Pattern.GetLength()){ | ||
673 | m_Error || (m_Error=regeBracket); | ||
674 | // ??? point to nuls? | ||
675 | } | ||
676 | if((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("[.")) | ||
677 | m_ParsePointer+=2; | ||
678 | else | ||
679 | return m_Pattern[m_ParsePointer++]; | ||
680 | // Collating symbol | ||
681 | UCHAR rv = ParseBracketCollatingElement('.'); | ||
682 | if((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("[.")) | ||
683 | m_ParsePointer+=2; | ||
684 | else{ | ||
685 | m_Error || (m_Error=regeCollate); | ||
686 | // ??? point to nuls? | ||
687 | } | ||
688 | return rv; | ||
689 | } | ||
690 | |||
691 | void CRegEx::EmitRepeat(int pos,int from,int to) | ||
692 | { | ||
693 | if(m_Error) | ||
694 | return; | ||
695 | ASSERT(from<=to); | ||
696 | int finish = m_Strip.GetSize(); | ||
697 | int copy; | ||
698 | #defineN 2 | ||
699 | #define INF 3 | ||
700 | #define REP(f,t) ((f)*8+(t)) | ||
701 | #define MAP(n) (((n)<=1)?(n):((n)==256)?INF:N) | ||
702 | switch(REP(MAP(from),MAP(to))){ | ||
703 | case REP(0,0):// must be user doing ths?? | ||
704 | m_Strip.SetSize(pos); | ||
705 | break; | ||
706 | case REP(0,1):// as in '?' | ||
707 | case REP(0,N):// as in '{1,n}?' | ||
708 | case REP(0,INF):// as in '{1,}?' | ||
709 | // Kludge - emit y? as (y|) until something gets fixed.. | ||
710 | StripInsert(pos,CSop(CSop::opChoice0,pos)); | ||
711 | EmitRepeat(pos+1,1,to); | ||
712 | m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-pos)); | ||
713 | m_Error || (m_Strip[pos].m_Operand=m_Strip.GetSize()-pos); | ||
714 | m_Error || m_Strip.Add(CSop(CSop::opOr1,1)); | ||
715 | m_Error || m_Strip.Add(CSop(CSop::opChoice1,2)); | ||
716 | break; | ||
717 | case REP(1,1): | ||
718 | break; | ||
719 | case REP(1,N):// as in 'x?x{1,n-1}' | ||
720 | // Kludge again.. | ||
721 | StripInsert(pos,CSop(CSop::opChoice0,pos)); | ||
722 | m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-pos)); | ||
723 | m_Error || (m_Strip[pos].m_Operand=m_Strip.GetSize()-pos); | ||
724 | m_Error || m_Strip.Add(CSop(CSop::opOr1,1)); | ||
725 | m_Error || m_Strip.Add(CSop(CSop::opChoice1,2)); | ||
726 | copy = StripDuplicate(pos+1,finish+1); | ||
727 | ASSERT(copy==(finish+4)); | ||
728 | EmitRepeat(copy,1,to-1); | ||
729 | break; | ||
730 | case REP(1,INF): // as in '+' | ||
731 | StripInsert(pos,CSop(CSop::opPlus0,pos)); | ||
732 | m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos)); | ||
733 | break; | ||
734 | case REP(N,N):// as in 'xx{from-1,to-1}' | ||
735 | copy = StripDuplicate(pos,finish); | ||
736 | EmitRepeat(copy,from-1,to-1); | ||
737 | break; | ||
738 | case REP(N,INF): // as in 'xx{n-1,}' | ||
739 | copy = StripDuplicate(pos,finish); | ||
740 | EmitRepeat(copy,from-1,to); | ||
741 | break; | ||
742 | #ifndef NDEBUG | ||
743 | default: | ||
744 | ASSERT(FALSE); | ||
745 | break; | ||
746 | #endif | ||
747 | } | ||
748 | #undef MAP | ||
749 | #undef REP | ||
750 | #undef INF | ||
751 | #undef N | ||
752 | } | ||
753 | |||
754 | int CRegEx::StripDuplicate(int from,int to) | ||
755 | { | ||
756 | int rv = m_Strip.GetSize(); | ||
757 | ASSERT(from<=to); | ||
758 | if(from==to) | ||
759 | return rv; | ||
760 | // Maybe should be optimized for copying the whole thing. | ||
761 | for(int tmp=from;tmp<to;tmp++) | ||
762 | m_Strip.Add(m_Strip[tmp]); | ||
763 | return rv; | ||
764 | } | ||
765 | |||
766 | void CRegEx::Categorize() | ||
767 | { | ||
768 | if(m_Error) | ||
769 | return; | ||
770 | for(int tmp=0;tmp<(sizeof(m_Category)/sizeof(m_Category[0]));tmp++) | ||
771 | if((!m_Category[tmp]) && IsInSets(tmp)){ | ||
772 | int cat = m_Categories++; | ||
773 | m_Category[tmp]=cat; | ||
774 | for(int c=tmp+1;c<(sizeof(m_Category)/sizeof(m_Category[0]));c++) | ||
775 | if((!m_Category[c]) && IsInSameSets(tmp,c)) | ||
776 | m_Category[c]=cat; | ||
777 | } | ||
778 | } | ||
779 | |||
780 | BOOL CRegEx::IsInSets(UCHAR c) | ||
781 | { | ||
782 | for(int tmp=0;tmp<m_Sets.GetSize();tmp++) | ||
783 | if(m_Sets[tmp].IsIn(c)) | ||
784 | return TRUE; | ||
785 | return FALSE; | ||
786 | } | ||
787 | |||
788 | BOOL CRegEx::IsInSameSets(UCHAR c1,UCHAR c2) | ||
789 | { | ||
790 | for(int tmp=0;tmp<m_Sets.GetSize();tmp++) | ||
791 | if(m_Sets[tmp].IsIn(c1)!=m_Sets[tmp].IsIn(c2)) | ||
792 | return FALSE; | ||
793 | return TRUE; | ||
794 | } | ||
795 | |||
796 | void CRegEx::FigureMust() | ||
797 | { | ||
798 | if(m_Error) | ||
799 | return; | ||
800 | m_Must.Empty(); | ||
801 | int stripLen = m_Strip.GetSize(); | ||
802 | int seqStart, seqLength = 0; | ||
803 | int mustStart, mustLength = 0; | ||
804 | for(int tmp=1;tmp<stripLen;tmp++){ | ||
805 | switch(m_Strip[tmp].m_Operator){ | ||
806 | case CSop::opChar: | ||
807 | if(!seqLength) | ||
808 | seqStart=tmp; | ||
809 | seqLength++; | ||
810 | break; | ||
811 | case CSop::opPlus0: | ||
812 | case CSop::opLeftParen: | ||
813 | case CSop::opRightParen: | ||
814 | break;// Break, since they don't break the sequence | ||
815 | case CSop::opQuest0: | ||
816 | case CSop::opChoice0: | ||
817 | // These ones we skip. | ||
818 | do{ | ||
819 | tmp+=m_Strip[tmp].m_Operand; | ||
820 | // I still think it could be ASSERTed.. | ||
821 | if(m_Strip[tmp].m_Operator!=CSop::opQuest1 && m_Strip[tmp].m_Operator!=CSop::opChoice1 && m_Strip[tmp].m_Operator!=CSop::opOr1){ | ||
822 | m_iFlags|=iflagsBad; | ||
823 | return; | ||
824 | } | ||
825 | }while(m_Strip[tmp].m_Operator!=CSop::opQuest1 && m_Strip[tmp].m_Operator!=CSop::opChoice1); | ||
826 | // Fallthrough.. | ||
827 | default: | ||
828 | // End of sequence | ||
829 | if(seqLength>mustLength){ | ||
830 | mustStart=seqStart; | ||
831 | mustLength=seqLength; | ||
832 | } | ||
833 | seqLength=0; | ||
834 | break; | ||
835 | } | ||
836 | }// Hmm.. originally it's meant to be do while not opEnd.. | ||
837 | if(!mustLength) | ||
838 | return; | ||
839 | // Turn into string, but, wait, personally I'm sure it could be put in the main loop.. or maybe not.. | ||
840 | for(tmp=0;tmp<seqLength;tmp++){ | ||
841 | while(m_Strip[seqStart+tmp].m_Operator!=CSop::opChar) | ||
842 | ASSERT(tmp<seqLength); | ||
843 | m_Must+=m_Strip[tmp].m_Operand; | ||
844 | } | ||
845 | } | ||
846 | |||
847 | int CRegEx::CountPluses() | ||
848 | { | ||
849 | if(m_Error) | ||
850 | return 0; | ||
851 | int stripLen = m_Strip.GetSize(); | ||
852 | int rv = 0; | ||
853 | int nest = 0; | ||
854 | for(int tmp=0;tmp<stripLen;tmp++){ | ||
855 | switch(m_Strip[tmp].m_Operator){ | ||
856 | case CSop::opPlus0: | ||
857 | nest++; | ||
858 | break; | ||
859 | case CSop::opPlus1: | ||
860 | if(nest>rv) | ||
861 | rv = nest; | ||
862 | nest--; | ||
863 | break; | ||
864 | } | ||
865 | }// Again, originally we were supposed to scan till opEnd.. | ||
866 | if(nest) | ||
867 | m_iFlags|=iflagsBad;// Could this be an ASSERTion? | ||
868 | return rv; | ||
869 | } | ||
870 | |||
871 | void CRegEx::ParseLiteral() | ||
872 | { | ||
873 | if(!m_Pattern.GetLength()){ | ||
874 | m_Error || (m_Error=regeEmpty); | ||
875 | // ??? point to nuls? | ||
876 | } | ||
877 | while(m_ParsePointer < m_Pattern.GetLength()) | ||
878 | EmitOrdinary(m_Pattern[m_ParsePointer++]); | ||
879 | } | ||
880 | |||
881 | void CRegEx::ParseBRE(int stopa,int stopb) | ||
882 | { | ||
883 | int start = m_Strip.GetSize(); | ||
884 | BOOL first=TRUE; | ||
885 | BOOL wasDollar=FALSE; | ||
886 | if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='^'){ | ||
887 | m_ParsePointer++; | ||
888 | m_Error || m_Strip.Add(CSop(CSop::opBOL,0)); | ||
889 | m_iFlags|=iflagsUseBOL; | ||
890 | m_BOLs++; | ||
891 | } | ||
892 | CString stopstr; | ||
893 | if(stopa){ | ||
894 | stopstr+=stopa; | ||
895 | if(stopb) | ||
896 | stopstr+=stopb; | ||
897 | } | ||
898 | while(m_ParsePointer < m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare(stopstr))){ | ||
899 | wasDollar = ParseBREexp(first); | ||
900 | first=FALSE; | ||
901 | } | ||
902 | if(wasDollar){// Trailing anchor that was.. | ||
903 | m_Strip.SetSize(m_Strip.GetSize()-1); | ||
904 | m_Error || m_Strip.Add(CSop(CSop::opEOL,0)); | ||
905 | m_iFlags|=iflagsUseEOL; | ||
906 | m_EOLs++; | ||
907 | } | ||
908 | if(m_Strip.GetSize()==start){ | ||
909 | m_Error || (m_Error=regeEmpty); | ||
910 | // ??? point to nuls? | ||
911 | } | ||
912 | } | ||
913 | |||
914 | BOOL CRegEx::ParseBREexp(BOOL ordinaryStar) | ||
915 | { | ||
916 | int subno; | ||
917 | int pos = m_Strip.GetSize(); | ||
918 | ASSERT(m_ParsePointer<m_Pattern.GetLength()); | ||
919 | int c = m_Pattern[m_ParsePointer++]; | ||
920 | if(c=='\\'){ | ||
921 | if(m_ParsePointer>=m_Pattern.GetLength()){ | ||
922 | m_Error || (m_Error=regeEscape); | ||
923 | // ??? point to nuls | ||
924 | }else | ||
925 | c = 0x100|m_Pattern[m_ParsePointer++]; | ||
926 | } | ||
927 | switch(c){ | ||
928 | case '.': | ||
929 | if(m_Flags®NewLine) | ||
930 | EmitNonNewLineAny(); | ||
931 | else | ||
932 | m_Error || m_Strip.Add(CSop(CSop::opAny,0)); | ||
933 | break; | ||
934 | case '[': | ||
935 | ParseBracket(); | ||
936 | break; | ||
937 | case 0x100|'{': | ||
938 | m_Error || (m_Error=regeBadRepeat); | ||
939 | // ??? point to nuls? | ||
940 | break; | ||
941 | case 0x100|'(': | ||
942 | m_Subexps++; | ||
943 | subno=m_Subexps; | ||
944 | m_ParseParens.SetAtGrow(m_Subexps,CParenthesis(m_Strip.GetSize())); | ||
945 | m_Error || m_Strip.Add(CSop(CSop::opLeftParen,subno)); | ||
946 | if(m_ParsePointer<m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\)"))) | ||
947 | ParseBRE('\\',')'); | ||
948 | VERIFY(m_ParseParens[m_Subexps].m_End = m_Strip.GetSize()); | ||
949 | m_Error || m_Strip.Add(CSop(CSop::opRightParen,subno)); | ||
950 | if((m_ParsePointer+1) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\)")) | ||
951 | m_ParsePointer+=2; | ||
952 | else{ | ||
953 | m_Error || (m_Error=regeParen); | ||
954 | // ??? point to nuls? | ||
955 | } | ||
956 | break; | ||
957 | case 0x100|')': | ||
958 | case 0x100|'}': | ||
959 | // Can this possibly happen?! | ||
960 | m_Error || (m_Error=regeParen); | ||
961 | // ??? point to nuls? | ||
962 | break; | ||
963 | case 0x100|'1': | ||
964 | case 0x100|'2': | ||
965 | case 0x100|'3': | ||
966 | case 0x100|'4': | ||
967 | case 0x100|'5': | ||
968 | case 0x100|'6': | ||
969 | case 0x100|'7': | ||
970 | case 0x100|'8': | ||
971 | case 0x100|'9': | ||
972 | { | ||
973 | int i = (c&0xFF)-'0'; | ||
974 | if(i < m_ParseParens.GetSize() && m_ParseParens[i].m_End){ | ||
975 | m_Error || m_Strip.Add(CSop(CSop::opBackRef0,i)); | ||
976 | ASSERT(m_ParseParens[i].m_Begin); | ||
977 | ASSERT(m_Strip[m_ParseParens[i].m_Begin].m_Operator==CSop::opLeftParen); | ||
978 | ASSERT(m_Strip[m_ParseParens[i].m_End].m_Operator==CSop::opRightParen); | ||
979 | StripDuplicate(m_ParseParens[i].m_Begin+1,m_ParseParens[i].m_End); | ||
980 | m_Error || m_Strip.Add(CSop(CSop::opBackRef1,i)); | ||
981 | }else{ | ||
982 | m_Error || (m_Error=regeSubReg); | ||
983 | // ??? point to nuls? | ||
984 | } | ||
985 | m_bBackRefs=TRUE; | ||
986 | } | ||
987 | break; | ||
988 | case '*': | ||
989 | if(!ordinaryStar){ | ||
990 | m_Error || (m_Error=regeBadRepeat); | ||
991 | // ??? point to nuls? | ||
992 | } | ||
993 | // Fallthrough.. | ||
994 | default: | ||
995 | EmitOrdinary(c&0xFF); | ||
996 | break; | ||
997 | } | ||
998 | if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='*'){ | ||
999 | m_ParsePointer++; | ||
1000 | // as in '+?' | ||
1001 | StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1)); | ||
1002 | m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos)); | ||
1003 | StripInsert(pos,CSop(CSop::opQuest0,m_Strip.GetSize()-pos+1)); | ||
1004 | m_Error || m_Strip.Add(CSop(CSop::opQuest1,m_Strip.GetSize()-pos)); | ||
1005 | }else if ((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\{")){ | ||
1006 | m_ParsePointer+=2; | ||
1007 | int count = ParseCount(); | ||
1008 | int count2; | ||
1009 | if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]==','){ | ||
1010 | m_ParsePointer++; | ||
1011 | if(m_ParsePointer<m_Pattern.GetLength() && isdigit(m_Pattern[m_ParsePointer])){ | ||
1012 | count2=ParseCount(); | ||
1013 | if(count>count2){ | ||
1014 | m_Error || (m_Error=regeBadBrace); | ||
1015 | // ??? poin to nuls? | ||
1016 | } | ||
1017 | }else // Single number with comma | ||
1018 | count2=256; | ||
1019 | }else // Single number | ||
1020 | count2=count; | ||
1021 | EmitRepeat(pos,count,count2); | ||
1022 | if((m_ParsePointer+1)>=m_Pattern.GetLength() || m_Pattern.Mid(m_ParsePointer,2).Compare("\\}")){ | ||
1023 | while(m_ParsePointer<m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\}"))) | ||
1024 | m_ParsePointer++; | ||
1025 | if(m_ParsePointer>=m_Pattern.GetLength()){ | ||
1026 | m_Error || (m_Error=regeBrace); | ||
1027 | // ??? point to nuls? | ||
1028 | } | ||
1029 | m_Error || (m_Error=regeBadBrace); | ||
1030 | }else | ||
1031 | m_ParsePointer+=2; | ||
1032 | }else if(c=='$') | ||
1033 | return TRUE; | ||
1034 | return FALSE; | ||
1035 | } | ||
1036 | |||
1037 | CRegEx::CRegEx() | ||
1038 | { | ||
1039 | m_bCompiled=FALSE; | ||
1040 | } | ||
1041 | |||
1042 | LPCTSTR CRegEx::MatchFast(LPCTSTR begin) | ||
1043 | { | ||
1044 | MatchStatesClear(CSop::stCurrent); | ||
1045 | m_Strip[1].m_MatchData|=CSop::stCurrent; | ||
1046 | int stripLen = m_Strip.GetSize(); | ||
1047 | MatchStep(1,stripLen-1,CSop::stCurrent,charNothing,CSop::stCurrent); | ||
1048 | MatchStatesCopy(CSop::stFresh,CSop::stCurrent); | ||
1049 | LPCTSTR coldp = NULL; | ||
1050 | LPCTSTR p = begin; | ||
1051 | int c = (begin==m_mBegin)?charOut:((int)(BYTE)m_mPointer[-1]); | ||
1052 | for(;;){ | ||
1053 | // next character.. | ||
1054 | int lastc = c; | ||
1055 | c = (p==m_mEnd)?charOut:(int)*(BYTE*)p; | ||
1056 | if(MatchStatesEqual(CSop::stCurrent,CSop::stFresh)) | ||
1057 | coldp=p; | ||
1058 | // Is there an EOL and/or BOL between lastc and c? - they ask.. | ||
1059 | intflagc=0; | ||
1060 | inti = 0; | ||
1061 | if((lastc=='\n' && m_Flags®NewLine) || (lastc==charOut && !(m_mFlags®NotBOL))){ | ||
1062 | flagc=charBOL; | ||
1063 | i=m_BOLs; | ||
1064 | } | ||
1065 | if((c=='\n' && m_Flags®NewLine) || (c==charOut && !(m_mFlags®NotEOL))){ | ||
1066 | flagc=(flagc==charBOL)?charBOLEOL:charEOL; | ||
1067 | i+=m_EOLs; | ||
1068 | } | ||
1069 | if(i){ | ||
1070 | for(;i>0;i--) | ||
1071 | MatchStep(1,stripLen-1,CSop::stCurrent,flagc,CSop::stCurrent); | ||
1072 | } | ||
1073 | // What about a word boundary? - they wonder.. | ||
1074 | if((flagc==charBOL || (lastc!=charOut && !isWordableChar(c))) && (c!=charOut && isWordableChar(c))) | ||
1075 | flagc = charBOW; | ||
1076 | if((lastc!=charOut && isWordableChar(lastc)) && (flagc==charEOL || (c!=charOut && !isWordableChar(c)))) | ||
1077 | flagc = charEOW; | ||
1078 | if(flagc==charBOW || flagc==charEOW){ | ||
1079 | MatchStep(1,stripLen-1,CSop::stCurrent,flagc,CSop::stCurrent); | ||
1080 | } | ||
1081 | // Are we done? Now WE wonder.. | ||
1082 | if((m_Strip[stripLen-1].m_MatchData&CSop::stCurrent) || p==m_mEnd) | ||
1083 | break;// They insist I need to note break out.. Okay, I do.. | ||
1084 | // Nope, we're not done. We have to face this character.. | ||
1085 | MatchStatesCopy(CSop::stTemp,CSop::stCurrent); | ||
1086 | MatchStatesCopy(CSop::stCurrent,CSop::stFresh); | ||
1087 | ASSERT(c!=charOut); | ||
1088 | MatchStep(1,stripLen-1,CSop::stTemp,c,CSop::stCurrent); | ||
1089 | p++; | ||
1090 | } | ||
1091 | ASSERT(coldp); | ||
1092 | m_cOldP=coldp;// *** I believe this variable can be changed 'in-place' | ||
1093 | if(m_Strip[stripLen-1].m_MatchData&CSop::stCurrent) | ||
1094 | return &p[1]; | ||
1095 | else | ||
1096 | return NULL; | ||
1097 | } | ||
1098 | |||
1099 | void CRegEx::MatchStatesClear(BYTE mask) | ||
1100 | { | ||
1101 | int stripLen = m_Strip.GetSize(); | ||
1102 | for(int tmp=0;tmp<stripLen;tmp++) | ||
1103 | m_Strip[tmp].m_MatchData&=~mask; | ||
1104 | } | ||
1105 | |||
1106 | void CRegEx::MatchStep(int from,int to,BYTE maskBefore,int charCode,BYTE maskAfter) | ||
1107 | { | ||
1108 | BOOL i; | ||
1109 | int look; | ||
1110 | int here = from; | ||
1111 | for(int pc=from;pc!=to;pc++,here++){ | ||
1112 | CSop s=m_Strip[pc]; | ||
1113 | switch(s.m_Operator){ | ||
1114 | case CSop::opEnd: | ||
1115 | ASSERT(pc==(to-1)); | ||
1116 | break; | ||
1117 | case CSop::opChar: | ||
1118 | // Only characters can match.. | ||
1119 | ASSERT((charCode<charOut) || charCode!=s.m_Operand); | ||
1120 | if(charCode==s.m_Operand) | ||
1121 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0; | ||
1122 | break; | ||
1123 | case CSop::opBOL: | ||
1124 | if(charCode==charBOL || charCode==charBOLEOL) | ||
1125 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0; | ||
1126 | break; | ||
1127 | case CSop::opEOL: | ||
1128 | if(charCode==charEOL || charCode==charBOLEOL) | ||
1129 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0; | ||
1130 | break; | ||
1131 | case CSop::opBOW: | ||
1132 | if(charCode==charBOW) | ||
1133 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0; | ||
1134 | break; | ||
1135 | case CSop::opEOW: | ||
1136 | if(charCode==charEOW) | ||
1137 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0; | ||
1138 | break; | ||
1139 | case CSop::opAny: | ||
1140 | if(charCode<charOut) | ||
1141 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0; | ||
1142 | break; | ||
1143 | case CSop::opAnyOf: | ||
1144 | if(charCode<charOut && m_Sets[s.m_Operand].m_Set[charCode]) | ||
1145 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0; | ||
1146 | break; | ||
1147 | case CSop::opBackRef0:// Ignored here.. | ||
1148 | case CSop::opBackRef1: | ||
1149 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0; | ||
1150 | break; | ||
1151 | case CSop::opPlus0: | ||
1152 | // Forward, this is just an empty, comment says.. | ||
1153 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0; | ||
1154 | break; | ||
1155 | case CSop::opPlus1: | ||
1156 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0; | ||
1157 | i =(m_Strip[here-s.m_Operand].m_MatchData&maskAfter)!=0; | ||
1158 | m_Strip[here-s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0; | ||
1159 | if(!i && (m_Strip[here-s.m_Operand].m_MatchData&maskAfter)){ | ||
1160 | // oho, must reconsider loop body, comment says.. what's so 'oho' about it? | ||
1161 | pc-=s.m_Operand+1; | ||
1162 | here=pc; | ||
1163 | } | ||
1164 | break; | ||
1165 | case CSop::opQuest0: | ||
1166 | // two branches, both forward.. | ||
1167 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0; | ||
1168 | m_Strip[here+s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0; | ||
1169 | break; | ||
1170 | case CSop::opQuest1: | ||
1171 | // just an empty.. aren't we tired of justanempties? | ||
1172 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0; | ||
1173 | break; | ||
1174 | case CSop::opLeftParen: // they say it's not significan there.. | ||
1175 | case CSop::opRightParen: | ||
1176 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0; | ||
1177 | break; | ||
1178 | case CSop::opChoice0:// mark the first two branches.. | ||
1179 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0; | ||
1180 | ASSERT(m_Strip[pc+s.m_Operand].m_Operator==CSop::opOr1); | ||
1181 | m_Strip[here+s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0; | ||
1182 | break; | ||
1183 | case CSop::opOr0:// done a branch, find the end of choice.. | ||
1184 | if(m_Strip[here].m_MatchData&maskAfter){ | ||
1185 | for(look=1;(s=m_Strip[pc+look]).m_Operator!=CSop::opChoice1;look+=s.m_Operand) | ||
1186 | ASSERT(s.m_Operator==CSop::opOr1); | ||
1187 | m_Strip[here+look].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0; | ||
1188 | } | ||
1189 | break; | ||
1190 | case CSop::opOr1: // Propagate Choice's marking.. | ||
1191 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0; | ||
1192 | if(m_Strip[pc+s.m_Operand].m_Operator!=CSop::opChoice1){ | ||
1193 | ASSERT(m_Strip[pc+s.m_Operand].m_Operator==CSop::opOr1); | ||
1194 | m_Strip[here+s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0; | ||
1195 | } | ||
1196 | break; | ||
1197 | case CSop::opChoice1: // Just empty.. :-).. | ||
1198 | m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0; | ||
1199 | break; | ||
1200 | #ifdef _DEBUG | ||
1201 | default: | ||
1202 | ASSERT(FALSE); | ||
1203 | break; | ||
1204 | #endif | ||
1205 | } | ||
1206 | } | ||
1207 | } | ||
1208 | |||
1209 | void CRegEx::MatchStatesCopy(BYTE dst,BYTE src) | ||
1210 | { | ||
1211 | int stripLen = m_Strip.GetSize(); | ||
1212 | for(int tmp=0;tmp<stripLen;tmp++){ | ||
1213 | // I believe this can be optimized, easily.. | ||
1214 | m_Strip[tmp].m_MatchData&=~dst; | ||
1215 | m_Strip[tmp].m_MatchData|=(m_Strip[tmp].m_MatchData&src)?dst:0; | ||
1216 | } | ||
1217 | } | ||
1218 | |||
1219 | BOOL CRegEx::MatchStatesEqual(BYTE m1,BYTE m2) | ||
1220 | { | ||
1221 | int stripLen = m_Strip.GetSize(); | ||
1222 | for(int tmp=0;tmp<stripLen;tmp++){ | ||
1223 | BYTE mm = m_Strip[tmp].m_MatchData; | ||
1224 | if(((mm&m1) && (mm&m2)) || !(mm&(m1|m2))) | ||
1225 | continue; | ||
1226 | return FALSE; | ||
1227 | } | ||
1228 | return TRUE; | ||
1229 | } | ||
1230 | |||
1231 | LPCTSTR CRegEx::MatchSlow(LPCTSTR begin,LPCTSTR end,int from,int to) | ||
1232 | { | ||
1233 | MatchStatesClear(CSop::stCurrent); | ||
1234 | m_Strip[from].m_MatchData|=CSop::stCurrent; | ||
1235 | MatchStep(from,to,CSop::stCurrent,charNothing,CSop::stCurrent); | ||
1236 | LPCTSTR mp = NULL; | ||
1237 | int c = (m_mBegin==m_mPointer)?charOut:(int)(BYTE)begin[-1]; | ||
1238 | LPCTSTR p = begin; | ||
1239 | for(;;){ | ||
1240 | // next character.. | ||
1241 | int lastc = c; | ||
1242 | c = (p==m_mEnd)?charOut:(int)*(BYTE*)p; | ||
1243 | // Now we start to wonder if there is an EOL and/or BOL between lastc and c | ||
1244 | int flagc = 0; | ||
1245 | int i = 0; | ||
1246 | if((lastc=='\n' && m_Flags®NewLine) || (lastc==charOut && !(m_mFlags®NotBOL))){ | ||
1247 | flagc = charBOL; | ||
1248 | i = m_BOLs; | ||
1249 | } | ||
1250 | if((c=='\n' && m_Flags®NewLine) || (c==charOut && !(m_mFlags®NotEOL))){ | ||
1251 | flagc = (flagc==charBOL)?charBOLEOL:charEOL; | ||
1252 | i+=m_EOLs; | ||
1253 | } | ||
1254 | if(i){ | ||
1255 | for(;i>0;i--) | ||
1256 | MatchStep(from,to,CSop::stCurrent,flagc,CSop::stCurrent); | ||
1257 | } | ||
1258 | // Now we wonder about word boundaries.. | ||
1259 | if((flagc==charBOL || (lastc!=charOut && !isWordableChar(lastc))) && (c!=charOut && isWordableChar(c))) | ||
1260 | flagc=charBOW; | ||
1261 | if((lastc!=charOut && isWordableChar(lastc)) && (flagc==charEOL || (c!=charOut && !isWordableChar(c)))) | ||
1262 | flagc=charEOW; | ||
1263 | if(flagc==charBOW || flagc==charEOW){ | ||
1264 | MatchStep(from,to,CSop::stCurrent,flagc,CSop::stCurrent); | ||
1265 | } | ||
1266 | // Are we done we all wonder?? | ||
1267 | if(m_Strip[to].m_MatchData&CSop::stCurrent) | ||
1268 | mp=p; | ||
1269 | if(MatchStatesEqual(CSop::stCurrent,CSop::stEmpty) || p==end) | ||
1270 | break;// Again, we're obliged to note break out. We do. | ||
1271 | // Sucks.. we have to face this character.. | ||
1272 | MatchStatesCopy(CSop::stTemp,CSop::stCurrent); | ||
1273 | MatchStatesCopy(CSop::stCurrent,CSop::stEmpty); | ||
1274 | ASSERT(c!=charOut); | ||
1275 | MatchStep(from,to,CSop::stTemp,c,CSop::stCurrent); | ||
1276 | MatchStep(from,to,CSop::stCurrent,charNothing,CSop::stCurrent); | ||
1277 | p++; | ||
1278 | } | ||
1279 | return mp; | ||
1280 | } | ||
1281 | |||
1282 | LPCTSTR CRegEx::MatchDissect(LPCTSTR begin,LPCTSTR end,int from,int to) | ||
1283 | { | ||
1284 | LPCTSTR sp = begin, dp; | ||
1285 | LPCTSTR stp, rest, tail, ssp, oldssp, sep; | ||
1286 | int ssub, esub; | ||
1287 | int es; | ||
1288 | int i; | ||
1289 | for(int ss=from;ss<to;ss = es){ | ||
1290 | // Identify end of SubRE | ||
1291 | es = ss; | ||
1292 | switch(m_Strip[es].m_Operator){ | ||
1293 | case CSop::opPlus0: | ||
1294 | case CSop::opQuest0: | ||
1295 | es+=m_Strip[es].m_Operand; | ||
1296 | break; | ||
1297 | case CSop::opChoice0: | ||
1298 | while(m_Strip[es].m_Operator!=CSop::opChoice1) | ||
1299 | es+=m_Strip[es].m_Operand; | ||
1300 | break; | ||
1301 | } | ||
1302 | es++; | ||
1303 | // Figure out what it matched | ||
1304 | switch(m_Strip[ss].m_Operator){ | ||
1305 | case CSop::opEnd: | ||
1306 | ASSERT(FALSE); | ||
1307 | break; | ||
1308 | case CSop::opChar: | ||
1309 | sp++; | ||
1310 | break; | ||
1311 | case CSop::opBOL: | ||
1312 | case CSop::opEOL: | ||
1313 | case CSop::opBOW: | ||
1314 | case CSop::opEOW: | ||
1315 | break; | ||
1316 | case CSop::opAny: | ||
1317 | case CSop::opAnyOf: | ||
1318 | sp++; | ||
1319 | break; | ||
1320 | case CSop::opBackRef0: | ||
1321 | case CSop::opBackRef1: | ||
1322 | ASSERT(FALSE); | ||
1323 | break; | ||
1324 | // Cases where lenght of match is hard to find.. | ||
1325 | case CSop::opQuest0: | ||
1326 | stp=end; | ||
1327 | for(;;){ | ||
1328 | // How long could this one be?? | ||
1329 | rest = MatchSlow(sp,stp,ss,es); | ||
1330 | ASSERT(rest);// It did match.. It should've.. | ||
1331 | // Could the rest match the rest? (good question) | ||
1332 | tail = MatchSlow(rest,end,es,to); | ||
1333 | if(tail==end) | ||
1334 | break;// yup. | ||
1335 | // nope, try a shorter match for this one.. | ||
1336 | stp=rest-1; | ||
1337 | ASSERT(stp>=sp);// It did work.. It should've.. | ||
1338 | } | ||
1339 | ssub=ss+1; | ||
1340 | esub=es-1; | ||
1341 | // Did innards match? | ||
1342 | if(MatchSlow(sp,rest,ssub,esub)){ | ||
1343 | dp = MatchDissect(sp,rest,ssub,esub); | ||
1344 | ASSERT(dp==rest); | ||
1345 | }else// nope.. | ||
1346 | ASSERT(sp==rest); | ||
1347 | sp = rest; | ||
1348 | break; | ||
1349 | case CSop::opPlus0: | ||
1350 | stp=end; | ||
1351 | for(;;){ | ||
1352 | // How long could this one be?? | ||
1353 | rest = MatchSlow(sp,stp,ss,es); | ||
1354 | ASSERT(rest);// It did.. It should've.. | ||
1355 | // Could the rest match the rest? | ||
1356 | tail = MatchSlow(rest,end,es,to); | ||
1357 | if(tail==end) | ||
1358 | break;// yup. | ||
1359 | // nope.. | ||
1360 | stp=rest-1; | ||
1361 | ASSERT(stp>=sp);// It should've worked, we believe.. | ||
1362 | } | ||
1363 | ssub=ss+1; | ||
1364 | esub=es-1; | ||
1365 | ssp=sp; | ||
1366 | oldssp=ssp; | ||
1367 | for(;;){// Find last match of innards.. | ||
1368 | sep = MatchSlow(ssp,rest,ssub,esub); | ||
1369 | if((!sep) || sep==ssp) | ||
1370 | break; // Failed or matched nothin' | ||
1371 | oldssp=ssp; | ||
1372 | ssp=sep; | ||
1373 | } | ||
1374 | if(!sep){ | ||
1375 | // Last successfull.. | ||
1376 | sep=ssp; | ||
1377 | ssp=oldssp; | ||
1378 | } | ||
1379 | ASSERT(sep=rest);// Must exhaust substring they say.. | ||
1380 | VERIFY(MatchSlow(ssp,sep,ssub,esub)==rest);// Assert or verify - that is the question.. | ||
1381 | dp = MatchDissect(ssp,sep,ssub,esub); | ||
1382 | ASSERT(dp==sep); | ||
1383 | sp=rest; | ||
1384 | break; | ||
1385 | case CSop::opChoice0: | ||
1386 | stp = end; | ||
1387 | for(;;){ | ||
1388 | // how long.. | ||
1389 | rest = MatchSlow(sp,stp,ss,es); | ||
1390 | ASSERT(rest); | ||
1391 | // Could it.. | ||
1392 | tail = MatchSlow(rest,end,es,to); | ||
1393 | if(tail==end) | ||
1394 | break;// y | ||
1395 | // n | ||
1396 | stp = rest-1; | ||
1397 | ASSERT(stp>=sp); | ||
1398 | } | ||
1399 | ssub=ss+1; | ||
1400 | esub=ss+m_Strip[ss].m_Operand-1; | ||
1401 | ASSERT(m_Strip[esub].m_Operator==CSop::opOr0); | ||
1402 | for(;;){// Find first matching branch.. | ||
1403 | if(MatchSlow(sp,rest,ssub,esub)==rest) | ||
1404 | break; | ||
1405 | // this one missed, try next.. | ||
1406 | ASSERT(m_Strip[esub].m_Operator==CSop::opOr0); | ||
1407 | esub++; | ||
1408 | ASSERT(m_Strip[esub].m_Operator==CSop::opOr1); | ||
1409 | ssub=esub+1; | ||
1410 | esub+=m_Strip[esub].m_Operand; | ||
1411 | if(m_Strip[esub].m_Operator==CSop::opOr1) | ||
1412 | esub--; | ||
1413 | else | ||
1414 | ASSERT(m_Strip[esub].m_Operator==CSop::opChoice1); | ||
1415 | } | ||
1416 | dp = MatchDissect(sp,rest,ssub,esub); | ||
1417 | ASSERT(dp==rest); | ||
1418 | sp=rest; | ||
1419 | break; | ||
1420 | case CSop::opPlus1: | ||
1421 | case CSop::opQuest1: | ||
1422 | case CSop::opOr0: | ||
1423 | case CSop::opOr1: | ||
1424 | case CSop::opChoice1: | ||
1425 | ASSERT(FALSE); | ||
1426 | break; | ||
1427 | case CSop::opLeftParen: | ||
1428 | i = m_Strip[ss].m_Operand; | ||
1429 | ASSERT(0<i && i<=m_Subexps); | ||
1430 | m_Matches[i].m_Begin=sp-m_mBegin; | ||
1431 | break; | ||
1432 | case CSop::opRightParen: | ||
1433 | i = m_Strip[ss].m_Operand; | ||
1434 | ASSERT(0<i && i<=m_Subexps); | ||
1435 | m_Matches[i].m_End=sp-m_mBegin; | ||
1436 | break; | ||
1437 | #ifdef _DEBUG | ||
1438 | default:// Uh.. oh.. | ||
1439 | ASSERT(FALSE); | ||
1440 | break; | ||
1441 | #endif | ||
1442 | } | ||
1443 | } | ||
1444 | ASSERT(sp==end); | ||
1445 | return sp; | ||
1446 | } | ||
1447 | |||
1448 | LPCTSTR CRegEx::MatchBackRef(LPCTSTR begin,LPCTSTR end,int from,int to,int level) | ||
1449 | { | ||
1450 | LPCTSTR sp = begin; | ||
1451 | BOOL hard = FALSE; | ||
1452 | // Get as far as we can as long as it's easy | ||
1453 | for(int ss=from;!hard && ss<to;ss++){ | ||
1454 | CSop s = m_Strip[ss]; | ||
1455 | switch(s.m_Operator){ | ||
1456 | case CSop::opChar: | ||
1457 | if(sp==end || *sp++ != s.m_Operand) | ||
1458 | return NULL; | ||
1459 | break; | ||
1460 | case CSop::opAny: | ||
1461 | if(sp==end) | ||
1462 | return NULL; | ||
1463 | sp++;// I'm sure this ++ could be embedded in previous expression.. | ||
1464 | break; | ||
1465 | case CSop::opAnyOf: | ||
1466 | if(sp==end || !m_Sets[s.m_Operand].IsIn(*sp++)) | ||
1467 | return NULL; | ||
1468 | break; | ||
1469 | case CSop::opBOL: | ||
1470 | if(!((sp==m_mBegin && !(m_mFlags®NotBOL)) || (sp<m_mEnd && *(sp-1)=='\n' && (m_Flags®NewLine)))) | ||
1471 | return NULL; | ||
1472 | break; | ||
1473 | case CSop::opEOL: | ||
1474 | if(!((sp==m_mEnd && !(m_mFlags®NotEOL)) || (sp<m_mEnd && *sp=='\n' && (m_Flags®NewLine)))) | ||
1475 | return NULL; | ||
1476 | break; | ||
1477 | case CSop::opBOW: | ||
1478 | if(!(((sp==m_mBegin && !(m_mFlags®NotBOL)) || (sp<m_mEnd && *(sp-1)=='\n' && (m_Flags®NewLine)) || (sp>m_mBegin && !isWordableChar(*(sp-1)))) && (sp<m_mEnd && isWordableChar(*sp)))) | ||
1479 | return NULL; | ||
1480 | break; | ||
1481 | case CSop::opEOW: | ||
1482 | if(!(((sp==m_mEnd && !(m_mFlags®NotEOL)) || (sp<m_mEnd && *sp=='\n' && (m_Flags®NewLine)) || (sp<m_mEnd && !isWordableChar(*sp))) && (sp>m_mBegin && isWordableChar(*(sp-1))))) | ||
1483 | return NULL; | ||
1484 | break; | ||
1485 | case CSop::opQuest1: | ||
1486 | break; | ||
1487 | case CSop::opOr0:// Matches null, but needs to skip | ||
1488 | ss++; | ||
1489 | s = m_Strip[ss]; | ||
1490 | do{ | ||
1491 | ASSERT(s.m_Operator==CSop::opOr1); | ||
1492 | ss+=s.m_Operand; | ||
1493 | }while((s=m_Strip[ss]).m_Operator!=CSop::opChoice1); | ||
1494 | // Now we should note that ss++ gets us past the Choice1.. | ||
1495 | break; | ||
1496 | default: | ||
1497 | // Have to make a choice.. | ||
1498 | hard=TRUE; | ||
1499 | break; | ||
1500 | } | ||
1501 | } | ||
1502 | if(!hard){// That was it.. | ||
1503 | if(sp!=end) | ||
1504 | return NULL; | ||
1505 | return sp; | ||
1506 | } | ||
1507 | ss--;// Adjust for ther for's final increment.. | ||
1508 | // Hard stuff.. is going on and on.. | ||
1509 | CSop s = m_Strip[ss]; | ||
1510 | int i, len, offsave; | ||
1511 | int ssub,esub; | ||
1512 | LPCTSTR ssp, dp; | ||
1513 | switch(s.m_Operator){ | ||
1514 | case CSop::opBackRef0:// The vilest depths they say.. If I only knew what the 'viles' stands for.. | ||
1515 | i = s.m_Operand; | ||
1516 | ASSERT(0<i && i<=m_Subexps); | ||
1517 | if(m_Matches[i].m_End<0) | ||
1518 | return NULL; | ||
1519 | ASSERT(m_Matches[i].m_Begin>=0); | ||
1520 | len = m_Matches[i].GetLength(); | ||
1521 | ASSERT((end-m_mBegin)>=len); | ||
1522 | if(sp>end-len) | ||
1523 | return NULL;// Not enough left to match.. | ||
1524 | ssp = m_mBegin+m_Matches[i].m_Begin; | ||
1525 | if(memcmp(sp,ssp,len)) | ||
1526 | return NULL; | ||
1527 | while(m_Strip[ss]!=CSop(CSop::opBackRef1,i)) | ||
1528 | ss++; | ||
1529 | return MatchBackRef(sp+len,end,ss+1,to,level-1); | ||
1530 | break; | ||
1531 | case CSop::opQuest0:// to null or not they wonder.. | ||
1532 | dp = MatchBackRef(sp,end,ss+1,to,level); | ||
1533 | if(dp) | ||
1534 | return dp;// not.. | ||
1535 | return MatchBackRef(sp,end,ss+s.m_Operand+1,to,level-1); | ||
1536 | break; | ||
1537 | case CSop::opPlus0: | ||
1538 | ASSERT(m_mLastPos.GetSize()); | ||
1539 | ASSERT(level+1 <= m_Pluses); | ||
1540 | m_mLastPos[level+1]=sp; | ||
1541 | return MatchBackRef(sp,end,ss+1,to,level+1); | ||
1542 | break; | ||
1543 | case CSop::opPlus1: | ||
1544 | if(sp == m_mLastPos[level])// Last pass matched null | ||
1545 | return MatchBackRef(sp,end,ss+1,to,level-1); | ||
1546 | // Try another pass.. | ||
1547 | m_mLastPos[level]=sp; | ||
1548 | dp = MatchBackRef(sp,end,ss-s.m_Operand+1,to,level); | ||
1549 | if(dp) | ||
1550 | return dp; | ||
1551 | return MatchBackRef(sp,end,ss+1,to,level-1); | ||
1552 | break; | ||
1553 | case CSop::opChoice0:// find the right one, ifany | ||
1554 | ssub = ss+1; | ||
1555 | esub = ss+s.m_Operand-1; | ||
1556 | ASSERT(m_Strip[esub].m_Operator==CSop::opOr0); | ||
1557 | for(;;){// Find first matching branch. | ||
1558 | dp = MatchBackRef(sp,end,ssub,esub,level); | ||
1559 | if(dp) | ||
1560 | return dp; | ||
1561 | // This one missed, try next one.. | ||
1562 | if(m_Strip[esub].m_Operator==CSop::opChoice1) | ||
1563 | return NULL;// There is none.. | ||
1564 | esub++; | ||
1565 | ASSERT(m_Strip[esub].m_Operator==CSop::opOr1); | ||
1566 | ssub=esub+1; | ||
1567 | esub+=m_Strip[esub].m_Operand; | ||
1568 | if(m_Strip[esub].m_Operator==CSop::opOr1) | ||
1569 | esub--; | ||
1570 | else | ||
1571 | ASSERT(m_Strip[esub].m_Operator==CSop::opChoice1); | ||
1572 | } | ||
1573 | break; | ||
1574 | case CSop::opLeftParen:// Must undo assignment if rest fails.. | ||
1575 | i = s.m_Operand; | ||
1576 | ASSERT(0<i && i<=m_Subexps); | ||
1577 | offsave = m_Matches[i].m_Begin; | ||
1578 | m_Matches[i].m_Begin = sp-m_mBegin; | ||
1579 | dp = MatchBackRef(sp,end,ss+1,to,level); | ||
1580 | if(dp) | ||
1581 | return dp; | ||
1582 | m_Matches[i].m_Begin=offsave; | ||
1583 | return NULL; | ||
1584 | break; | ||
1585 | case CSop::opRightParen: // Must undo assignment if rest fails.. | ||
1586 | i = s.m_Operand; | ||
1587 | ASSERT(0<i && i<=m_Subexps); | ||
1588 | offsave = m_Matches[i].m_End; | ||
1589 | m_Matches[i].m_End = sp-m_mBegin; | ||
1590 | dp = MatchBackRef(sp,end,ss+1,to,level); | ||
1591 | if(dp) | ||
1592 | return dp; | ||
1593 | m_Matches[i].m_End = offsave; | ||
1594 | return NULL; | ||
1595 | break; | ||
1596 | #ifdef_DEBUG | ||
1597 | default: | ||
1598 | ASSERT(FALSE); | ||
1599 | break; | ||
1600 | #endif | ||
1601 | } | ||
1602 | ASSERT(FALSE); | ||
1603 | return NULL;// Anyway - we can never get here.. | ||
1604 | } | ||
1605 | |||
1606 | #ifdef_DEBUG | ||
1607 | void CRegEx::CSop::Dump(CDumpContext& dc) | ||
1608 | { | ||
1609 | switch(m_Operator){ | ||
1610 | case opEnd: | ||
1611 | dc << "end"; | ||
1612 | break; | ||
1613 | case opChar: | ||
1614 | dc << "char('" << (char)m_Operand << "')"; | ||
1615 | break; | ||
1616 | case opBOL: | ||
1617 | dc << "BOL"; | ||
1618 | break; | ||
1619 | case opEOL: | ||
1620 | dc << "EOL"; | ||
1621 | break; | ||
1622 | case opAny: | ||
1623 | dc << "any"; | ||
1624 | break; | ||
1625 | case opAnyOf: | ||
1626 | dc << "anyOf(" << m_Operand << ")"; | ||
1627 | break; | ||
1628 | case opBackRef0: | ||
1629 | dc << "[ backref(" << m_Operand << ")"; | ||
1630 | break; | ||
1631 | case opBackRef1: | ||
1632 | dc << "] backref(" << m_Operand << ")"; | ||
1633 | break; | ||
1634 | case opPlus0: | ||
1635 | dc << "[ + (" << m_Operand << ")"; | ||
1636 | break; | ||
1637 | case opPlus1: | ||
1638 | dc << "] + (" << m_Operand << ")"; | ||
1639 | break; | ||
1640 | case opQuest0: | ||
1641 | dc << "[ ? (" << m_Operand << ")"; | ||
1642 | break; | ||
1643 | case opQuest1: | ||
1644 | dc << "] ? (" << m_Operand << ")"; | ||
1645 | break; | ||
1646 | case opLeftParen: | ||
1647 | dc << "[ ( (" << m_Operand << ")"; | ||
1648 | break; | ||
1649 | case opRightParen: | ||
1650 | dc << "] ) (" << m_Operand << ")"; | ||
1651 | break; | ||
1652 | case opChoice0: | ||
1653 | dc << "[ choice (" << m_Operand << ")"; | ||
1654 | break; | ||
1655 | case opOr0: | ||
1656 | dc << "[ | (" << m_Operand << ")"; | ||
1657 | break; | ||
1658 | case opOr1: | ||
1659 | dc << "] | (" << m_Operand << ")"; | ||
1660 | break; | ||
1661 | case opChoice1: | ||
1662 | dc << "] choice (" << m_Operand << ")"; | ||
1663 | break; | ||
1664 | case opBOW: | ||
1665 | dc << "BOW"; | ||
1666 | break; | ||
1667 | case opEOW: | ||
1668 | dc << "EOW"; | ||
1669 | break; | ||
1670 | } | ||
1671 | } | ||
1672 | void CRegEx::DumpStrip(CDumpContext& dc) | ||
1673 | { | ||
1674 | for(int tmp=0;tmp<m_Strip.GetSize();tmp++) | ||
1675 | dc << tmp << ": " << m_Strip[tmp] << ";\n"; | ||
1676 | } | ||
1677 | #endif | ||
1678 | |||
1679 | |||
1680 | CString CRegEx::GetMatch(int match) | ||
1681 | { | ||
1682 | CString rv; | ||
1683 | if(!m_Matches.GetSize()) | ||
1684 | return rv; | ||
1685 | ASSERT(m_Matches[0].m_Begin<m_Input.GetLength() && m_Matches[0].m_End<=m_Input.GetLength()); | ||
1686 | if(match==matchPreMatch) | ||
1687 | return m_Input.Left(m_Matches[0].m_Begin); | ||
1688 | if(match==matchPostMatch) | ||
1689 | return m_Input.Mid(m_Matches[0].m_End); | ||
1690 | if(match<0 || match >= m_Matches.GetSize()) | ||
1691 | return rv; | ||
1692 | if(m_Matches[match].m_Begin<0 || m_Matches[match].m_End<0) | ||
1693 | return rv; | ||
1694 | ASSERT(m_Matches[match].m_Begin<m_Input.GetLength() && m_Matches[match].m_End<=m_Input.GetLength()); | ||
1695 | rv = m_Input.Mid(m_Matches[match].m_Begin,m_Matches[match].m_End-m_Matches[match].m_Begin); | ||
1696 | return rv; | ||
1697 | } | ||