summaryrefslogtreecommitdiffabout
path: root/shared-code/RegEx.cpp
Unidiff
Diffstat (limited to 'shared-code/RegEx.cpp') (more/less context) (ignore whitespace changes)
-rw-r--r--shared-code/RegEx.cpp1697
1 files changed, 1697 insertions, 0 deletions
diff --git a/shared-code/RegEx.cpp b/shared-code/RegEx.cpp
new file mode 100644
index 0000000..b7bab62
--- a/dev/null
+++ b/shared-code/RegEx.cpp
@@ -0,0 +1,1697 @@
1#include "../stdafx.h"
2#include "RegEx.h"
3
4 #defineisWordableChar(c) (isalnum(c) || (c)=='_')
5
6BOOL CRegEx::Compile(LPCTSTR regex,int flags)
7{
8 ASSERT(!((flags&regExtended) && (flags&regLiteral)));
9 m_Matches.RemoveAll();
10 m_Strip.RemoveAll();
11 m_Strip.SetSize(0,15);
12 m_Pattern=regex;
13 m_ParsePointer=0;
14 m_Error=0;
15 m_Sets.RemoveAll();
16 m_Flags=flags;
17 m_iFlags=0;
18 m_BOLs=m_EOLs=0;
19 m_Subexps = 0;
20 m_Categories=1;// 0 is 'everything else'
21 m_bBackRefs=FALSE;
22 memset(m_Category,0,sizeof(m_Category));
23
24 // Go ahead.
25 m_Error || m_Strip.Add(CSop(CSop::opEnd));
26 if(flags&regExtended){
27 ParseERE();
28 }else if(flags&regLiteral){
29 ParseLiteral();
30 }else{
31 ParseBRE();
32 }
33 m_Error || m_Strip.Add(CSop(CSop::opEnd));
34 Categorize();
35 m_Strip.FreeExtra();
36 FigureMust();
37 m_Pluses=CountPluses();
38 if(m_iFlags&iflagsBad){
39 m_Error || (m_Error=regeAssert);
40 // ??? point to nuls? ;-)
41 }
42 // We may wish to free some memory here if we're erroneous (ie. m_Error..)
43 m_ParseParens.RemoveAll();
44#ifdef _DEBUG
45 if(m_Error){
46 CString tmp;
47 tmp.Format("RE: ParseError: %d\n",m_Error);
48 TRACE0(tmp);
49 }
50 //DumpStrip(afxDump);
51#endif
52 return (m_bCompiled=(!m_Error));
53}
54
55BOOL CRegEx::Match(LPCTSTR src,int flags)
56{
57 if(!m_bCompiled)
58 return FALSE;
59 if(m_iFlags&iflagsBad)
60 return FALSE;
61 m_Input=src;
62 m_mFlags=flags;
63 m_mPointer=m_Input;
64 m_mBegin=m_Input;
65 m_mEnd=&m_mBegin[m_Input.GetLength()];
66 ASSERT(m_mPointer<=m_mEnd);
67 m_Matches.RemoveAll();
68 if(!m_Must.IsEmpty()){
69 if(m_Input.Find(m_Must)<0)
70 return FALSE;
71 }
72 // Go ahead..
73int stripLen = m_Strip.GetSize();
74 m_mLastPos.SetSize(0);
75 for(int tmp=0;tmp<stripLen;tmp++)
76 m_Strip[tmp].m_MatchData=0;
77LPCTSTR beginp = m_mBegin;
78LPCTSTR endp;
79 for(;;){
80 endp = MatchFast(beginp);
81 if(!endp)
82 return FALSE;
83 if((m_mFlags&regNoSubExpressions) && !m_bBackRefs)
84 break;
85 ASSERT(m_cOldP);
86 for(;;){
87 endp = MatchSlow(m_cOldP,m_mEnd,1,stripLen-1);
88 if(endp)
89 break;
90 ASSERT(m_cOldP<m_mEnd);
91 m_cOldP++;
92 }
93 if((m_mFlags&regOneMatch) && !m_bBackRefs)
94 break;
95 // Oh, his, we want the subexpression..
96 m_Matches.SetSize(m_Subexps+1);
97 LPCTSTR dp;
98 if(!m_bBackRefs && !(m_mFlags&regBackRefs)){
99 dp = MatchDissect(m_cOldP,endp,1,stripLen-1);
100 }else{
101 if(m_Pluses>0 && !m_mLastPos.GetSize())
102 m_mLastPos.SetSize(m_Pluses);
103 dp = MatchBackRef(m_cOldP,endp,1,stripLen-1,0);
104 }
105 if(dp)
106 break;
107 // Uh.. oh.. we couldn't find a subexpression-level match
108 ASSERT(m_bBackRefs);
109 ASSERT(m_Pluses==0 || m_mLastPos.GetSize());
110 for(;;){
111 if(dp || endp <= m_cOldP)
112 break;// defeat.. ?
113 endp = MatchSlow(m_cOldP,endp-1,1,stripLen-1);
114 if(!endp)
115 break;// defeat.. ?
116 // Try it on a shorter possibility..
117#ifdef _DEBUG
118 for(tmp=1;tmp<=m_Subexps;tmp++)
119 ASSERT(m_Matches[tmp].m_Begin<0 && m_Matches[tmp].m_End<0);
120#endif
121 dp = MatchBackRef(m_cOldP,endp,1,stripLen-1,0);
122 }
123 ASSERT((!dp) || dp==endp);
124 if(dp)// Found a shorter one..
125 break;
126 // Despite initial appearances, there is no match here
127 beginp = m_cOldP+1;
128 ASSERT(beginp<=m_mEnd);
129 }
130 // Fill in the detail if so requested..
131 if(!(m_mFlags&regNoSubExpressions)){
132 if(!m_Matches.GetSize())
133 m_Matches.SetSize(1);
134 m_Matches[0].m_Begin=m_cOldP-m_mBegin;
135 m_Matches[0].m_End=endp-m_mBegin;
136 }
137 m_mLastPos.RemoveAll();
138 return TRUE;
139}
140
141CString CRegEx::Replace(LPCTSTR src,LPCTSTR rep,int flags)
142{
143 // ***
144 return CString();
145}
146
147void CRegEx::ParseERE(int stop)
148{
149UCHAR c;
150BOOL first=TRUE;
151int prevF, prevB;
152 for(;;){
153 int co = m_Strip.GetSize();
154 while(m_ParsePointer < m_Pattern.GetLength() && ((c=m_Pattern[m_ParsePointer])!='|') && c!=stop)
155 ParseEREexp();
156 if(m_Strip.GetSize()==co){
157 m_Error || (m_Error=regeEmpty);
158 // ??? point to nuls?
159 }
160 if(m_ParsePointer>=m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!='|')
161 break;
162 else
163 m_ParsePointer++;
164 if(first){
165 StripInsert(co,CSop(CSop::opChoice0,m_Strip.GetSize()-co+1));
166 prevF = prevB = co;
167 first=FALSE;
168 }
169 m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-prevB));
170 prevB = m_Strip.GetSize()-1;
171 m_Error || (m_Strip[prevF].m_Operand=m_Strip.GetSize()-prevF);
172 prevF = m_Strip.GetSize();
173 m_Error || m_Strip.Add(CSop(CSop::opOr1,0));// offset isn't really right.. very so..
174 }
175 if(!first){
176 m_Error || (m_Strip[prevF].m_Operand=m_Strip.GetSize()-prevF);
177 m_Error || m_Strip.Add(CSop(CSop::opChoice1,m_Strip.GetSize()-prevB));
178 }
179 ASSERT(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer]==stop);
180}
181
182void CRegEx::ParseEREexp()
183{
184 ASSERT(m_ParsePointer < m_Pattern.GetLength());
185UCHAR c = m_Pattern[m_ParsePointer++];
186int pos = m_Strip.GetSize();
187int subno;
188int count, count2;
189BOOL wascaret=FALSE;
190 switch(c){
191 case '(':
192 if(!(m_ParsePointer<m_Pattern.GetLength())){
193 TRACE0("RE: '(' at the end of the pattern\n");
194 if(!m_Error)
195 m_Error = regeParen;
196 // ??? point to nuls?
197 }
198 m_Subexps++;
199 subno=m_Subexps;
200 m_ParseParens.SetAtGrow(m_Subexps,CParenthesis(m_Strip.GetSize()));
201 m_Error || m_Strip.Add(CSop(CSop::opLeftParen,subno));
202 if(m_ParsePointer>=m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!=')')
203 ParseERE(')');
204 VERIFY(m_ParseParens[m_Subexps].m_End = m_Strip.GetSize());
205 m_Error || m_Strip.Add(CSop(CSop::opRightParen,subno));
206 if(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer++]!=')'){
207 TRACE0("RE: No matching ')'\n");
208 if(!m_Error)
209 m_Error = regeParen;
210 // ??? point to nuls?
211 }
212 break;
213 case '^':
214 m_Error || m_Strip.Add(CSop(CSop::opBOL));
215 m_iFlags|=iflagsUseBOL;
216 m_BOLs++;
217 wascaret=TRUE;
218 break;
219 case '$':
220 m_Error || m_Strip.Add(CSop(CSop::opEOL));
221 m_iFlags|=iflagsUseEOL;
222 m_EOLs++;
223 break;
224 case '|':
225 TRACE0("RE: '|' outside of expression\n");
226 if(!m_Error)
227 m_Error = regeEmpty;
228 // ??? point to nuls?
229 break;
230 case '*':
231 case '+':
232 case '?':
233 TRACE0("RE: '*'/'+'/'?' with no previous expression\n");
234 if(!m_Error)
235 m_Error = regeBadRepeat;
236 // ??? point to nuls?
237 break;
238 case '.':
239 if(m_Flags&regNewLine)
240 EmitNonNewLineAny();
241 else
242 m_Error || m_Strip.Add(CSop(CSop::opAny));
243 break;
244 case '[':
245 ParseBracket();
246 break;
247 case '\\':
248 if(m_ParsePointer >= m_Pattern.GetLength()){
249 TRACE0("RE: '\\' at the end of the pattern\n");
250 if(!m_Error)
251 m_Error = regeEscape;
252 // ??? point to nuls?
253 }else{
254 c = m_Pattern[m_ParsePointer++];
255 EmitOrdinary(c);
256 }
257 break;
258 case '{':
259 if(m_ParsePointer >= m_Pattern.GetLength() || !isdigit(m_Pattern[m_ParsePointer])){
260 TRACE0("RE: '{' with no repeat count\n");
261 if(!m_Error)
262 m_Error = regeBadRepeat;
263 // ??? point to nuls?
264 }
265 // Fallthrough..
266 default:
267 EmitOrdinary(c);
268 break;
269 }
270 if(m_ParsePointer >= m_Pattern.GetLength())
271 return;
272 c = m_Pattern[m_ParsePointer];
273 // Call a '{' repetition if followed by a digit
274 if (!(c=='*' || c=='+' || c=='?' || ( c=='{' && (m_ParsePointer+1) < m_Pattern.GetLength() && isdigit(m_Pattern[m_ParsePointer+1])) ))
275 return; // No repetitor - done.
276 m_ParsePointer++;
277 if(wascaret){
278 TRACE0("RE: repetitive '^' detected\n");
279 if(!m_Error)
280 m_Error = regeBadRepeat;
281 // ??? point to nuls?
282 }
283 switch(c){
284 case '*':// Implemeted as +?
285 // + expression
286 StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1));
287 m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos));
288 // ? expression
289 StripInsert(pos,CSop(CSop::opQuest0,m_Strip.GetSize()-pos+1));
290 m_Error || m_Strip.Add(CSop(CSop::opQuest1,m_Strip.GetSize()-pos));
291 break;
292 case '+':
293 // + expression
294 StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1));
295 m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos));
296 break;
297 case '?':
298 // Kludge - emit y? as (y|) until subtle bug gets fixed :-)
299 StripInsert(pos,CSop(CSop::opChoice0,m_Strip.GetSize()-pos+1));
300 m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-pos));
301 m_Error || (m_Strip[pos].m_Operand=m_Strip.GetSize()-pos);
302 m_Error || m_Strip.Add(CSop(CSop::opOr1,1));
303 m_Error || m_Strip.Add(CSop(CSop::opChoice1,2));
304 break;
305 case '{':
306 count = ParseCount();
307 if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]==','){
308 m_ParsePointer++;
309 if(isdigit(m_Pattern[m_ParsePointer])){ // HHH Personally, I doubt it is always available
310 count2=ParseCount();
311 if(!(count<=count2)){
312 TRACE0("RE: Disbalanced counts in '{}' repeater\n");
313 m_Error || (m_Error=regeBadBrace);
314 // ??? point to nuls?
315 }
316 }else // Single number with comma
317 count2=256; // Infinity
318 }else // Single number
319 count2=count;
320 EmitRepeat(pos,count,count2);
321 if(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!='}'){
322 // No '}'..
323 TRACE0("RE: No immediately following '}' of '{' expression\n");
324 while(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]!='}')
325 m_ParsePointer++;
326 if(m_ParsePointer >= m_Pattern.GetLength()){
327 TRACE0("RE: No closing '}' found\n");
328 m_Error || (m_Error=regeBrace);
329 }else
330 m_Error || (m_Error=regeBadBrace);
331 // ??? point to nuls?
332 }else
333 m_ParsePointer++;
334 break;
335 }
336 if(m_ParsePointer >= m_Pattern.GetLength())
337 return;
338 c = m_Pattern[m_ParsePointer];
339 if(!(c=='*' || c=='+' || c=='?' || (c=='{' && (m_ParsePointer+1)<m_Pattern.GetLength() && isdigit(m_Pattern[m_ParsePointer+1]))))
340 return;
341 TRACE0("RE: Double repeater\n");
342 m_Error || (m_Error=regeBadRepeat);
343 // ??? point to nuls?
344}
345
346void CRegEx::StripInsert(int pos,CSop& sop)
347{
348 if(m_Error)
349 return;
350int sn = m_Strip.GetSize();
351 m_Strip.InsertAt(pos,sop);
352 for(int tmp=1;tmp<m_ParseParens.GetSize();tmp++){
353 if(m_ParseParens[tmp].m_Begin>=pos)
354 m_ParseParens[tmp].m_Begin++;
355 if(m_ParseParens[tmp].m_End>=pos)
356 m_ParseParens[tmp].m_End++;
357 }
358}
359
360void CRegEx::EmitOrdinary(UCHAR c)
361{
362 if(m_Flags&regIgnoreCase && isalpha(c) && (tolower(c) !=toupper(c))){
363 // Emit both cases
364 CString savePattern = m_Pattern;
365 int savePointer = m_ParsePointer;
366 m_Pattern=c;
367 m_Pattern+=']';
368 m_ParsePointer=0;
369 ParseBracket();
370 m_Pattern=savePattern;
371 m_ParsePointer=savePointer;
372 }else{
373 m_Error || m_Strip.Add(CSop(CSop::opChar,c));
374 if(!m_Category[(BYTE)c])
375 m_Category[(BYTE)c]=m_Categories++;
376 }
377}
378
379void CRegEx::EmitNonNewLineAny()
380{
381 // Kludges're going on and on..
382CString savePattern = m_Pattern;
383int savePointer = m_ParsePointer;
384 m_Pattern="^\n]";
385 m_ParsePointer=0;
386 ParseBracket();
387 m_Pattern=savePattern;
388 m_ParsePointer=savePointer;
389}
390
391int CRegEx::ParseCount()
392{
393BOOL nonEmpty=FALSE;
394int rv = 0;
395UCHAR c;
396 while(m_ParsePointer < m_Pattern.GetLength() && isdigit(c=m_Pattern[m_ParsePointer]) && rv <=255){
397 rv = rv*10 + c-'0';
398 nonEmpty=TRUE;
399 m_ParsePointer++;
400 }
401 if(rv>255 || !nonEmpty){
402 m_Error || (m_Error=regeBadBrace);
403 // ??? point to nuls?
404 }
405 return rv;
406}
407
408void CRegEx::ParseBracket()
409{
410 // Dept. of truly sickening special case kludges
411 if((m_ParsePointer+5) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,6).Compare("[:<]]")){
412 m_Error || m_Strip.Add(CSop(CSop::opBOW));
413 m_ParsePointer+=6;
414 return;
415 }
416 if((m_ParsePointer+5) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,6).Compare("[:>]]")){
417 m_Error || m_Strip.Add(CSop(CSop::opEOW));
418 m_ParsePointer+=6;
419 return;
420 }
421BOOL invert=TRUE;
422 if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='^')
423 m_ParsePointer++;
424 else
425 invert=FALSE;
426CSet cset;
427 if(m_ParsePointer < m_Pattern.GetLength()){
428 switch(m_Pattern[m_ParsePointer]){
429 case ']':
430 case '-':
431 cset.Add(m_Pattern[m_ParsePointer]);
432 m_ParsePointer++;
433 break;
434 }
435 }
436 while(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]!=']' && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("-]")))
437 ParseBracketTerm(cset);
438 if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='-'){
439 m_ParsePointer++;
440 cset.Add('-');
441 }
442 if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]==']')
443 m_ParsePointer++;
444 else{
445 m_Error || (m_Error=regeBracket);
446 // ??? point to nuls?
447 return;
448 }
449 if(m_Flags&regIgnoreCase){
450 for(int tmp=CSet::size-1;tmp>=0;tmp--){
451 if(cset.IsIn(tmp) && isalpha(tmp) && (toupper(tmp)!=tolower(tmp)))
452 cset.Add(isupper(tmp)?tolower(tmp):toupper(tmp));
453 }
454 /*
455 if(!cset->m_Multis.IsEmpty())
456 cset.CollatingCase();
457 */
458 }
459 if(invert){
460 for(int tmp=CSet::size-1;tmp>=0;tmp--)
461 if(cset.IsIn(tmp))
462 cset.Sub(tmp);
463 else
464 cset.Add(tmp);
465 if(m_Flags&regNewLine)
466 cset.Sub('\n');
467 /*
468 if(!cset.m_Multis.IsEmpty())
469 cset.CollatingInvert();
470 */
471 }
472UCHAR c = cset.GetOnly();
473 if(c){
474 EmitOrdinary(c);
475 }else
476 m_Error || m_Strip.Add(CSop(CSop::opAnyOf,StoreSet(cset)));
477}
478
479void CRegEx::CSet::Add(UCHAR c)
480{
481 m_Set[(BYTE)c]=TRUE;
482 m_Hash+=c;
483}
484
485BOOL CRegEx::CSet::IsIn(UCHAR c)
486{
487 return m_Set[(BYTE)c];
488}
489
490void CRegEx::CSet::Sub(UCHAR c)
491{
492 m_Set[(BYTE)c]=FALSE;
493 m_Hash-=c;
494}
495
496UCHAR CRegEx::CSet::GetOnly()
497{
498int rv = 0;
499UCHAR only = 0;
500 for(int tmp=0;tmp<size;tmp++){
501 rv+=m_Set[tmp]?(only=tmp,1):0;
502 }
503 return (rv==1)?only:0;
504}
505
506int CRegEx::StoreSet(CSet& cset)
507{
508 for(int tmp=0;tmp<m_Sets.GetSize();tmp++)
509 if(m_Sets[tmp]==cset)
510 return tmp;
511 return m_Sets.Add(cset);
512}
513
514void CRegEx::ParseBracketTerm(CSet& cset)
515{
516UCHAR c;
517 switch((m_ParsePointer<m_Pattern.GetLength())?m_Pattern[m_ParsePointer]:0){
518 case '[':
519 c = ((m_ParsePointer+1)<m_Pattern.GetLength())?m_Pattern[m_ParsePointer+1]:0;
520 break;
521 case '-':
522 m_Error || (m_Error=regeRange);
523 // ??? point to nuls?
524 return;
525 default:
526 c = 0;
527 break;
528 }
529 switch(c){
530 case ':':// Character class
531 m_ParsePointer+=2;
532 if(m_ParsePointer>=m_Pattern.GetLength()){
533 m_Error || (m_Error=regeBracket);
534 // ??? point to nuls?
535 }
536 c = m_Pattern[m_ParsePointer];
537 if(c== '-' || c==']'){
538 m_Error || (m_Error=regeCType);
539 // ??? point to nuls?
540 }
541 ParseBracketCClass(cset);
542 if(m_ParsePointer>=m_Pattern.GetLength()){
543 m_Error || (m_Error=regeBracket);
544 // ??? point to nuls?
545 }
546 if((m_ParsePointer+1)>=m_Pattern.GetLength() || (m_Pattern.Mid(m_ParsePointer,2).Compare(":]"))){
547 m_Error || (m_Error=regeCType);
548 // ??? point to nuls?
549 }else
550 m_ParsePointer+=2;
551 break;
552 case '=':// Equivalence class
553 m_ParsePointer+=2;
554 if(m_ParsePointer >= m_Pattern.GetLength()){
555 m_Error || (m_Error=regeBracket);
556 // ??? point to nuls?
557 }
558 c = m_Pattern[m_ParsePointer];
559 if(c== '-' || c==']'){
560 m_Error || (m_Error=regeCollate);
561 // ??? point to nuls?
562 }
563 ParseBracketEClass(cset);
564 if((m_ParsePointer+1)>=m_Pattern.GetLength() || (m_Pattern.Mid(m_ParsePointer,2).Compare("=]"))){
565 m_Error || (m_Error=regeCollate);
566 // ??? point to nuls?
567 }else
568 m_ParsePointer+=2;
569 break;
570 default:// Symbol, character or range
571 {
572 UCHAR start, finish;
573 start = ParseBracketSymbol();
574 if((m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='-') /*&& (m_ParsePointer+1)<m_Pattern.GetLength() && m_Pattern[m_ParsePointer+1]==']'*/){
575 // I believe the expression above is seetwo..
576 // range.
577 m_ParsePointer++;
578 if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='-'){
579 m_ParsePointer++;
580 finish='-';
581 }else
582 finish=ParseBracketSymbol();
583 }else
584 finish=start;
585 if(((BYTE)start)>((BYTE)finish)){
586 m_Error || (m_Error=regeRange);
587 // ??? point to nuls?
588 }
589 for(int tmp=start;tmp<=(BYTE)finish;tmp++)
590 cset.Add(tmp);
591 }
592 break;
593 }
594}
595
596void CRegEx::ParseBracketCClass(CSet& cset)
597{
598 static struct{
599 char *className;
600 char *classChars;
601 }cc[] = {
602 {"alnum","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"},
603 {"alpha","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"},
604 {"blank"," \t"},
605 {"cntrl","\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\25\26\27\30\31\32\33\34\35\36\37\177"},
606 {"digit","0123456789"},
607 {"graph","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"},
608 {"lower","abcdefghijklmnopqrstuvwxyz"},
609 {"print","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ "},
610 {"punct","!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"},
611 {"space","\t\n\v\f\r "},
612 {"upper","ABCDEFGHIJKLMNOPQRSTUVWXYZ"},
613 {"xdigit","0123456789ABCDEFabcdef"}
614};
615CString cclass;
616UCHAR c;
617 while(m_ParsePointer < m_Pattern.GetLength() && isalpha(c=m_Pattern[m_ParsePointer])){
618 cclass+=c;
619 m_ParsePointer++;
620 }
621char *classChars = NULL;
622 for(int tmp=0;tmp<(sizeof(cc)/sizeof(cc[0]));tmp++){
623 if(!cclass.CompareNoCase(cc[tmp].className)){
624 classChars=cc[tmp].classChars;
625 break;
626 }
627 }
628 if(!classChars){
629 m_Error || (m_Error=regeCType);
630 // ??? point to nuls?
631 return;
632 }
633 while(*classChars)
634 cset.Add(*(classChars++));
635 // --- multis
636}
637
638void CRegEx::ParseBracketEClass(CSet& cset)
639{
640 cset.Add(ParseBracketCollatingElement('='));;
641}
642
643UCHAR CRegEx::ParseBracketCollatingElement(UCHAR term)
644{
645 static struct{
646 char *entityName;
647 char entity;
648 }cc[] = { {"NUL",'\0'},{"SOH",'\001'},{"STX",'\002'},{"ETX",'\003'},{"EOT",'\004'},{"ENQ",'\005'},{"ACK",'\006'},{"BEL",'\007'},{"alert",'\007'},{"BS",'\010'},{"backspace",'\b'},{"HT",'\011'},{"tab",'\t'},{"LF",'\012'},{"newline",'\n'},{"VT",'\013'},{"vertical-tab",'\v'},{"FF",'\014'},{"form-feed",'\f'},{"CR",'\015'},{"carriage-return",'\r'},{"SO",'\016'},{"SI",'\017'},{"DLE",'\020'},{"DC1",'\021'},{"DC2",'\022'},{"DC3",'\023'},{"DC4",'\024'},{"NAK",'\025'},{"SYN",'\026'},{"ETB",'\027'},{"CAN",'\030'},{"EM",'\031'},{"SUB",'\032'},{"ESC",'\033'},{"IS4",'\034'},{"FS",'\034'},{"IS3",'\035'},{"GS",'\035'},{"IS2",'\036'},{"RS",'\036'},{"IS1",'\037'},{"US",'\037'},{"space",' '},{"exclamation-mark",'!'},{"quotation-mark",'"'},{"number-sign",'#'},{"dollar-sign",'$'},{"percent-sign",'%'},{"ampersand",'&'},{"apostrophe",'\''},{"left-parenthesis",'('},{"right-parenthesis",')'},{"asterisk",'*'},{"plus-sign",'+'},{"comma",','},{"hyphen",'-'},{"hyphen-minus",'-'},{"period",'.'},{"full-stop",'.'},{"slash",'/'},{"solidus",'/'},{"zero",'0'},{"one",'1'},{"two",'2'},{"three",'3'},{"four",'4'},{"five",'5'},{"six",'6'},{"seven",'7'},{"eight",'8'},{"nine",'9'},{"colon",':'},{"semicolon",';'},{"less-than-sign",'<'},{"equals-sign",'='},{"greater-than-sign",'>'},{"question-mark",'?'},{"commercial-at",'@'},{"left-square-bracket",'['},{"backslash",'\\'},{"reverse-solidus",'\\'},{"right-square-bracket",']'},{"circumflex",'^'},{"circumflex-accent",'^'},{"underscore",'_'},{"low-line",'_'},{"grave-accent",'`'},{"left-brace",'{'},{"left-curly-bracket",'{'},{"vertical-line",'|'},{"right-brace",'}'},{"right-curly-bracket",'}'},{"tilde",'~'},{"DEL",'\177'} };
649CString seeTwo;
650 seeTwo=term;
651 seeTwo+=']';
652CString entityName;
653 while(m_ParsePointer<m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare(seeTwo)))
654 entityName+=m_Pattern[m_ParsePointer++];
655 if(m_ParsePointer>=m_Pattern.GetLength()){
656 m_Error || (m_Error=regeBracket);
657 // ??? point to nuls?
658 return 0;
659 }
660 for(int tmp=0;tmp<(sizeof(cc)/sizeof(cc[0]));tmp++)
661 if(!entityName.CompareNoCase(cc[tmp].entityName))
662 return cc[tmp].entity;
663 if(entityName.GetLength()==1)
664 return entityName[0];
665 m_Error || (m_Error=regeCollate);
666 // ??? point to nuls?
667 return 0;
668}
669
670UCHAR CRegEx::ParseBracketSymbol()
671{
672 if(m_ParsePointer>=m_Pattern.GetLength()){
673 m_Error || (m_Error=regeBracket);
674 // ??? point to nuls?
675 }
676 if((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("[."))
677 m_ParsePointer+=2;
678 else
679 return m_Pattern[m_ParsePointer++];
680 // Collating symbol
681UCHAR rv = ParseBracketCollatingElement('.');
682 if((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("[."))
683 m_ParsePointer+=2;
684 else{
685 m_Error || (m_Error=regeCollate);
686 // ??? point to nuls?
687 }
688 return rv;
689}
690
691void CRegEx::EmitRepeat(int pos,int from,int to)
692{
693 if(m_Error)
694 return;
695 ASSERT(from<=to);
696int finish = m_Strip.GetSize();
697int copy;
698 #defineN 2
699#define INF 3
700#define REP(f,t) ((f)*8+(t))
701#define MAP(n) (((n)<=1)?(n):((n)==256)?INF:N)
702 switch(REP(MAP(from),MAP(to))){
703 case REP(0,0):// must be user doing ths??
704 m_Strip.SetSize(pos);
705 break;
706 case REP(0,1):// as in '?'
707 case REP(0,N):// as in '{1,n}?'
708 case REP(0,INF):// as in '{1,}?'
709 // Kludge - emit y? as (y|) until something gets fixed..
710 StripInsert(pos,CSop(CSop::opChoice0,pos));
711 EmitRepeat(pos+1,1,to);
712 m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-pos));
713 m_Error || (m_Strip[pos].m_Operand=m_Strip.GetSize()-pos);
714 m_Error || m_Strip.Add(CSop(CSop::opOr1,1));
715 m_Error || m_Strip.Add(CSop(CSop::opChoice1,2));
716 break;
717 case REP(1,1):
718 break;
719 case REP(1,N):// as in 'x?x{1,n-1}'
720 // Kludge again..
721 StripInsert(pos,CSop(CSop::opChoice0,pos));
722 m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-pos));
723 m_Error || (m_Strip[pos].m_Operand=m_Strip.GetSize()-pos);
724 m_Error || m_Strip.Add(CSop(CSop::opOr1,1));
725 m_Error || m_Strip.Add(CSop(CSop::opChoice1,2));
726 copy = StripDuplicate(pos+1,finish+1);
727 ASSERT(copy==(finish+4));
728 EmitRepeat(copy,1,to-1);
729 break;
730 case REP(1,INF): // as in '+'
731 StripInsert(pos,CSop(CSop::opPlus0,pos));
732 m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos));
733 break;
734 case REP(N,N):// as in 'xx{from-1,to-1}'
735 copy = StripDuplicate(pos,finish);
736 EmitRepeat(copy,from-1,to-1);
737 break;
738 case REP(N,INF): // as in 'xx{n-1,}'
739 copy = StripDuplicate(pos,finish);
740 EmitRepeat(copy,from-1,to);
741 break;
742#ifndef NDEBUG
743 default:
744 ASSERT(FALSE);
745 break;
746#endif
747 }
748#undef MAP
749#undef REP
750#undef INF
751#undef N
752}
753
754int CRegEx::StripDuplicate(int from,int to)
755{
756int rv = m_Strip.GetSize();
757 ASSERT(from<=to);
758 if(from==to)
759 return rv;
760 // Maybe should be optimized for copying the whole thing.
761 for(int tmp=from;tmp<to;tmp++)
762 m_Strip.Add(m_Strip[tmp]);
763 return rv;
764}
765
766void CRegEx::Categorize()
767{
768 if(m_Error)
769 return;
770 for(int tmp=0;tmp<(sizeof(m_Category)/sizeof(m_Category[0]));tmp++)
771 if((!m_Category[tmp]) && IsInSets(tmp)){
772 int cat = m_Categories++;
773 m_Category[tmp]=cat;
774 for(int c=tmp+1;c<(sizeof(m_Category)/sizeof(m_Category[0]));c++)
775 if((!m_Category[c]) && IsInSameSets(tmp,c))
776 m_Category[c]=cat;
777 }
778}
779
780BOOL CRegEx::IsInSets(UCHAR c)
781{
782 for(int tmp=0;tmp<m_Sets.GetSize();tmp++)
783 if(m_Sets[tmp].IsIn(c))
784 return TRUE;
785 return FALSE;
786}
787
788BOOL CRegEx::IsInSameSets(UCHAR c1,UCHAR c2)
789{
790 for(int tmp=0;tmp<m_Sets.GetSize();tmp++)
791 if(m_Sets[tmp].IsIn(c1)!=m_Sets[tmp].IsIn(c2))
792 return FALSE;
793 return TRUE;
794}
795
796void CRegEx::FigureMust()
797{
798 if(m_Error)
799 return;
800 m_Must.Empty();
801int stripLen = m_Strip.GetSize();
802int seqStart, seqLength = 0;
803int mustStart, mustLength = 0;
804 for(int tmp=1;tmp<stripLen;tmp++){
805 switch(m_Strip[tmp].m_Operator){
806 case CSop::opChar:
807 if(!seqLength)
808 seqStart=tmp;
809 seqLength++;
810 break;
811 case CSop::opPlus0:
812 case CSop::opLeftParen:
813 case CSop::opRightParen:
814 break;// Break, since they don't break the sequence
815 case CSop::opQuest0:
816 case CSop::opChoice0:
817 // These ones we skip.
818 do{
819 tmp+=m_Strip[tmp].m_Operand;
820 // I still think it could be ASSERTed..
821 if(m_Strip[tmp].m_Operator!=CSop::opQuest1 && m_Strip[tmp].m_Operator!=CSop::opChoice1 && m_Strip[tmp].m_Operator!=CSop::opOr1){
822 m_iFlags|=iflagsBad;
823 return;
824 }
825 }while(m_Strip[tmp].m_Operator!=CSop::opQuest1 && m_Strip[tmp].m_Operator!=CSop::opChoice1);
826 // Fallthrough..
827 default:
828 // End of sequence
829 if(seqLength>mustLength){
830 mustStart=seqStart;
831 mustLength=seqLength;
832 }
833 seqLength=0;
834 break;
835 }
836 }// Hmm.. originally it's meant to be do while not opEnd..
837 if(!mustLength)
838 return;
839 // Turn into string, but, wait, personally I'm sure it could be put in the main loop.. or maybe not..
840 for(tmp=0;tmp<seqLength;tmp++){
841 while(m_Strip[seqStart+tmp].m_Operator!=CSop::opChar)
842 ASSERT(tmp<seqLength);
843 m_Must+=m_Strip[tmp].m_Operand;
844 }
845}
846
847int CRegEx::CountPluses()
848{
849 if(m_Error)
850 return 0;
851int stripLen = m_Strip.GetSize();
852int rv = 0;
853int nest = 0;
854 for(int tmp=0;tmp<stripLen;tmp++){
855 switch(m_Strip[tmp].m_Operator){
856 case CSop::opPlus0:
857 nest++;
858 break;
859 case CSop::opPlus1:
860 if(nest>rv)
861 rv = nest;
862 nest--;
863 break;
864 }
865 }// Again, originally we were supposed to scan till opEnd..
866 if(nest)
867 m_iFlags|=iflagsBad;// Could this be an ASSERTion?
868 return rv;
869}
870
871void CRegEx::ParseLiteral()
872{
873 if(!m_Pattern.GetLength()){
874 m_Error || (m_Error=regeEmpty);
875 // ??? point to nuls?
876 }
877 while(m_ParsePointer < m_Pattern.GetLength())
878 EmitOrdinary(m_Pattern[m_ParsePointer++]);
879}
880
881void CRegEx::ParseBRE(int stopa,int stopb)
882{
883int start = m_Strip.GetSize();
884BOOL first=TRUE;
885BOOL wasDollar=FALSE;
886 if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='^'){
887 m_ParsePointer++;
888 m_Error || m_Strip.Add(CSop(CSop::opBOL,0));
889 m_iFlags|=iflagsUseBOL;
890 m_BOLs++;
891 }
892CString stopstr;
893 if(stopa){
894 stopstr+=stopa;
895 if(stopb)
896 stopstr+=stopb;
897 }
898 while(m_ParsePointer < m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare(stopstr))){
899 wasDollar = ParseBREexp(first);
900 first=FALSE;
901 }
902 if(wasDollar){// Trailing anchor that was..
903 m_Strip.SetSize(m_Strip.GetSize()-1);
904 m_Error || m_Strip.Add(CSop(CSop::opEOL,0));
905 m_iFlags|=iflagsUseEOL;
906 m_EOLs++;
907 }
908 if(m_Strip.GetSize()==start){
909 m_Error || (m_Error=regeEmpty);
910 // ??? point to nuls?
911 }
912}
913
914BOOL CRegEx::ParseBREexp(BOOL ordinaryStar)
915{
916int subno;
917int pos = m_Strip.GetSize();
918 ASSERT(m_ParsePointer<m_Pattern.GetLength());
919int c = m_Pattern[m_ParsePointer++];
920 if(c=='\\'){
921 if(m_ParsePointer>=m_Pattern.GetLength()){
922 m_Error || (m_Error=regeEscape);
923 // ??? point to nuls
924 }else
925 c = 0x100|m_Pattern[m_ParsePointer++];
926 }
927 switch(c){
928 case '.':
929 if(m_Flags&regNewLine)
930 EmitNonNewLineAny();
931 else
932 m_Error || m_Strip.Add(CSop(CSop::opAny,0));
933 break;
934 case '[':
935 ParseBracket();
936 break;
937 case 0x100|'{':
938 m_Error || (m_Error=regeBadRepeat);
939 // ??? point to nuls?
940 break;
941 case 0x100|'(':
942 m_Subexps++;
943 subno=m_Subexps;
944 m_ParseParens.SetAtGrow(m_Subexps,CParenthesis(m_Strip.GetSize()));
945 m_Error || m_Strip.Add(CSop(CSop::opLeftParen,subno));
946 if(m_ParsePointer<m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\)")))
947 ParseBRE('\\',')');
948 VERIFY(m_ParseParens[m_Subexps].m_End = m_Strip.GetSize());
949 m_Error || m_Strip.Add(CSop(CSop::opRightParen,subno));
950 if((m_ParsePointer+1) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\)"))
951 m_ParsePointer+=2;
952 else{
953 m_Error || (m_Error=regeParen);
954 // ??? point to nuls?
955 }
956 break;
957 case 0x100|')':
958 case 0x100|'}':
959 // Can this possibly happen?!
960 m_Error || (m_Error=regeParen);
961 // ??? point to nuls?
962 break;
963 case 0x100|'1':
964 case 0x100|'2':
965 case 0x100|'3':
966 case 0x100|'4':
967 case 0x100|'5':
968 case 0x100|'6':
969 case 0x100|'7':
970 case 0x100|'8':
971 case 0x100|'9':
972 {
973 int i = (c&0xFF)-'0';
974 if(i < m_ParseParens.GetSize() && m_ParseParens[i].m_End){
975 m_Error || m_Strip.Add(CSop(CSop::opBackRef0,i));
976 ASSERT(m_ParseParens[i].m_Begin);
977 ASSERT(m_Strip[m_ParseParens[i].m_Begin].m_Operator==CSop::opLeftParen);
978 ASSERT(m_Strip[m_ParseParens[i].m_End].m_Operator==CSop::opRightParen);
979 StripDuplicate(m_ParseParens[i].m_Begin+1,m_ParseParens[i].m_End);
980 m_Error || m_Strip.Add(CSop(CSop::opBackRef1,i));
981 }else{
982 m_Error || (m_Error=regeSubReg);
983 // ??? point to nuls?
984 }
985 m_bBackRefs=TRUE;
986 }
987 break;
988 case '*':
989 if(!ordinaryStar){
990 m_Error || (m_Error=regeBadRepeat);
991 // ??? point to nuls?
992 }
993 // Fallthrough..
994 default:
995 EmitOrdinary(c&0xFF);
996 break;
997 }
998 if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='*'){
999 m_ParsePointer++;
1000 // as in '+?'
1001 StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1));
1002 m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos));
1003 StripInsert(pos,CSop(CSop::opQuest0,m_Strip.GetSize()-pos+1));
1004 m_Error || m_Strip.Add(CSop(CSop::opQuest1,m_Strip.GetSize()-pos));
1005 }else if ((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\{")){
1006 m_ParsePointer+=2;
1007 int count = ParseCount();
1008 int count2;
1009 if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]==','){
1010 m_ParsePointer++;
1011 if(m_ParsePointer<m_Pattern.GetLength() && isdigit(m_Pattern[m_ParsePointer])){
1012 count2=ParseCount();
1013 if(count>count2){
1014 m_Error || (m_Error=regeBadBrace);
1015 // ??? poin to nuls?
1016 }
1017 }else // Single number with comma
1018 count2=256;
1019 }else // Single number
1020 count2=count;
1021 EmitRepeat(pos,count,count2);
1022 if((m_ParsePointer+1)>=m_Pattern.GetLength() || m_Pattern.Mid(m_ParsePointer,2).Compare("\\}")){
1023 while(m_ParsePointer<m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\}")))
1024 m_ParsePointer++;
1025 if(m_ParsePointer>=m_Pattern.GetLength()){
1026 m_Error || (m_Error=regeBrace);
1027 // ??? point to nuls?
1028 }
1029 m_Error || (m_Error=regeBadBrace);
1030 }else
1031 m_ParsePointer+=2;
1032 }else if(c=='$')
1033 return TRUE;
1034 return FALSE;
1035}
1036
1037CRegEx::CRegEx()
1038{
1039 m_bCompiled=FALSE;
1040}
1041
1042LPCTSTR CRegEx::MatchFast(LPCTSTR begin)
1043{
1044 MatchStatesClear(CSop::stCurrent);
1045 m_Strip[1].m_MatchData|=CSop::stCurrent;
1046int stripLen = m_Strip.GetSize();
1047 MatchStep(1,stripLen-1,CSop::stCurrent,charNothing,CSop::stCurrent);
1048 MatchStatesCopy(CSop::stFresh,CSop::stCurrent);
1049LPCTSTR coldp = NULL;
1050LPCTSTR p = begin;
1051int c = (begin==m_mBegin)?charOut:((int)(BYTE)m_mPointer[-1]);
1052 for(;;){
1053 // next character..
1054 int lastc = c;
1055 c = (p==m_mEnd)?charOut:(int)*(BYTE*)p;
1056 if(MatchStatesEqual(CSop::stCurrent,CSop::stFresh))
1057 coldp=p;
1058 // Is there an EOL and/or BOL between lastc and c? - they ask..
1059 intflagc=0;
1060 inti = 0;
1061 if((lastc=='\n' && m_Flags&regNewLine) || (lastc==charOut && !(m_mFlags&regNotBOL))){
1062 flagc=charBOL;
1063 i=m_BOLs;
1064 }
1065 if((c=='\n' && m_Flags&regNewLine) || (c==charOut && !(m_mFlags&regNotEOL))){
1066 flagc=(flagc==charBOL)?charBOLEOL:charEOL;
1067 i+=m_EOLs;
1068 }
1069 if(i){
1070 for(;i>0;i--)
1071 MatchStep(1,stripLen-1,CSop::stCurrent,flagc,CSop::stCurrent);
1072 }
1073 // What about a word boundary? - they wonder..
1074 if((flagc==charBOL || (lastc!=charOut && !isWordableChar(c))) && (c!=charOut && isWordableChar(c)))
1075 flagc = charBOW;
1076 if((lastc!=charOut && isWordableChar(lastc)) && (flagc==charEOL || (c!=charOut && !isWordableChar(c))))
1077 flagc = charEOW;
1078 if(flagc==charBOW || flagc==charEOW){
1079 MatchStep(1,stripLen-1,CSop::stCurrent,flagc,CSop::stCurrent);
1080 }
1081 // Are we done? Now WE wonder..
1082 if((m_Strip[stripLen-1].m_MatchData&CSop::stCurrent) || p==m_mEnd)
1083 break;// They insist I need to note break out.. Okay, I do..
1084 // Nope, we're not done. We have to face this character..
1085 MatchStatesCopy(CSop::stTemp,CSop::stCurrent);
1086 MatchStatesCopy(CSop::stCurrent,CSop::stFresh);
1087 ASSERT(c!=charOut);
1088 MatchStep(1,stripLen-1,CSop::stTemp,c,CSop::stCurrent);
1089 p++;
1090 }
1091 ASSERT(coldp);
1092 m_cOldP=coldp;// *** I believe this variable can be changed 'in-place'
1093 if(m_Strip[stripLen-1].m_MatchData&CSop::stCurrent)
1094 return &p[1];
1095 else
1096 return NULL;
1097}
1098
1099void CRegEx::MatchStatesClear(BYTE mask)
1100{
1101int stripLen = m_Strip.GetSize();
1102 for(int tmp=0;tmp<stripLen;tmp++)
1103 m_Strip[tmp].m_MatchData&=~mask;
1104}
1105
1106void CRegEx::MatchStep(int from,int to,BYTE maskBefore,int charCode,BYTE maskAfter)
1107{
1108BOOL i;
1109int look;
1110int here = from;
1111 for(int pc=from;pc!=to;pc++,here++){
1112 CSop s=m_Strip[pc];
1113 switch(s.m_Operator){
1114 case CSop::opEnd:
1115 ASSERT(pc==(to-1));
1116 break;
1117 case CSop::opChar:
1118 // Only characters can match..
1119 ASSERT((charCode<charOut) || charCode!=s.m_Operand);
1120 if(charCode==s.m_Operand)
1121 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
1122 break;
1123 case CSop::opBOL:
1124 if(charCode==charBOL || charCode==charBOLEOL)
1125 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
1126 break;
1127 case CSop::opEOL:
1128 if(charCode==charEOL || charCode==charBOLEOL)
1129 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
1130 break;
1131 case CSop::opBOW:
1132 if(charCode==charBOW)
1133 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
1134 break;
1135 case CSop::opEOW:
1136 if(charCode==charEOW)
1137 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
1138 break;
1139 case CSop::opAny:
1140 if(charCode<charOut)
1141 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
1142 break;
1143 case CSop::opAnyOf:
1144 if(charCode<charOut && m_Sets[s.m_Operand].m_Set[charCode])
1145 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
1146 break;
1147 case CSop::opBackRef0:// Ignored here..
1148 case CSop::opBackRef1:
1149 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
1150 break;
1151 case CSop::opPlus0:
1152 // Forward, this is just an empty, comment says..
1153 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
1154 break;
1155 case CSop::opPlus1:
1156 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
1157 i =(m_Strip[here-s.m_Operand].m_MatchData&maskAfter)!=0;
1158 m_Strip[here-s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
1159 if(!i && (m_Strip[here-s.m_Operand].m_MatchData&maskAfter)){
1160 // oho, must reconsider loop body, comment says.. what's so 'oho' about it?
1161 pc-=s.m_Operand+1;
1162 here=pc;
1163 }
1164 break;
1165 case CSop::opQuest0:
1166 // two branches, both forward..
1167 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
1168 m_Strip[here+s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
1169 break;
1170 case CSop::opQuest1:
1171 // just an empty.. aren't we tired of justanempties?
1172 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
1173 break;
1174 case CSop::opLeftParen: // they say it's not significan there..
1175 case CSop::opRightParen:
1176 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
1177 break;
1178 case CSop::opChoice0:// mark the first two branches..
1179 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
1180 ASSERT(m_Strip[pc+s.m_Operand].m_Operator==CSop::opOr1);
1181 m_Strip[here+s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
1182 break;
1183 case CSop::opOr0:// done a branch, find the end of choice..
1184 if(m_Strip[here].m_MatchData&maskAfter){
1185 for(look=1;(s=m_Strip[pc+look]).m_Operator!=CSop::opChoice1;look+=s.m_Operand)
1186 ASSERT(s.m_Operator==CSop::opOr1);
1187 m_Strip[here+look].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
1188 }
1189 break;
1190 case CSop::opOr1: // Propagate Choice's marking..
1191 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
1192 if(m_Strip[pc+s.m_Operand].m_Operator!=CSop::opChoice1){
1193 ASSERT(m_Strip[pc+s.m_Operand].m_Operator==CSop::opOr1);
1194 m_Strip[here+s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
1195 }
1196 break;
1197 case CSop::opChoice1: // Just empty.. :-)..
1198 m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
1199 break;
1200#ifdef _DEBUG
1201 default:
1202 ASSERT(FALSE);
1203 break;
1204#endif
1205 }
1206 }
1207}
1208
1209void CRegEx::MatchStatesCopy(BYTE dst,BYTE src)
1210{
1211int stripLen = m_Strip.GetSize();
1212 for(int tmp=0;tmp<stripLen;tmp++){
1213 // I believe this can be optimized, easily..
1214 m_Strip[tmp].m_MatchData&=~dst;
1215 m_Strip[tmp].m_MatchData|=(m_Strip[tmp].m_MatchData&src)?dst:0;
1216 }
1217}
1218
1219BOOL CRegEx::MatchStatesEqual(BYTE m1,BYTE m2)
1220{
1221int stripLen = m_Strip.GetSize();
1222 for(int tmp=0;tmp<stripLen;tmp++){
1223 BYTE mm = m_Strip[tmp].m_MatchData;
1224 if(((mm&m1) && (mm&m2)) || !(mm&(m1|m2)))
1225 continue;
1226 return FALSE;
1227 }
1228 return TRUE;
1229}
1230
1231LPCTSTR CRegEx::MatchSlow(LPCTSTR begin,LPCTSTR end,int from,int to)
1232{
1233 MatchStatesClear(CSop::stCurrent);
1234 m_Strip[from].m_MatchData|=CSop::stCurrent;
1235 MatchStep(from,to,CSop::stCurrent,charNothing,CSop::stCurrent);
1236LPCTSTR mp = NULL;
1237int c = (m_mBegin==m_mPointer)?charOut:(int)(BYTE)begin[-1];
1238LPCTSTR p = begin;
1239 for(;;){
1240 // next character..
1241 int lastc = c;
1242 c = (p==m_mEnd)?charOut:(int)*(BYTE*)p;
1243 // Now we start to wonder if there is an EOL and/or BOL between lastc and c
1244 int flagc = 0;
1245 int i = 0;
1246 if((lastc=='\n' && m_Flags&regNewLine) || (lastc==charOut && !(m_mFlags&regNotBOL))){
1247 flagc = charBOL;
1248 i = m_BOLs;
1249 }
1250 if((c=='\n' && m_Flags&regNewLine) || (c==charOut && !(m_mFlags&regNotEOL))){
1251 flagc = (flagc==charBOL)?charBOLEOL:charEOL;
1252 i+=m_EOLs;
1253 }
1254 if(i){
1255 for(;i>0;i--)
1256 MatchStep(from,to,CSop::stCurrent,flagc,CSop::stCurrent);
1257 }
1258 // Now we wonder about word boundaries..
1259 if((flagc==charBOL || (lastc!=charOut && !isWordableChar(lastc))) && (c!=charOut && isWordableChar(c)))
1260 flagc=charBOW;
1261 if((lastc!=charOut && isWordableChar(lastc)) && (flagc==charEOL || (c!=charOut && !isWordableChar(c))))
1262 flagc=charEOW;
1263 if(flagc==charBOW || flagc==charEOW){
1264 MatchStep(from,to,CSop::stCurrent,flagc,CSop::stCurrent);
1265 }
1266 // Are we done we all wonder??
1267 if(m_Strip[to].m_MatchData&CSop::stCurrent)
1268 mp=p;
1269 if(MatchStatesEqual(CSop::stCurrent,CSop::stEmpty) || p==end)
1270 break;// Again, we're obliged to note break out. We do.
1271 // Sucks.. we have to face this character..
1272 MatchStatesCopy(CSop::stTemp,CSop::stCurrent);
1273 MatchStatesCopy(CSop::stCurrent,CSop::stEmpty);
1274 ASSERT(c!=charOut);
1275 MatchStep(from,to,CSop::stTemp,c,CSop::stCurrent);
1276 MatchStep(from,to,CSop::stCurrent,charNothing,CSop::stCurrent);
1277 p++;
1278 }
1279 return mp;
1280}
1281
1282LPCTSTR CRegEx::MatchDissect(LPCTSTR begin,LPCTSTR end,int from,int to)
1283{
1284LPCTSTR sp = begin, dp;
1285LPCTSTR stp, rest, tail, ssp, oldssp, sep;
1286int ssub, esub;
1287int es;
1288int i;
1289 for(int ss=from;ss<to;ss = es){
1290 // Identify end of SubRE
1291 es = ss;
1292 switch(m_Strip[es].m_Operator){
1293 case CSop::opPlus0:
1294 case CSop::opQuest0:
1295 es+=m_Strip[es].m_Operand;
1296 break;
1297 case CSop::opChoice0:
1298 while(m_Strip[es].m_Operator!=CSop::opChoice1)
1299 es+=m_Strip[es].m_Operand;
1300 break;
1301 }
1302 es++;
1303 // Figure out what it matched
1304 switch(m_Strip[ss].m_Operator){
1305 case CSop::opEnd:
1306 ASSERT(FALSE);
1307 break;
1308 case CSop::opChar:
1309 sp++;
1310 break;
1311 case CSop::opBOL:
1312 case CSop::opEOL:
1313 case CSop::opBOW:
1314 case CSop::opEOW:
1315 break;
1316 case CSop::opAny:
1317 case CSop::opAnyOf:
1318 sp++;
1319 break;
1320 case CSop::opBackRef0:
1321 case CSop::opBackRef1:
1322 ASSERT(FALSE);
1323 break;
1324 // Cases where lenght of match is hard to find..
1325 case CSop::opQuest0:
1326 stp=end;
1327 for(;;){
1328 // How long could this one be??
1329 rest = MatchSlow(sp,stp,ss,es);
1330 ASSERT(rest);// It did match.. It should've..
1331 // Could the rest match the rest? (good question)
1332 tail = MatchSlow(rest,end,es,to);
1333 if(tail==end)
1334 break;// yup.
1335 // nope, try a shorter match for this one..
1336 stp=rest-1;
1337 ASSERT(stp>=sp);// It did work.. It should've..
1338 }
1339 ssub=ss+1;
1340 esub=es-1;
1341 // Did innards match?
1342 if(MatchSlow(sp,rest,ssub,esub)){
1343 dp = MatchDissect(sp,rest,ssub,esub);
1344 ASSERT(dp==rest);
1345 }else// nope..
1346 ASSERT(sp==rest);
1347 sp = rest;
1348 break;
1349 case CSop::opPlus0:
1350 stp=end;
1351 for(;;){
1352 // How long could this one be??
1353 rest = MatchSlow(sp,stp,ss,es);
1354 ASSERT(rest);// It did.. It should've..
1355 // Could the rest match the rest?
1356 tail = MatchSlow(rest,end,es,to);
1357 if(tail==end)
1358 break;// yup.
1359 // nope..
1360 stp=rest-1;
1361 ASSERT(stp>=sp);// It should've worked, we believe..
1362 }
1363 ssub=ss+1;
1364 esub=es-1;
1365 ssp=sp;
1366 oldssp=ssp;
1367 for(;;){// Find last match of innards..
1368 sep = MatchSlow(ssp,rest,ssub,esub);
1369 if((!sep) || sep==ssp)
1370 break; // Failed or matched nothin'
1371 oldssp=ssp;
1372 ssp=sep;
1373 }
1374 if(!sep){
1375 // Last successfull..
1376 sep=ssp;
1377 ssp=oldssp;
1378 }
1379 ASSERT(sep=rest);// Must exhaust substring they say..
1380 VERIFY(MatchSlow(ssp,sep,ssub,esub)==rest);// Assert or verify - that is the question..
1381 dp = MatchDissect(ssp,sep,ssub,esub);
1382 ASSERT(dp==sep);
1383 sp=rest;
1384 break;
1385 case CSop::opChoice0:
1386 stp = end;
1387 for(;;){
1388 // how long..
1389 rest = MatchSlow(sp,stp,ss,es);
1390 ASSERT(rest);
1391 // Could it..
1392 tail = MatchSlow(rest,end,es,to);
1393 if(tail==end)
1394 break;// y
1395 // n
1396 stp = rest-1;
1397 ASSERT(stp>=sp);
1398 }
1399 ssub=ss+1;
1400 esub=ss+m_Strip[ss].m_Operand-1;
1401 ASSERT(m_Strip[esub].m_Operator==CSop::opOr0);
1402 for(;;){// Find first matching branch..
1403 if(MatchSlow(sp,rest,ssub,esub)==rest)
1404 break;
1405 // this one missed, try next..
1406 ASSERT(m_Strip[esub].m_Operator==CSop::opOr0);
1407 esub++;
1408 ASSERT(m_Strip[esub].m_Operator==CSop::opOr1);
1409 ssub=esub+1;
1410 esub+=m_Strip[esub].m_Operand;
1411 if(m_Strip[esub].m_Operator==CSop::opOr1)
1412 esub--;
1413 else
1414 ASSERT(m_Strip[esub].m_Operator==CSop::opChoice1);
1415 }
1416 dp = MatchDissect(sp,rest,ssub,esub);
1417 ASSERT(dp==rest);
1418 sp=rest;
1419 break;
1420 case CSop::opPlus1:
1421 case CSop::opQuest1:
1422 case CSop::opOr0:
1423 case CSop::opOr1:
1424 case CSop::opChoice1:
1425 ASSERT(FALSE);
1426 break;
1427 case CSop::opLeftParen:
1428 i = m_Strip[ss].m_Operand;
1429 ASSERT(0<i && i<=m_Subexps);
1430 m_Matches[i].m_Begin=sp-m_mBegin;
1431 break;
1432 case CSop::opRightParen:
1433 i = m_Strip[ss].m_Operand;
1434 ASSERT(0<i && i<=m_Subexps);
1435 m_Matches[i].m_End=sp-m_mBegin;
1436 break;
1437#ifdef _DEBUG
1438 default:// Uh.. oh..
1439 ASSERT(FALSE);
1440 break;
1441#endif
1442 }
1443 }
1444 ASSERT(sp==end);
1445 return sp;
1446}
1447
1448LPCTSTR CRegEx::MatchBackRef(LPCTSTR begin,LPCTSTR end,int from,int to,int level)
1449{
1450LPCTSTR sp = begin;
1451BOOL hard = FALSE;
1452 // Get as far as we can as long as it's easy
1453 for(int ss=from;!hard && ss<to;ss++){
1454 CSop s = m_Strip[ss];
1455 switch(s.m_Operator){
1456 case CSop::opChar:
1457 if(sp==end || *sp++ != s.m_Operand)
1458 return NULL;
1459 break;
1460 case CSop::opAny:
1461 if(sp==end)
1462 return NULL;
1463 sp++;// I'm sure this ++ could be embedded in previous expression..
1464 break;
1465 case CSop::opAnyOf:
1466 if(sp==end || !m_Sets[s.m_Operand].IsIn(*sp++))
1467 return NULL;
1468 break;
1469 case CSop::opBOL:
1470 if(!((sp==m_mBegin && !(m_mFlags&regNotBOL)) || (sp<m_mEnd && *(sp-1)=='\n' && (m_Flags&regNewLine))))
1471 return NULL;
1472 break;
1473 case CSop::opEOL:
1474 if(!((sp==m_mEnd && !(m_mFlags&regNotEOL)) || (sp<m_mEnd && *sp=='\n' && (m_Flags&regNewLine))))
1475 return NULL;
1476 break;
1477 case CSop::opBOW:
1478 if(!(((sp==m_mBegin && !(m_mFlags&regNotBOL)) || (sp<m_mEnd && *(sp-1)=='\n' && (m_Flags&regNewLine)) || (sp>m_mBegin && !isWordableChar(*(sp-1)))) && (sp<m_mEnd && isWordableChar(*sp))))
1479 return NULL;
1480 break;
1481 case CSop::opEOW:
1482 if(!(((sp==m_mEnd && !(m_mFlags&regNotEOL)) || (sp<m_mEnd && *sp=='\n' && (m_Flags&regNewLine)) || (sp<m_mEnd && !isWordableChar(*sp))) && (sp>m_mBegin && isWordableChar(*(sp-1)))))
1483 return NULL;
1484 break;
1485 case CSop::opQuest1:
1486 break;
1487 case CSop::opOr0:// Matches null, but needs to skip
1488 ss++;
1489 s = m_Strip[ss];
1490 do{
1491 ASSERT(s.m_Operator==CSop::opOr1);
1492 ss+=s.m_Operand;
1493 }while((s=m_Strip[ss]).m_Operator!=CSop::opChoice1);
1494 // Now we should note that ss++ gets us past the Choice1..
1495 break;
1496 default:
1497 // Have to make a choice..
1498 hard=TRUE;
1499 break;
1500 }
1501 }
1502 if(!hard){// That was it..
1503 if(sp!=end)
1504 return NULL;
1505 return sp;
1506 }
1507 ss--;// Adjust for ther for's final increment..
1508 // Hard stuff.. is going on and on..
1509CSop s = m_Strip[ss];
1510int i, len, offsave;
1511int ssub,esub;
1512LPCTSTR ssp, dp;
1513 switch(s.m_Operator){
1514 case CSop::opBackRef0:// The vilest depths they say.. If I only knew what the 'viles' stands for..
1515 i = s.m_Operand;
1516 ASSERT(0<i && i<=m_Subexps);
1517 if(m_Matches[i].m_End<0)
1518 return NULL;
1519 ASSERT(m_Matches[i].m_Begin>=0);
1520 len = m_Matches[i].GetLength();
1521 ASSERT((end-m_mBegin)>=len);
1522 if(sp>end-len)
1523 return NULL;// Not enough left to match..
1524 ssp = m_mBegin+m_Matches[i].m_Begin;
1525 if(memcmp(sp,ssp,len))
1526 return NULL;
1527 while(m_Strip[ss]!=CSop(CSop::opBackRef1,i))
1528 ss++;
1529 return MatchBackRef(sp+len,end,ss+1,to,level-1);
1530 break;
1531 case CSop::opQuest0:// to null or not they wonder..
1532 dp = MatchBackRef(sp,end,ss+1,to,level);
1533 if(dp)
1534 return dp;// not..
1535 return MatchBackRef(sp,end,ss+s.m_Operand+1,to,level-1);
1536 break;
1537 case CSop::opPlus0:
1538 ASSERT(m_mLastPos.GetSize());
1539 ASSERT(level+1 <= m_Pluses);
1540 m_mLastPos[level+1]=sp;
1541 return MatchBackRef(sp,end,ss+1,to,level+1);
1542 break;
1543 case CSop::opPlus1:
1544 if(sp == m_mLastPos[level])// Last pass matched null
1545 return MatchBackRef(sp,end,ss+1,to,level-1);
1546 // Try another pass..
1547 m_mLastPos[level]=sp;
1548 dp = MatchBackRef(sp,end,ss-s.m_Operand+1,to,level);
1549 if(dp)
1550 return dp;
1551 return MatchBackRef(sp,end,ss+1,to,level-1);
1552 break;
1553 case CSop::opChoice0:// find the right one, ifany
1554 ssub = ss+1;
1555 esub = ss+s.m_Operand-1;
1556 ASSERT(m_Strip[esub].m_Operator==CSop::opOr0);
1557 for(;;){// Find first matching branch.
1558 dp = MatchBackRef(sp,end,ssub,esub,level);
1559 if(dp)
1560 return dp;
1561 // This one missed, try next one..
1562 if(m_Strip[esub].m_Operator==CSop::opChoice1)
1563 return NULL;// There is none..
1564 esub++;
1565 ASSERT(m_Strip[esub].m_Operator==CSop::opOr1);
1566 ssub=esub+1;
1567 esub+=m_Strip[esub].m_Operand;
1568 if(m_Strip[esub].m_Operator==CSop::opOr1)
1569 esub--;
1570 else
1571 ASSERT(m_Strip[esub].m_Operator==CSop::opChoice1);
1572 }
1573 break;
1574 case CSop::opLeftParen:// Must undo assignment if rest fails..
1575 i = s.m_Operand;
1576 ASSERT(0<i && i<=m_Subexps);
1577 offsave = m_Matches[i].m_Begin;
1578 m_Matches[i].m_Begin = sp-m_mBegin;
1579 dp = MatchBackRef(sp,end,ss+1,to,level);
1580 if(dp)
1581 return dp;
1582 m_Matches[i].m_Begin=offsave;
1583 return NULL;
1584 break;
1585 case CSop::opRightParen: // Must undo assignment if rest fails..
1586 i = s.m_Operand;
1587 ASSERT(0<i && i<=m_Subexps);
1588 offsave = m_Matches[i].m_End;
1589 m_Matches[i].m_End = sp-m_mBegin;
1590 dp = MatchBackRef(sp,end,ss+1,to,level);
1591 if(dp)
1592 return dp;
1593 m_Matches[i].m_End = offsave;
1594 return NULL;
1595 break;
1596 #ifdef_DEBUG
1597 default:
1598 ASSERT(FALSE);
1599 break;
1600#endif
1601 }
1602 ASSERT(FALSE);
1603 return NULL;// Anyway - we can never get here..
1604}
1605
1606 #ifdef_DEBUG
1607void CRegEx::CSop::Dump(CDumpContext& dc)
1608{
1609 switch(m_Operator){
1610 case opEnd:
1611 dc << "end";
1612 break;
1613 case opChar:
1614 dc << "char('" << (char)m_Operand << "')";
1615 break;
1616 case opBOL:
1617 dc << "BOL";
1618 break;
1619 case opEOL:
1620 dc << "EOL";
1621 break;
1622 case opAny:
1623 dc << "any";
1624 break;
1625 case opAnyOf:
1626 dc << "anyOf(" << m_Operand << ")";
1627 break;
1628 case opBackRef0:
1629 dc << "[ backref(" << m_Operand << ")";
1630 break;
1631 case opBackRef1:
1632 dc << "] backref(" << m_Operand << ")";
1633 break;
1634 case opPlus0:
1635 dc << "[ + (" << m_Operand << ")";
1636 break;
1637 case opPlus1:
1638 dc << "] + (" << m_Operand << ")";
1639 break;
1640 case opQuest0:
1641 dc << "[ ? (" << m_Operand << ")";
1642 break;
1643 case opQuest1:
1644 dc << "] ? (" << m_Operand << ")";
1645 break;
1646 case opLeftParen:
1647 dc << "[ ( (" << m_Operand << ")";
1648 break;
1649 case opRightParen:
1650 dc << "] ) (" << m_Operand << ")";
1651 break;
1652 case opChoice0:
1653 dc << "[ choice (" << m_Operand << ")";
1654 break;
1655 case opOr0:
1656 dc << "[ | (" << m_Operand << ")";
1657 break;
1658 case opOr1:
1659 dc << "] | (" << m_Operand << ")";
1660 break;
1661 case opChoice1:
1662 dc << "] choice (" << m_Operand << ")";
1663 break;
1664 case opBOW:
1665 dc << "BOW";
1666 break;
1667 case opEOW:
1668 dc << "EOW";
1669 break;
1670 }
1671}
1672void CRegEx::DumpStrip(CDumpContext& dc)
1673{
1674 for(int tmp=0;tmp<m_Strip.GetSize();tmp++)
1675 dc << tmp << ": " << m_Strip[tmp] << ";\n";
1676}
1677#endif
1678
1679
1680CString CRegEx::GetMatch(int match)
1681{
1682CString rv;
1683 if(!m_Matches.GetSize())
1684 return rv;
1685 ASSERT(m_Matches[0].m_Begin<m_Input.GetLength() && m_Matches[0].m_End<=m_Input.GetLength());
1686 if(match==matchPreMatch)
1687 return m_Input.Left(m_Matches[0].m_Begin);
1688 if(match==matchPostMatch)
1689 return m_Input.Mid(m_Matches[0].m_End);
1690 if(match<0 || match >= m_Matches.GetSize())
1691 return rv;
1692 if(m_Matches[match].m_Begin<0 || m_Matches[match].m_End<0)
1693 return rv;
1694 ASSERT(m_Matches[match].m_Begin<m_Input.GetLength() && m_Matches[match].m_End<=m_Input.GetLength());
1695 rv = m_Input.Mid(m_Matches[match].m_Begin,m_Matches[match].m_End-m_Matches[match].m_Begin);
1696 return rv;
1697}