#include "../stdafx.h" #include "RegEx.h" #define isWordableChar(c) (isalnum(c) || (c)=='_') BOOL CRegEx::Compile(LPCTSTR regex,int flags) { ASSERT(!((flags®Extended) && (flags®Literal))); m_Matches.RemoveAll(); m_Strip.RemoveAll(); m_Strip.SetSize(0,15); m_Pattern=regex; m_ParsePointer=0; m_Error=0; m_Sets.RemoveAll(); m_Flags=flags; m_iFlags=0; m_BOLs=m_EOLs=0; m_Subexps = 0; m_Categories=1; // 0 is 'everything else' m_bBackRefs=FALSE; memset(m_Category,0,sizeof(m_Category)); // Go ahead. m_Error || m_Strip.Add(CSop(CSop::opEnd)); if(flags®Extended){ ParseERE(); }else if(flags®Literal){ ParseLiteral(); }else{ ParseBRE(); } m_Error || m_Strip.Add(CSop(CSop::opEnd)); Categorize(); m_Strip.FreeExtra(); FigureMust(); m_Pluses=CountPluses(); if(m_iFlags&iflagsBad){ m_Error || (m_Error=regeAssert); // ??? point to nuls? ;-) } // We may wish to free some memory here if we're erroneous (ie. m_Error..) m_ParseParens.RemoveAll(); #ifdef _DEBUG if(m_Error){ CString tmp; tmp.Format("RE: ParseError: %d\n",m_Error); TRACE0(tmp); } // DumpStrip(afxDump); #endif return (m_bCompiled=(!m_Error)); } BOOL CRegEx::Match(LPCTSTR src,int flags) { if(!m_bCompiled) return FALSE; if(m_iFlags&iflagsBad) return FALSE; m_Input=src; m_mFlags=flags; m_mPointer=m_Input; m_mBegin=m_Input; m_mEnd=&m_mBegin[m_Input.GetLength()]; ASSERT(m_mPointer<=m_mEnd); m_Matches.RemoveAll(); if(!m_Must.IsEmpty()){ if(m_Input.Find(m_Must)<0) return FALSE; } // Go ahead.. int stripLen = m_Strip.GetSize(); m_mLastPos.SetSize(0); for(int tmp=0;tmp0 && !m_mLastPos.GetSize()) m_mLastPos.SetSize(m_Pluses); dp = MatchBackRef(m_cOldP,endp,1,stripLen-1,0); } if(dp) break; // Uh.. oh.. we couldn't find a subexpression-level match ASSERT(m_bBackRefs); ASSERT(m_Pluses==0 || m_mLastPos.GetSize()); for(;;){ if(dp || endp <= m_cOldP) break; // defeat.. ? endp = MatchSlow(m_cOldP,endp-1,1,stripLen-1); if(!endp) break; // defeat.. ? // Try it on a shorter possibility.. #ifdef _DEBUG for(tmp=1;tmp<=m_Subexps;tmp++) ASSERT(m_Matches[tmp].m_Begin<0 && m_Matches[tmp].m_End<0); #endif dp = MatchBackRef(m_cOldP,endp,1,stripLen-1,0); } ASSERT((!dp) || dp==endp); if(dp) // Found a shorter one.. break; // Despite initial appearances, there is no match here beginp = m_cOldP+1; ASSERT(beginp<=m_mEnd); } // Fill in the detail if so requested.. if(!(m_mFlags®NoSubExpressions)){ if(!m_Matches.GetSize()) m_Matches.SetSize(1); m_Matches[0].m_Begin=m_cOldP-m_mBegin; m_Matches[0].m_End=endp-m_mBegin; } m_mLastPos.RemoveAll(); return TRUE; } CString CRegEx::Replace(LPCTSTR src,LPCTSTR rep,int flags) { // *** return CString(); } void CRegEx::ParseERE(int stop) { UCHAR c; BOOL first=TRUE; int prevF, prevB; for(;;){ int co = m_Strip.GetSize(); while(m_ParsePointer < m_Pattern.GetLength() && ((c=m_Pattern[m_ParsePointer])!='|') && c!=stop) ParseEREexp(); if(m_Strip.GetSize()==co){ m_Error || (m_Error=regeEmpty); // ??? point to nuls? } if(m_ParsePointer>=m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!='|') break; else m_ParsePointer++; if(first){ StripInsert(co,CSop(CSop::opChoice0,m_Strip.GetSize()-co+1)); prevF = prevB = co; first=FALSE; } m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-prevB)); prevB = m_Strip.GetSize()-1; m_Error || (m_Strip[prevF].m_Operand=m_Strip.GetSize()-prevF); prevF = m_Strip.GetSize(); m_Error || m_Strip.Add(CSop(CSop::opOr1,0)); // offset isn't really right.. very so.. } if(!first){ m_Error || (m_Strip[prevF].m_Operand=m_Strip.GetSize()-prevF); m_Error || m_Strip.Add(CSop(CSop::opChoice1,m_Strip.GetSize()-prevB)); } ASSERT(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer]==stop); } void CRegEx::ParseEREexp() { ASSERT(m_ParsePointer < m_Pattern.GetLength()); UCHAR c = m_Pattern[m_ParsePointer++]; int pos = m_Strip.GetSize(); int subno; int count, count2; BOOL wascaret=FALSE; switch(c){ case '(': if(!(m_ParsePointer=m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!=')') ParseERE(')'); VERIFY(m_ParseParens[m_Subexps].m_End = m_Strip.GetSize()); m_Error || m_Strip.Add(CSop(CSop::opRightParen,subno)); if(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer++]!=')'){ TRACE0("RE: No matching ')'\n"); if(!m_Error) m_Error = regeParen; // ??? point to nuls? } break; case '^': m_Error || m_Strip.Add(CSop(CSop::opBOL)); m_iFlags|=iflagsUseBOL; m_BOLs++; wascaret=TRUE; break; case '$': m_Error || m_Strip.Add(CSop(CSop::opEOL)); m_iFlags|=iflagsUseEOL; m_EOLs++; break; case '|': TRACE0("RE: '|' outside of expression\n"); if(!m_Error) m_Error = regeEmpty; // ??? point to nuls? break; case '*': case '+': case '?': TRACE0("RE: '*'/'+'/'?' with no previous expression\n"); if(!m_Error) m_Error = regeBadRepeat; // ??? point to nuls? break; case '.': if(m_Flags®NewLine) EmitNonNewLineAny(); else m_Error || m_Strip.Add(CSop(CSop::opAny)); break; case '[': ParseBracket(); break; case '\\': if(m_ParsePointer >= m_Pattern.GetLength()){ TRACE0("RE: '\\' at the end of the pattern\n"); if(!m_Error) m_Error = regeEscape; // ??? point to nuls? }else{ c = m_Pattern[m_ParsePointer++]; EmitOrdinary(c); } break; case '{': if(m_ParsePointer >= m_Pattern.GetLength() || !isdigit(m_Pattern[m_ParsePointer])){ TRACE0("RE: '{' with no repeat count\n"); if(!m_Error) m_Error = regeBadRepeat; // ??? point to nuls? } // Fallthrough.. default: EmitOrdinary(c); break; } if(m_ParsePointer >= m_Pattern.GetLength()) return; c = m_Pattern[m_ParsePointer]; // Call a '{' repetition if followed by a digit if (!(c=='*' || c=='+' || c=='?' || ( c=='{' && (m_ParsePointer+1) < m_Pattern.GetLength() && isdigit(m_Pattern[m_ParsePointer+1])) )) return; // No repetitor - done. m_ParsePointer++; if(wascaret){ TRACE0("RE: repetitive '^' detected\n"); if(!m_Error) m_Error = regeBadRepeat; // ??? point to nuls? } switch(c){ case '*': // Implemeted as +? // + expression StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1)); m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos)); // ? expression StripInsert(pos,CSop(CSop::opQuest0,m_Strip.GetSize()-pos+1)); m_Error || m_Strip.Add(CSop(CSop::opQuest1,m_Strip.GetSize()-pos)); break; case '+': // + expression StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1)); m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos)); break; case '?': // Kludge - emit y? as (y|) until subtle bug gets fixed :-) StripInsert(pos,CSop(CSop::opChoice0,m_Strip.GetSize()-pos+1)); m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-pos)); m_Error || (m_Strip[pos].m_Operand=m_Strip.GetSize()-pos); m_Error || m_Strip.Add(CSop(CSop::opOr1,1)); m_Error || m_Strip.Add(CSop(CSop::opChoice1,2)); break; case '{': count = ParseCount(); if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]==','){ m_ParsePointer++; if(isdigit(m_Pattern[m_ParsePointer])){ // HHH Personally, I doubt it is always available count2=ParseCount(); if(!(count<=count2)){ TRACE0("RE: Disbalanced counts in '{}' repeater\n"); m_Error || (m_Error=regeBadBrace); // ??? point to nuls? } }else // Single number with comma count2=256; // Infinity }else // Single number count2=count; EmitRepeat(pos,count,count2); if(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!='}'){ // No '}'.. TRACE0("RE: No immediately following '}' of '{' expression\n"); while(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]!='}') m_ParsePointer++; if(m_ParsePointer >= m_Pattern.GetLength()){ TRACE0("RE: No closing '}' found\n"); m_Error || (m_Error=regeBrace); }else m_Error || (m_Error=regeBadBrace); // ??? point to nuls? }else m_ParsePointer++; break; } if(m_ParsePointer >= m_Pattern.GetLength()) return; c = m_Pattern[m_ParsePointer]; if(!(c=='*' || c=='+' || c=='?' || (c=='{' && (m_ParsePointer+1)=pos) m_ParseParens[tmp].m_Begin++; if(m_ParseParens[tmp].m_End>=pos) m_ParseParens[tmp].m_End++; } } void CRegEx::EmitOrdinary(UCHAR c) { if(m_Flags®IgnoreCase && isalpha(c) && (tolower(c) !=toupper(c))){ // Emit both cases CString savePattern = m_Pattern; int savePointer = m_ParsePointer; m_Pattern=c; m_Pattern+=']'; m_ParsePointer=0; ParseBracket(); m_Pattern=savePattern; m_ParsePointer=savePointer; }else{ m_Error || m_Strip.Add(CSop(CSop::opChar,c)); if(!m_Category[(BYTE)c]) m_Category[(BYTE)c]=m_Categories++; } } void CRegEx::EmitNonNewLineAny() { // Kludges're going on and on.. CString savePattern = m_Pattern; int savePointer = m_ParsePointer; m_Pattern="^\n]"; m_ParsePointer=0; ParseBracket(); m_Pattern=savePattern; m_ParsePointer=savePointer; } int CRegEx::ParseCount() { BOOL nonEmpty=FALSE; int rv = 0; UCHAR c; while(m_ParsePointer < m_Pattern.GetLength() && isdigit(c=m_Pattern[m_ParsePointer]) && rv <=255){ rv = rv*10 + c-'0'; nonEmpty=TRUE; m_ParsePointer++; } if(rv>255 || !nonEmpty){ m_Error || (m_Error=regeBadBrace); // ??? point to nuls? } return rv; } void CRegEx::ParseBracket() { // Dept. of truly sickening special case kludges if((m_ParsePointer+5) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,6).Compare("[:<]]")){ m_Error || m_Strip.Add(CSop(CSop::opBOW)); m_ParsePointer+=6; return; } if((m_ParsePointer+5) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,6).Compare("[:>]]")){ m_Error || m_Strip.Add(CSop(CSop::opEOW)); m_ParsePointer+=6; return; } BOOL invert=TRUE; if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='^') m_ParsePointer++; else invert=FALSE; CSet cset; if(m_ParsePointer < m_Pattern.GetLength()){ switch(m_Pattern[m_ParsePointer]){ case ']': case '-': cset.Add(m_Pattern[m_ParsePointer]); m_ParsePointer++; break; } } while(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]!=']' && !((m_ParsePointer+1)=0;tmp--){ if(cset.IsIn(tmp) && isalpha(tmp) && (toupper(tmp)!=tolower(tmp))) cset.Add(isupper(tmp)?tolower(tmp):toupper(tmp)); } /* if(!cset->m_Multis.IsEmpty()) cset.CollatingCase(); */ } if(invert){ for(int tmp=CSet::size-1;tmp>=0;tmp--) if(cset.IsIn(tmp)) cset.Sub(tmp); else cset.Add(tmp); if(m_Flags®NewLine) cset.Sub('\n'); /* if(!cset.m_Multis.IsEmpty()) cset.CollatingInvert(); */ } UCHAR c = cset.GetOnly(); if(c){ EmitOrdinary(c); }else m_Error || m_Strip.Add(CSop(CSop::opAnyOf,StoreSet(cset))); } void CRegEx::CSet::Add(UCHAR c) { m_Set[(BYTE)c]=TRUE; m_Hash+=c; } BOOL CRegEx::CSet::IsIn(UCHAR c) { return m_Set[(BYTE)c]; } void CRegEx::CSet::Sub(UCHAR c) { m_Set[(BYTE)c]=FALSE; m_Hash-=c; } UCHAR CRegEx::CSet::GetOnly() { int rv = 0; UCHAR only = 0; for(int tmp=0;tmp=m_Pattern.GetLength()){ m_Error || (m_Error=regeBracket); // ??? point to nuls? } c = m_Pattern[m_ParsePointer]; if(c== '-' || c==']'){ m_Error || (m_Error=regeCType); // ??? point to nuls? } ParseBracketCClass(cset); if(m_ParsePointer>=m_Pattern.GetLength()){ m_Error || (m_Error=regeBracket); // ??? point to nuls? } if((m_ParsePointer+1)>=m_Pattern.GetLength() || (m_Pattern.Mid(m_ParsePointer,2).Compare(":]"))){ m_Error || (m_Error=regeCType); // ??? point to nuls? }else m_ParsePointer+=2; break; case '=': // Equivalence class m_ParsePointer+=2; if(m_ParsePointer >= m_Pattern.GetLength()){ m_Error || (m_Error=regeBracket); // ??? point to nuls? } c = m_Pattern[m_ParsePointer]; if(c== '-' || c==']'){ m_Error || (m_Error=regeCollate); // ??? point to nuls? } ParseBracketEClass(cset); if((m_ParsePointer+1)>=m_Pattern.GetLength() || (m_Pattern.Mid(m_ParsePointer,2).Compare("=]"))){ m_Error || (m_Error=regeCollate); // ??? point to nuls? }else m_ParsePointer+=2; break; default: // Symbol, character or range { UCHAR start, finish; start = ParseBracketSymbol(); if((m_ParsePointer((BYTE)finish)){ m_Error || (m_Error=regeRange); // ??? point to nuls? } for(int tmp=start;tmp<=(BYTE)finish;tmp++) cset.Add(tmp); } break; } } void CRegEx::ParseBracketCClass(CSet& cset) { static struct { char *className; char *classChars; } cc[] = { {"alnum","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"}, {"alpha","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"}, {"blank"," \t"}, {"cntrl","\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\25\26\27\30\31\32\33\34\35\36\37\177"}, {"digit","0123456789"}, {"graph","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"}, {"lower","abcdefghijklmnopqrstuvwxyz"}, {"print","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ "}, {"punct","!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"}, {"space","\t\n\v\f\r "}, {"upper","ABCDEFGHIJKLMNOPQRSTUVWXYZ"}, {"xdigit","0123456789ABCDEFabcdef"} }; CString cclass; UCHAR c; while(m_ParsePointer < m_Pattern.GetLength() && isalpha(c=m_Pattern[m_ParsePointer])){ cclass+=c; m_ParsePointer++; } char *classChars = NULL; for(int tmp=0;tmp<(sizeof(cc)/sizeof(cc[0]));tmp++){ if(!cclass.CompareNoCase(cc[tmp].className)){ classChars=cc[tmp].classChars; break; } } if(!classChars){ m_Error || (m_Error=regeCType); // ??? point to nuls? return; } while(*classChars) cset.Add(*(classChars++)); // --- multis } void CRegEx::ParseBracketEClass(CSet& cset) { cset.Add(ParseBracketCollatingElement('='));; } UCHAR CRegEx::ParseBracketCollatingElement(UCHAR term) { static struct { char *entityName; char entity; } cc[] = { {"NUL",'\0'},{"SOH",'\001'},{"STX",'\002'},{"ETX",'\003'},{"EOT",'\004'},{"ENQ",'\005'},{"ACK",'\006'},{"BEL",'\007'},{"alert",'\007'},{"BS",'\010'},{"backspace",'\b'},{"HT",'\011'},{"tab",'\t'},{"LF",'\012'},{"newline",'\n'},{"VT",'\013'},{"vertical-tab",'\v'},{"FF",'\014'},{"form-feed",'\f'},{"CR",'\015'},{"carriage-return",'\r'},{"SO",'\016'},{"SI",'\017'},{"DLE",'\020'},{"DC1",'\021'},{"DC2",'\022'},{"DC3",'\023'},{"DC4",'\024'},{"NAK",'\025'},{"SYN",'\026'},{"ETB",'\027'},{"CAN",'\030'},{"EM",'\031'},{"SUB",'\032'},{"ESC",'\033'},{"IS4",'\034'},{"FS",'\034'},{"IS3",'\035'},{"GS",'\035'},{"IS2",'\036'},{"RS",'\036'},{"IS1",'\037'},{"US",'\037'},{"space",' '},{"exclamation-mark",'!'},{"quotation-mark",'"'},{"number-sign",'#'},{"dollar-sign",'$'},{"percent-sign",'%'},{"ampersand",'&'},{"apostrophe",'\''},{"left-parenthesis",'('},{"right-parenthesis",')'},{"asterisk",'*'},{"plus-sign",'+'},{"comma",','},{"hyphen",'-'},{"hyphen-minus",'-'},{"period",'.'},{"full-stop",'.'},{"slash",'/'},{"solidus",'/'},{"zero",'0'},{"one",'1'},{"two",'2'},{"three",'3'},{"four",'4'},{"five",'5'},{"six",'6'},{"seven",'7'},{"eight",'8'},{"nine",'9'},{"colon",':'},{"semicolon",';'},{"less-than-sign",'<'},{"equals-sign",'='},{"greater-than-sign",'>'},{"question-mark",'?'},{"commercial-at",'@'},{"left-square-bracket",'['},{"backslash",'\\'},{"reverse-solidus",'\\'},{"right-square-bracket",']'},{"circumflex",'^'},{"circumflex-accent",'^'},{"underscore",'_'},{"low-line",'_'},{"grave-accent",'`'},{"left-brace",'{'},{"left-curly-bracket",'{'},{"vertical-line",'|'},{"right-brace",'}'},{"right-curly-bracket",'}'},{"tilde",'~'},{"DEL",'\177'} }; CString seeTwo; seeTwo=term; seeTwo+=']'; CString entityName; while(m_ParsePointer=m_Pattern.GetLength()){ m_Error || (m_Error=regeBracket); // ??? point to nuls? return 0; } for(int tmp=0;tmp<(sizeof(cc)/sizeof(cc[0]));tmp++) if(!entityName.CompareNoCase(cc[tmp].entityName)) return cc[tmp].entity; if(entityName.GetLength()==1) return entityName[0]; m_Error || (m_Error=regeCollate); // ??? point to nuls? return 0; } UCHAR CRegEx::ParseBracketSymbol() { if(m_ParsePointer>=m_Pattern.GetLength()){ m_Error || (m_Error=regeBracket); // ??? point to nuls? } if((m_ParsePointer+1)mustLength){ mustStart=seqStart; mustLength=seqLength; } seqLength=0; break; } } // Hmm.. originally it's meant to be do while not opEnd.. if(!mustLength) return; // Turn into string, but, wait, personally I'm sure it could be put in the main loop.. or maybe not.. for(tmp=0;tmprv) rv = nest; nest--; break; } } // Again, originally we were supposed to scan till opEnd.. if(nest) m_iFlags|=iflagsBad; // Could this be an ASSERTion? return rv; } void CRegEx::ParseLiteral() { if(!m_Pattern.GetLength()){ m_Error || (m_Error=regeEmpty); // ??? point to nuls? } while(m_ParsePointer < m_Pattern.GetLength()) EmitOrdinary(m_Pattern[m_ParsePointer++]); } void CRegEx::ParseBRE(int stopa,int stopb) { int start = m_Strip.GetSize(); BOOL first=TRUE; BOOL wasDollar=FALSE; if(m_ParsePointer=m_Pattern.GetLength()){ m_Error || (m_Error=regeEscape); // ??? point to nuls }else c = 0x100|m_Pattern[m_ParsePointer++]; } switch(c){ case '.': if(m_Flags®NewLine) EmitNonNewLineAny(); else m_Error || m_Strip.Add(CSop(CSop::opAny,0)); break; case '[': ParseBracket(); break; case 0x100|'{': m_Error || (m_Error=regeBadRepeat); // ??? point to nuls? break; case 0x100|'(': m_Subexps++; subno=m_Subexps; m_ParseParens.SetAtGrow(m_Subexps,CParenthesis(m_Strip.GetSize())); m_Error || m_Strip.Add(CSop(CSop::opLeftParen,subno)); if(m_ParsePointercount2){ m_Error || (m_Error=regeBadBrace); // ??? poin to nuls? } }else // Single number with comma count2=256; }else // Single number count2=count; EmitRepeat(pos,count,count2); if((m_ParsePointer+1)>=m_Pattern.GetLength() || m_Pattern.Mid(m_ParsePointer,2).Compare("\\}")){ while(m_ParsePointer=m_Pattern.GetLength()){ m_Error || (m_Error=regeBrace); // ??? point to nuls? } m_Error || (m_Error=regeBadBrace); }else m_ParsePointer+=2; }else if(c=='$') return TRUE; return FALSE; } CRegEx::CRegEx() { m_bCompiled=FALSE; } LPCTSTR CRegEx::MatchFast(LPCTSTR begin) { MatchStatesClear(CSop::stCurrent); m_Strip[1].m_MatchData|=CSop::stCurrent; int stripLen = m_Strip.GetSize(); MatchStep(1,stripLen-1,CSop::stCurrent,charNothing,CSop::stCurrent); MatchStatesCopy(CSop::stFresh,CSop::stCurrent); LPCTSTR coldp = NULL; LPCTSTR p = begin; int c = (begin==m_mBegin)?charOut:((int)(BYTE)m_mPointer[-1]); for(;;){ // next character.. int lastc = c; c = (p==m_mEnd)?charOut:(int)*(BYTE*)p; if(MatchStatesEqual(CSop::stCurrent,CSop::stFresh)) coldp=p; // Is there an EOL and/or BOL between lastc and c? - they ask.. int flagc=0; int i = 0; if((lastc=='\n' && m_Flags®NewLine) || (lastc==charOut && !(m_mFlags®NotBOL))){ flagc=charBOL; i=m_BOLs; } if((c=='\n' && m_Flags®NewLine) || (c==charOut && !(m_mFlags®NotEOL))){ flagc=(flagc==charBOL)?charBOLEOL:charEOL; i+=m_EOLs; } if(i){ for(;i>0;i--) MatchStep(1,stripLen-1,CSop::stCurrent,flagc,CSop::stCurrent); } // What about a word boundary? - they wonder.. if((flagc==charBOL || (lastc!=charOut && !isWordableChar(c))) && (c!=charOut && isWordableChar(c))) flagc = charBOW; if((lastc!=charOut && isWordableChar(lastc)) && (flagc==charEOL || (c!=charOut && !isWordableChar(c)))) flagc = charEOW; if(flagc==charBOW || flagc==charEOW){ MatchStep(1,stripLen-1,CSop::stCurrent,flagc,CSop::stCurrent); } // Are we done? Now WE wonder.. if((m_Strip[stripLen-1].m_MatchData&CSop::stCurrent) || p==m_mEnd) break; // They insist I need to note break out.. Okay, I do.. // Nope, we're not done. We have to face this character.. MatchStatesCopy(CSop::stTemp,CSop::stCurrent); MatchStatesCopy(CSop::stCurrent,CSop::stFresh); ASSERT(c!=charOut); MatchStep(1,stripLen-1,CSop::stTemp,c,CSop::stCurrent); p++; } ASSERT(coldp); m_cOldP=coldp; // *** I believe this variable can be changed 'in-place' if(m_Strip[stripLen-1].m_MatchData&CSop::stCurrent) return &p[1]; else return NULL; } void CRegEx::MatchStatesClear(BYTE mask) { int stripLen = m_Strip.GetSize(); for(int tmp=0;tmp0;i--) MatchStep(from,to,CSop::stCurrent,flagc,CSop::stCurrent); } // Now we wonder about word boundaries.. if((flagc==charBOL || (lastc!=charOut && !isWordableChar(lastc))) && (c!=charOut && isWordableChar(c))) flagc=charBOW; if((lastc!=charOut && isWordableChar(lastc)) && (flagc==charEOL || (c!=charOut && !isWordableChar(c)))) flagc=charEOW; if(flagc==charBOW || flagc==charEOW){ MatchStep(from,to,CSop::stCurrent,flagc,CSop::stCurrent); } // Are we done we all wonder?? if(m_Strip[to].m_MatchData&CSop::stCurrent) mp=p; if(MatchStatesEqual(CSop::stCurrent,CSop::stEmpty) || p==end) break; // Again, we're obliged to note break out. We do. // Sucks.. we have to face this character.. MatchStatesCopy(CSop::stTemp,CSop::stCurrent); MatchStatesCopy(CSop::stCurrent,CSop::stEmpty); ASSERT(c!=charOut); MatchStep(from,to,CSop::stTemp,c,CSop::stCurrent); MatchStep(from,to,CSop::stCurrent,charNothing,CSop::stCurrent); p++; } return mp; } LPCTSTR CRegEx::MatchDissect(LPCTSTR begin,LPCTSTR end,int from,int to) { LPCTSTR sp = begin, dp; LPCTSTR stp, rest, tail, ssp, oldssp, sep; int ssub, esub; int es; int i; for(int ss=from;ss=sp); // It did work.. It should've.. } ssub=ss+1; esub=es-1; // Did innards match? if(MatchSlow(sp,rest,ssub,esub)){ dp = MatchDissect(sp,rest,ssub,esub); ASSERT(dp==rest); }else // nope.. ASSERT(sp==rest); sp = rest; break; case CSop::opPlus0: stp=end; for(;;){ // How long could this one be?? rest = MatchSlow(sp,stp,ss,es); ASSERT(rest); // It did.. It should've.. // Could the rest match the rest? tail = MatchSlow(rest,end,es,to); if(tail==end) break; // yup. // nope.. stp=rest-1; ASSERT(stp>=sp); // It should've worked, we believe.. } ssub=ss+1; esub=es-1; ssp=sp; oldssp=ssp; for(;;){ // Find last match of innards.. sep = MatchSlow(ssp,rest,ssub,esub); if((!sep) || sep==ssp) break; // Failed or matched nothin' oldssp=ssp; ssp=sep; } if(!sep){ // Last successfull.. sep=ssp; ssp=oldssp; } ASSERT(sep=rest); // Must exhaust substring they say.. VERIFY(MatchSlow(ssp,sep,ssub,esub)==rest); // Assert or verify - that is the question.. dp = MatchDissect(ssp,sep,ssub,esub); ASSERT(dp==sep); sp=rest; break; case CSop::opChoice0: stp = end; for(;;){ // how long.. rest = MatchSlow(sp,stp,ss,es); ASSERT(rest); // Could it.. tail = MatchSlow(rest,end,es,to); if(tail==end) break; // y // n stp = rest-1; ASSERT(stp>=sp); } ssub=ss+1; esub=ss+m_Strip[ss].m_Operand-1; ASSERT(m_Strip[esub].m_Operator==CSop::opOr0); for(;;){ // Find first matching branch.. if(MatchSlow(sp,rest,ssub,esub)==rest) break; // this one missed, try next.. ASSERT(m_Strip[esub].m_Operator==CSop::opOr0); esub++; ASSERT(m_Strip[esub].m_Operator==CSop::opOr1); ssub=esub+1; esub+=m_Strip[esub].m_Operand; if(m_Strip[esub].m_Operator==CSop::opOr1) esub--; else ASSERT(m_Strip[esub].m_Operator==CSop::opChoice1); } dp = MatchDissect(sp,rest,ssub,esub); ASSERT(dp==rest); sp=rest; break; case CSop::opPlus1: case CSop::opQuest1: case CSop::opOr0: case CSop::opOr1: case CSop::opChoice1: ASSERT(FALSE); break; case CSop::opLeftParen: i = m_Strip[ss].m_Operand; ASSERT(0m_mBegin && !isWordableChar(*(sp-1)))) && (spm_mBegin && isWordableChar(*(sp-1))))) return NULL; break; case CSop::opQuest1: break; case CSop::opOr0: // Matches null, but needs to skip ss++; s = m_Strip[ss]; do{ ASSERT(s.m_Operator==CSop::opOr1); ss+=s.m_Operand; }while((s=m_Strip[ss]).m_Operator!=CSop::opChoice1); // Now we should note that ss++ gets us past the Choice1.. break; default: // Have to make a choice.. hard=TRUE; break; } } if(!hard){ // That was it.. if(sp!=end) return NULL; return sp; } ss--; // Adjust for ther for's final increment.. // Hard stuff.. is going on and on.. CSop s = m_Strip[ss]; int i, len, offsave; int ssub,esub; LPCTSTR ssp, dp; switch(s.m_Operator){ case CSop::opBackRef0: // The vilest depths they say.. If I only knew what the 'viles' stands for.. i = s.m_Operand; ASSERT(0=0); len = m_Matches[i].GetLength(); ASSERT((end-m_mBegin)>=len); if(sp>end-len) return NULL; // Not enough left to match.. ssp = m_mBegin+m_Matches[i].m_Begin; if(memcmp(sp,ssp,len)) return NULL; while(m_Strip[ss]!=CSop(CSop::opBackRef1,i)) ss++; return MatchBackRef(sp+len,end,ss+1,to,level-1); break; case CSop::opQuest0: // to null or not they wonder.. dp = MatchBackRef(sp,end,ss+1,to,level); if(dp) return dp; // not.. return MatchBackRef(sp,end,ss+s.m_Operand+1,to,level-1); break; case CSop::opPlus0: ASSERT(m_mLastPos.GetSize()); ASSERT(level+1 <= m_Pluses); m_mLastPos[level+1]=sp; return MatchBackRef(sp,end,ss+1,to,level+1); break; case CSop::opPlus1: if(sp == m_mLastPos[level]) // Last pass matched null return MatchBackRef(sp,end,ss+1,to,level-1); // Try another pass.. m_mLastPos[level]=sp; dp = MatchBackRef(sp,end,ss-s.m_Operand+1,to,level); if(dp) return dp; return MatchBackRef(sp,end,ss+1,to,level-1); break; case CSop::opChoice0: // find the right one, ifany ssub = ss+1; esub = ss+s.m_Operand-1; ASSERT(m_Strip[esub].m_Operator==CSop::opOr0); for(;;){ // Find first matching branch. dp = MatchBackRef(sp,end,ssub,esub,level); if(dp) return dp; // This one missed, try next one.. if(m_Strip[esub].m_Operator==CSop::opChoice1) return NULL; // There is none.. esub++; ASSERT(m_Strip[esub].m_Operator==CSop::opOr1); ssub=esub+1; esub+=m_Strip[esub].m_Operand; if(m_Strip[esub].m_Operator==CSop::opOr1) esub--; else ASSERT(m_Strip[esub].m_Operator==CSop::opChoice1); } break; case CSop::opLeftParen: // Must undo assignment if rest fails.. i = s.m_Operand; ASSERT(0= m_Matches.GetSize()) return rv; if(m_Matches[match].m_Begin<0 || m_Matches[match].m_End<0) return rv; ASSERT(m_Matches[match].m_Begin