From a1487b3fc0313408525cd5b2f3bc4a462df351f7 Mon Sep 17 00:00:00 2001 From: Michael Krelin Date: Mon, 05 Jul 2004 01:53:09 +0000 Subject: initial commit into svn repository git-svn-id: http://svn.klever.net/kin/klog/trunk@1 fe716a7a-6dde-0310-88d9-d003556173a8 --- (limited to 'shared-code/RegEx.cpp') diff --git a/shared-code/RegEx.cpp b/shared-code/RegEx.cpp new file mode 100644 index 0000000..b7bab62 --- a/dev/null +++ b/shared-code/RegEx.cpp @@ -0,0 +1,1697 @@ +#include "../stdafx.h" +#include "RegEx.h" + +#define isWordableChar(c) (isalnum(c) || (c)=='_') + +BOOL CRegEx::Compile(LPCTSTR regex,int flags) +{ + ASSERT(!((flags®Extended) && (flags®Literal))); + m_Matches.RemoveAll(); + m_Strip.RemoveAll(); + m_Strip.SetSize(0,15); + m_Pattern=regex; + m_ParsePointer=0; + m_Error=0; + m_Sets.RemoveAll(); + m_Flags=flags; + m_iFlags=0; + m_BOLs=m_EOLs=0; + m_Subexps = 0; + m_Categories=1; // 0 is 'everything else' + m_bBackRefs=FALSE; + memset(m_Category,0,sizeof(m_Category)); + + // Go ahead. + m_Error || m_Strip.Add(CSop(CSop::opEnd)); + if(flags®Extended){ + ParseERE(); + }else if(flags®Literal){ + ParseLiteral(); + }else{ + ParseBRE(); + } + m_Error || m_Strip.Add(CSop(CSop::opEnd)); + Categorize(); + m_Strip.FreeExtra(); + FigureMust(); + m_Pluses=CountPluses(); + if(m_iFlags&iflagsBad){ + m_Error || (m_Error=regeAssert); + // ??? point to nuls? ;-) + } + // We may wish to free some memory here if we're erroneous (ie. m_Error..) + m_ParseParens.RemoveAll(); +#ifdef _DEBUG + if(m_Error){ + CString tmp; + tmp.Format("RE: ParseError: %d\n",m_Error); + TRACE0(tmp); + } +// DumpStrip(afxDump); +#endif + return (m_bCompiled=(!m_Error)); +} + +BOOL CRegEx::Match(LPCTSTR src,int flags) +{ + if(!m_bCompiled) + return FALSE; + if(m_iFlags&iflagsBad) + return FALSE; + m_Input=src; + m_mFlags=flags; + m_mPointer=m_Input; + m_mBegin=m_Input; + m_mEnd=&m_mBegin[m_Input.GetLength()]; + ASSERT(m_mPointer<=m_mEnd); + m_Matches.RemoveAll(); + if(!m_Must.IsEmpty()){ + if(m_Input.Find(m_Must)<0) + return FALSE; + } + // Go ahead.. +int stripLen = m_Strip.GetSize(); + m_mLastPos.SetSize(0); + for(int tmp=0;tmp0 && !m_mLastPos.GetSize()) + m_mLastPos.SetSize(m_Pluses); + dp = MatchBackRef(m_cOldP,endp,1,stripLen-1,0); + } + if(dp) + break; + // Uh.. oh.. we couldn't find a subexpression-level match + ASSERT(m_bBackRefs); + ASSERT(m_Pluses==0 || m_mLastPos.GetSize()); + for(;;){ + if(dp || endp <= m_cOldP) + break; // defeat.. ? + endp = MatchSlow(m_cOldP,endp-1,1,stripLen-1); + if(!endp) + break; // defeat.. ? + // Try it on a shorter possibility.. +#ifdef _DEBUG + for(tmp=1;tmp<=m_Subexps;tmp++) + ASSERT(m_Matches[tmp].m_Begin<0 && m_Matches[tmp].m_End<0); +#endif + dp = MatchBackRef(m_cOldP,endp,1,stripLen-1,0); + } + ASSERT((!dp) || dp==endp); + if(dp) // Found a shorter one.. + break; + // Despite initial appearances, there is no match here + beginp = m_cOldP+1; + ASSERT(beginp<=m_mEnd); + } + // Fill in the detail if so requested.. + if(!(m_mFlags®NoSubExpressions)){ + if(!m_Matches.GetSize()) + m_Matches.SetSize(1); + m_Matches[0].m_Begin=m_cOldP-m_mBegin; + m_Matches[0].m_End=endp-m_mBegin; + } + m_mLastPos.RemoveAll(); + return TRUE; +} + +CString CRegEx::Replace(LPCTSTR src,LPCTSTR rep,int flags) +{ + // *** + return CString(); +} + +void CRegEx::ParseERE(int stop) +{ +UCHAR c; +BOOL first=TRUE; +int prevF, prevB; + for(;;){ + int co = m_Strip.GetSize(); + while(m_ParsePointer < m_Pattern.GetLength() && ((c=m_Pattern[m_ParsePointer])!='|') && c!=stop) + ParseEREexp(); + if(m_Strip.GetSize()==co){ + m_Error || (m_Error=regeEmpty); + // ??? point to nuls? + } + if(m_ParsePointer>=m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!='|') + break; + else + m_ParsePointer++; + if(first){ + StripInsert(co,CSop(CSop::opChoice0,m_Strip.GetSize()-co+1)); + prevF = prevB = co; + first=FALSE; + } + m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-prevB)); + prevB = m_Strip.GetSize()-1; + m_Error || (m_Strip[prevF].m_Operand=m_Strip.GetSize()-prevF); + prevF = m_Strip.GetSize(); + m_Error || m_Strip.Add(CSop(CSop::opOr1,0)); // offset isn't really right.. very so.. + } + if(!first){ + m_Error || (m_Strip[prevF].m_Operand=m_Strip.GetSize()-prevF); + m_Error || m_Strip.Add(CSop(CSop::opChoice1,m_Strip.GetSize()-prevB)); + } + ASSERT(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer]==stop); +} + +void CRegEx::ParseEREexp() +{ + ASSERT(m_ParsePointer < m_Pattern.GetLength()); +UCHAR c = m_Pattern[m_ParsePointer++]; +int pos = m_Strip.GetSize(); +int subno; +int count, count2; +BOOL wascaret=FALSE; + switch(c){ + case '(': + if(!(m_ParsePointer=m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!=')') + ParseERE(')'); + VERIFY(m_ParseParens[m_Subexps].m_End = m_Strip.GetSize()); + m_Error || m_Strip.Add(CSop(CSop::opRightParen,subno)); + if(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer++]!=')'){ + TRACE0("RE: No matching ')'\n"); + if(!m_Error) + m_Error = regeParen; + // ??? point to nuls? + } + break; + case '^': + m_Error || m_Strip.Add(CSop(CSop::opBOL)); + m_iFlags|=iflagsUseBOL; + m_BOLs++; + wascaret=TRUE; + break; + case '$': + m_Error || m_Strip.Add(CSop(CSop::opEOL)); + m_iFlags|=iflagsUseEOL; + m_EOLs++; + break; + case '|': + TRACE0("RE: '|' outside of expression\n"); + if(!m_Error) + m_Error = regeEmpty; + // ??? point to nuls? + break; + case '*': + case '+': + case '?': + TRACE0("RE: '*'/'+'/'?' with no previous expression\n"); + if(!m_Error) + m_Error = regeBadRepeat; + // ??? point to nuls? + break; + case '.': + if(m_Flags®NewLine) + EmitNonNewLineAny(); + else + m_Error || m_Strip.Add(CSop(CSop::opAny)); + break; + case '[': + ParseBracket(); + break; + case '\\': + if(m_ParsePointer >= m_Pattern.GetLength()){ + TRACE0("RE: '\\' at the end of the pattern\n"); + if(!m_Error) + m_Error = regeEscape; + // ??? point to nuls? + }else{ + c = m_Pattern[m_ParsePointer++]; + EmitOrdinary(c); + } + break; + case '{': + if(m_ParsePointer >= m_Pattern.GetLength() || !isdigit(m_Pattern[m_ParsePointer])){ + TRACE0("RE: '{' with no repeat count\n"); + if(!m_Error) + m_Error = regeBadRepeat; + // ??? point to nuls? + } + // Fallthrough.. + default: + EmitOrdinary(c); + break; + } + if(m_ParsePointer >= m_Pattern.GetLength()) + return; + c = m_Pattern[m_ParsePointer]; + // Call a '{' repetition if followed by a digit + if (!(c=='*' || c=='+' || c=='?' || ( c=='{' && (m_ParsePointer+1) < m_Pattern.GetLength() && isdigit(m_Pattern[m_ParsePointer+1])) )) + return; // No repetitor - done. + m_ParsePointer++; + if(wascaret){ + TRACE0("RE: repetitive '^' detected\n"); + if(!m_Error) + m_Error = regeBadRepeat; + // ??? point to nuls? + } + switch(c){ + case '*': // Implemeted as +? + // + expression + StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1)); + m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos)); + // ? expression + StripInsert(pos,CSop(CSop::opQuest0,m_Strip.GetSize()-pos+1)); + m_Error || m_Strip.Add(CSop(CSop::opQuest1,m_Strip.GetSize()-pos)); + break; + case '+': + // + expression + StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1)); + m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos)); + break; + case '?': + // Kludge - emit y? as (y|) until subtle bug gets fixed :-) + StripInsert(pos,CSop(CSop::opChoice0,m_Strip.GetSize()-pos+1)); + m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-pos)); + m_Error || (m_Strip[pos].m_Operand=m_Strip.GetSize()-pos); + m_Error || m_Strip.Add(CSop(CSop::opOr1,1)); + m_Error || m_Strip.Add(CSop(CSop::opChoice1,2)); + break; + case '{': + count = ParseCount(); + if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]==','){ + m_ParsePointer++; + if(isdigit(m_Pattern[m_ParsePointer])){ // HHH Personally, I doubt it is always available + count2=ParseCount(); + if(!(count<=count2)){ + TRACE0("RE: Disbalanced counts in '{}' repeater\n"); + m_Error || (m_Error=regeBadBrace); + // ??? point to nuls? + } + }else // Single number with comma + count2=256; // Infinity + }else // Single number + count2=count; + EmitRepeat(pos,count,count2); + if(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!='}'){ + // No '}'.. + TRACE0("RE: No immediately following '}' of '{' expression\n"); + while(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]!='}') + m_ParsePointer++; + if(m_ParsePointer >= m_Pattern.GetLength()){ + TRACE0("RE: No closing '}' found\n"); + m_Error || (m_Error=regeBrace); + }else + m_Error || (m_Error=regeBadBrace); + // ??? point to nuls? + }else + m_ParsePointer++; + break; + } + if(m_ParsePointer >= m_Pattern.GetLength()) + return; + c = m_Pattern[m_ParsePointer]; + if(!(c=='*' || c=='+' || c=='?' || (c=='{' && (m_ParsePointer+1)=pos) + m_ParseParens[tmp].m_Begin++; + if(m_ParseParens[tmp].m_End>=pos) + m_ParseParens[tmp].m_End++; + } +} + +void CRegEx::EmitOrdinary(UCHAR c) +{ + if(m_Flags®IgnoreCase && isalpha(c) && (tolower(c) !=toupper(c))){ + // Emit both cases + CString savePattern = m_Pattern; + int savePointer = m_ParsePointer; + m_Pattern=c; + m_Pattern+=']'; + m_ParsePointer=0; + ParseBracket(); + m_Pattern=savePattern; + m_ParsePointer=savePointer; + }else{ + m_Error || m_Strip.Add(CSop(CSop::opChar,c)); + if(!m_Category[(BYTE)c]) + m_Category[(BYTE)c]=m_Categories++; + } +} + +void CRegEx::EmitNonNewLineAny() +{ + // Kludges're going on and on.. +CString savePattern = m_Pattern; +int savePointer = m_ParsePointer; + m_Pattern="^\n]"; + m_ParsePointer=0; + ParseBracket(); + m_Pattern=savePattern; + m_ParsePointer=savePointer; +} + +int CRegEx::ParseCount() +{ +BOOL nonEmpty=FALSE; +int rv = 0; +UCHAR c; + while(m_ParsePointer < m_Pattern.GetLength() && isdigit(c=m_Pattern[m_ParsePointer]) && rv <=255){ + rv = rv*10 + c-'0'; + nonEmpty=TRUE; + m_ParsePointer++; + } + if(rv>255 || !nonEmpty){ + m_Error || (m_Error=regeBadBrace); + // ??? point to nuls? + } + return rv; +} + +void CRegEx::ParseBracket() +{ + // Dept. of truly sickening special case kludges + if((m_ParsePointer+5) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,6).Compare("[:<]]")){ + m_Error || m_Strip.Add(CSop(CSop::opBOW)); + m_ParsePointer+=6; + return; + } + if((m_ParsePointer+5) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,6).Compare("[:>]]")){ + m_Error || m_Strip.Add(CSop(CSop::opEOW)); + m_ParsePointer+=6; + return; + } +BOOL invert=TRUE; + if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='^') + m_ParsePointer++; + else + invert=FALSE; +CSet cset; + if(m_ParsePointer < m_Pattern.GetLength()){ + switch(m_Pattern[m_ParsePointer]){ + case ']': + case '-': + cset.Add(m_Pattern[m_ParsePointer]); + m_ParsePointer++; + break; + } + } + while(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]!=']' && !((m_ParsePointer+1)=0;tmp--){ + if(cset.IsIn(tmp) && isalpha(tmp) && (toupper(tmp)!=tolower(tmp))) + cset.Add(isupper(tmp)?tolower(tmp):toupper(tmp)); + } + /* + if(!cset->m_Multis.IsEmpty()) + cset.CollatingCase(); + */ + } + if(invert){ + for(int tmp=CSet::size-1;tmp>=0;tmp--) + if(cset.IsIn(tmp)) + cset.Sub(tmp); + else + cset.Add(tmp); + if(m_Flags®NewLine) + cset.Sub('\n'); + /* + if(!cset.m_Multis.IsEmpty()) + cset.CollatingInvert(); + */ + } +UCHAR c = cset.GetOnly(); + if(c){ + EmitOrdinary(c); + }else + m_Error || m_Strip.Add(CSop(CSop::opAnyOf,StoreSet(cset))); +} + +void CRegEx::CSet::Add(UCHAR c) +{ + m_Set[(BYTE)c]=TRUE; + m_Hash+=c; +} + +BOOL CRegEx::CSet::IsIn(UCHAR c) +{ + return m_Set[(BYTE)c]; +} + +void CRegEx::CSet::Sub(UCHAR c) +{ + m_Set[(BYTE)c]=FALSE; + m_Hash-=c; +} + +UCHAR CRegEx::CSet::GetOnly() +{ +int rv = 0; +UCHAR only = 0; + for(int tmp=0;tmp=m_Pattern.GetLength()){ + m_Error || (m_Error=regeBracket); + // ??? point to nuls? + } + c = m_Pattern[m_ParsePointer]; + if(c== '-' || c==']'){ + m_Error || (m_Error=regeCType); + // ??? point to nuls? + } + ParseBracketCClass(cset); + if(m_ParsePointer>=m_Pattern.GetLength()){ + m_Error || (m_Error=regeBracket); + // ??? point to nuls? + } + if((m_ParsePointer+1)>=m_Pattern.GetLength() || (m_Pattern.Mid(m_ParsePointer,2).Compare(":]"))){ + m_Error || (m_Error=regeCType); + // ??? point to nuls? + }else + m_ParsePointer+=2; + break; + case '=': // Equivalence class + m_ParsePointer+=2; + if(m_ParsePointer >= m_Pattern.GetLength()){ + m_Error || (m_Error=regeBracket); + // ??? point to nuls? + } + c = m_Pattern[m_ParsePointer]; + if(c== '-' || c==']'){ + m_Error || (m_Error=regeCollate); + // ??? point to nuls? + } + ParseBracketEClass(cset); + if((m_ParsePointer+1)>=m_Pattern.GetLength() || (m_Pattern.Mid(m_ParsePointer,2).Compare("=]"))){ + m_Error || (m_Error=regeCollate); + // ??? point to nuls? + }else + m_ParsePointer+=2; + break; + default: // Symbol, character or range + { + UCHAR start, finish; + start = ParseBracketSymbol(); + if((m_ParsePointer((BYTE)finish)){ + m_Error || (m_Error=regeRange); + // ??? point to nuls? + } + for(int tmp=start;tmp<=(BYTE)finish;tmp++) + cset.Add(tmp); + } + break; + } +} + +void CRegEx::ParseBracketCClass(CSet& cset) +{ +static struct { + char *className; + char *classChars; +} cc[] = { + {"alnum","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"}, + {"alpha","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"}, + {"blank"," \t"}, + {"cntrl","\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\25\26\27\30\31\32\33\34\35\36\37\177"}, + {"digit","0123456789"}, + {"graph","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"}, + {"lower","abcdefghijklmnopqrstuvwxyz"}, + {"print","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ "}, + {"punct","!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"}, + {"space","\t\n\v\f\r "}, + {"upper","ABCDEFGHIJKLMNOPQRSTUVWXYZ"}, + {"xdigit","0123456789ABCDEFabcdef"} +}; +CString cclass; +UCHAR c; + while(m_ParsePointer < m_Pattern.GetLength() && isalpha(c=m_Pattern[m_ParsePointer])){ + cclass+=c; + m_ParsePointer++; + } +char *classChars = NULL; + for(int tmp=0;tmp<(sizeof(cc)/sizeof(cc[0]));tmp++){ + if(!cclass.CompareNoCase(cc[tmp].className)){ + classChars=cc[tmp].classChars; + break; + } + } + if(!classChars){ + m_Error || (m_Error=regeCType); + // ??? point to nuls? + return; + } + while(*classChars) + cset.Add(*(classChars++)); + // --- multis +} + +void CRegEx::ParseBracketEClass(CSet& cset) +{ + cset.Add(ParseBracketCollatingElement('='));; +} + +UCHAR CRegEx::ParseBracketCollatingElement(UCHAR term) +{ +static struct { + char *entityName; + char entity; +} cc[] = { {"NUL",'\0'},{"SOH",'\001'},{"STX",'\002'},{"ETX",'\003'},{"EOT",'\004'},{"ENQ",'\005'},{"ACK",'\006'},{"BEL",'\007'},{"alert",'\007'},{"BS",'\010'},{"backspace",'\b'},{"HT",'\011'},{"tab",'\t'},{"LF",'\012'},{"newline",'\n'},{"VT",'\013'},{"vertical-tab",'\v'},{"FF",'\014'},{"form-feed",'\f'},{"CR",'\015'},{"carriage-return",'\r'},{"SO",'\016'},{"SI",'\017'},{"DLE",'\020'},{"DC1",'\021'},{"DC2",'\022'},{"DC3",'\023'},{"DC4",'\024'},{"NAK",'\025'},{"SYN",'\026'},{"ETB",'\027'},{"CAN",'\030'},{"EM",'\031'},{"SUB",'\032'},{"ESC",'\033'},{"IS4",'\034'},{"FS",'\034'},{"IS3",'\035'},{"GS",'\035'},{"IS2",'\036'},{"RS",'\036'},{"IS1",'\037'},{"US",'\037'},{"space",' '},{"exclamation-mark",'!'},{"quotation-mark",'"'},{"number-sign",'#'},{"dollar-sign",'$'},{"percent-sign",'%'},{"ampersand",'&'},{"apostrophe",'\''},{"left-parenthesis",'('},{"right-parenthesis",')'},{"asterisk",'*'},{"plus-sign",'+'},{"comma",','},{"hyphen",'-'},{"hyphen-minus",'-'},{"period",'.'},{"full-stop",'.'},{"slash",'/'},{"solidus",'/'},{"zero",'0'},{"one",'1'},{"two",'2'},{"three",'3'},{"four",'4'},{"five",'5'},{"six",'6'},{"seven",'7'},{"eight",'8'},{"nine",'9'},{"colon",':'},{"semicolon",';'},{"less-than-sign",'<'},{"equals-sign",'='},{"greater-than-sign",'>'},{"question-mark",'?'},{"commercial-at",'@'},{"left-square-bracket",'['},{"backslash",'\\'},{"reverse-solidus",'\\'},{"right-square-bracket",']'},{"circumflex",'^'},{"circumflex-accent",'^'},{"underscore",'_'},{"low-line",'_'},{"grave-accent",'`'},{"left-brace",'{'},{"left-curly-bracket",'{'},{"vertical-line",'|'},{"right-brace",'}'},{"right-curly-bracket",'}'},{"tilde",'~'},{"DEL",'\177'} }; +CString seeTwo; + seeTwo=term; + seeTwo+=']'; +CString entityName; + while(m_ParsePointer=m_Pattern.GetLength()){ + m_Error || (m_Error=regeBracket); + // ??? point to nuls? + return 0; + } + for(int tmp=0;tmp<(sizeof(cc)/sizeof(cc[0]));tmp++) + if(!entityName.CompareNoCase(cc[tmp].entityName)) + return cc[tmp].entity; + if(entityName.GetLength()==1) + return entityName[0]; + m_Error || (m_Error=regeCollate); + // ??? point to nuls? + return 0; +} + +UCHAR CRegEx::ParseBracketSymbol() +{ + if(m_ParsePointer>=m_Pattern.GetLength()){ + m_Error || (m_Error=regeBracket); + // ??? point to nuls? + } + if((m_ParsePointer+1)mustLength){ + mustStart=seqStart; + mustLength=seqLength; + } + seqLength=0; + break; + } + } // Hmm.. originally it's meant to be do while not opEnd.. + if(!mustLength) + return; + // Turn into string, but, wait, personally I'm sure it could be put in the main loop.. or maybe not.. + for(tmp=0;tmprv) + rv = nest; + nest--; + break; + } + } // Again, originally we were supposed to scan till opEnd.. + if(nest) + m_iFlags|=iflagsBad; // Could this be an ASSERTion? + return rv; +} + +void CRegEx::ParseLiteral() +{ + if(!m_Pattern.GetLength()){ + m_Error || (m_Error=regeEmpty); + // ??? point to nuls? + } + while(m_ParsePointer < m_Pattern.GetLength()) + EmitOrdinary(m_Pattern[m_ParsePointer++]); +} + +void CRegEx::ParseBRE(int stopa,int stopb) +{ +int start = m_Strip.GetSize(); +BOOL first=TRUE; +BOOL wasDollar=FALSE; + if(m_ParsePointer=m_Pattern.GetLength()){ + m_Error || (m_Error=regeEscape); + // ??? point to nuls + }else + c = 0x100|m_Pattern[m_ParsePointer++]; + } + switch(c){ + case '.': + if(m_Flags®NewLine) + EmitNonNewLineAny(); + else + m_Error || m_Strip.Add(CSop(CSop::opAny,0)); + break; + case '[': + ParseBracket(); + break; + case 0x100|'{': + m_Error || (m_Error=regeBadRepeat); + // ??? point to nuls? + break; + case 0x100|'(': + m_Subexps++; + subno=m_Subexps; + m_ParseParens.SetAtGrow(m_Subexps,CParenthesis(m_Strip.GetSize())); + m_Error || m_Strip.Add(CSop(CSop::opLeftParen,subno)); + if(m_ParsePointercount2){ + m_Error || (m_Error=regeBadBrace); + // ??? poin to nuls? + } + }else // Single number with comma + count2=256; + }else // Single number + count2=count; + EmitRepeat(pos,count,count2); + if((m_ParsePointer+1)>=m_Pattern.GetLength() || m_Pattern.Mid(m_ParsePointer,2).Compare("\\}")){ + while(m_ParsePointer=m_Pattern.GetLength()){ + m_Error || (m_Error=regeBrace); + // ??? point to nuls? + } + m_Error || (m_Error=regeBadBrace); + }else + m_ParsePointer+=2; + }else if(c=='$') + return TRUE; + return FALSE; +} + +CRegEx::CRegEx() +{ + m_bCompiled=FALSE; +} + +LPCTSTR CRegEx::MatchFast(LPCTSTR begin) +{ + MatchStatesClear(CSop::stCurrent); + m_Strip[1].m_MatchData|=CSop::stCurrent; +int stripLen = m_Strip.GetSize(); + MatchStep(1,stripLen-1,CSop::stCurrent,charNothing,CSop::stCurrent); + MatchStatesCopy(CSop::stFresh,CSop::stCurrent); +LPCTSTR coldp = NULL; +LPCTSTR p = begin; +int c = (begin==m_mBegin)?charOut:((int)(BYTE)m_mPointer[-1]); + for(;;){ + // next character.. + int lastc = c; + c = (p==m_mEnd)?charOut:(int)*(BYTE*)p; + if(MatchStatesEqual(CSop::stCurrent,CSop::stFresh)) + coldp=p; + // Is there an EOL and/or BOL between lastc and c? - they ask.. + int flagc=0; + int i = 0; + if((lastc=='\n' && m_Flags®NewLine) || (lastc==charOut && !(m_mFlags®NotBOL))){ + flagc=charBOL; + i=m_BOLs; + } + if((c=='\n' && m_Flags®NewLine) || (c==charOut && !(m_mFlags®NotEOL))){ + flagc=(flagc==charBOL)?charBOLEOL:charEOL; + i+=m_EOLs; + } + if(i){ + for(;i>0;i--) + MatchStep(1,stripLen-1,CSop::stCurrent,flagc,CSop::stCurrent); + } + // What about a word boundary? - they wonder.. + if((flagc==charBOL || (lastc!=charOut && !isWordableChar(c))) && (c!=charOut && isWordableChar(c))) + flagc = charBOW; + if((lastc!=charOut && isWordableChar(lastc)) && (flagc==charEOL || (c!=charOut && !isWordableChar(c)))) + flagc = charEOW; + if(flagc==charBOW || flagc==charEOW){ + MatchStep(1,stripLen-1,CSop::stCurrent,flagc,CSop::stCurrent); + } + // Are we done? Now WE wonder.. + if((m_Strip[stripLen-1].m_MatchData&CSop::stCurrent) || p==m_mEnd) + break; // They insist I need to note break out.. Okay, I do.. + // Nope, we're not done. We have to face this character.. + MatchStatesCopy(CSop::stTemp,CSop::stCurrent); + MatchStatesCopy(CSop::stCurrent,CSop::stFresh); + ASSERT(c!=charOut); + MatchStep(1,stripLen-1,CSop::stTemp,c,CSop::stCurrent); + p++; + } + ASSERT(coldp); + m_cOldP=coldp; // *** I believe this variable can be changed 'in-place' + if(m_Strip[stripLen-1].m_MatchData&CSop::stCurrent) + return &p[1]; + else + return NULL; +} + +void CRegEx::MatchStatesClear(BYTE mask) +{ +int stripLen = m_Strip.GetSize(); + for(int tmp=0;tmp0;i--) + MatchStep(from,to,CSop::stCurrent,flagc,CSop::stCurrent); + } + // Now we wonder about word boundaries.. + if((flagc==charBOL || (lastc!=charOut && !isWordableChar(lastc))) && (c!=charOut && isWordableChar(c))) + flagc=charBOW; + if((lastc!=charOut && isWordableChar(lastc)) && (flagc==charEOL || (c!=charOut && !isWordableChar(c)))) + flagc=charEOW; + if(flagc==charBOW || flagc==charEOW){ + MatchStep(from,to,CSop::stCurrent,flagc,CSop::stCurrent); + } + // Are we done we all wonder?? + if(m_Strip[to].m_MatchData&CSop::stCurrent) + mp=p; + if(MatchStatesEqual(CSop::stCurrent,CSop::stEmpty) || p==end) + break; // Again, we're obliged to note break out. We do. + // Sucks.. we have to face this character.. + MatchStatesCopy(CSop::stTemp,CSop::stCurrent); + MatchStatesCopy(CSop::stCurrent,CSop::stEmpty); + ASSERT(c!=charOut); + MatchStep(from,to,CSop::stTemp,c,CSop::stCurrent); + MatchStep(from,to,CSop::stCurrent,charNothing,CSop::stCurrent); + p++; + } + return mp; +} + +LPCTSTR CRegEx::MatchDissect(LPCTSTR begin,LPCTSTR end,int from,int to) +{ +LPCTSTR sp = begin, dp; +LPCTSTR stp, rest, tail, ssp, oldssp, sep; +int ssub, esub; +int es; +int i; + for(int ss=from;ss=sp); // It did work.. It should've.. + } + ssub=ss+1; + esub=es-1; + // Did innards match? + if(MatchSlow(sp,rest,ssub,esub)){ + dp = MatchDissect(sp,rest,ssub,esub); + ASSERT(dp==rest); + }else // nope.. + ASSERT(sp==rest); + sp = rest; + break; + case CSop::opPlus0: + stp=end; + for(;;){ + // How long could this one be?? + rest = MatchSlow(sp,stp,ss,es); + ASSERT(rest); // It did.. It should've.. + // Could the rest match the rest? + tail = MatchSlow(rest,end,es,to); + if(tail==end) + break; // yup. + // nope.. + stp=rest-1; + ASSERT(stp>=sp); // It should've worked, we believe.. + } + ssub=ss+1; + esub=es-1; + ssp=sp; + oldssp=ssp; + for(;;){ // Find last match of innards.. + sep = MatchSlow(ssp,rest,ssub,esub); + if((!sep) || sep==ssp) + break; // Failed or matched nothin' + oldssp=ssp; + ssp=sep; + } + if(!sep){ + // Last successfull.. + sep=ssp; + ssp=oldssp; + } + ASSERT(sep=rest); // Must exhaust substring they say.. + VERIFY(MatchSlow(ssp,sep,ssub,esub)==rest); // Assert or verify - that is the question.. + dp = MatchDissect(ssp,sep,ssub,esub); + ASSERT(dp==sep); + sp=rest; + break; + case CSop::opChoice0: + stp = end; + for(;;){ + // how long.. + rest = MatchSlow(sp,stp,ss,es); + ASSERT(rest); + // Could it.. + tail = MatchSlow(rest,end,es,to); + if(tail==end) + break; // y + // n + stp = rest-1; + ASSERT(stp>=sp); + } + ssub=ss+1; + esub=ss+m_Strip[ss].m_Operand-1; + ASSERT(m_Strip[esub].m_Operator==CSop::opOr0); + for(;;){ // Find first matching branch.. + if(MatchSlow(sp,rest,ssub,esub)==rest) + break; + // this one missed, try next.. + ASSERT(m_Strip[esub].m_Operator==CSop::opOr0); + esub++; + ASSERT(m_Strip[esub].m_Operator==CSop::opOr1); + ssub=esub+1; + esub+=m_Strip[esub].m_Operand; + if(m_Strip[esub].m_Operator==CSop::opOr1) + esub--; + else + ASSERT(m_Strip[esub].m_Operator==CSop::opChoice1); + } + dp = MatchDissect(sp,rest,ssub,esub); + ASSERT(dp==rest); + sp=rest; + break; + case CSop::opPlus1: + case CSop::opQuest1: + case CSop::opOr0: + case CSop::opOr1: + case CSop::opChoice1: + ASSERT(FALSE); + break; + case CSop::opLeftParen: + i = m_Strip[ss].m_Operand; + ASSERT(0m_mBegin && !isWordableChar(*(sp-1)))) && (spm_mBegin && isWordableChar(*(sp-1))))) + return NULL; + break; + case CSop::opQuest1: + break; + case CSop::opOr0: // Matches null, but needs to skip + ss++; + s = m_Strip[ss]; + do{ + ASSERT(s.m_Operator==CSop::opOr1); + ss+=s.m_Operand; + }while((s=m_Strip[ss]).m_Operator!=CSop::opChoice1); + // Now we should note that ss++ gets us past the Choice1.. + break; + default: + // Have to make a choice.. + hard=TRUE; + break; + } + } + if(!hard){ // That was it.. + if(sp!=end) + return NULL; + return sp; + } + ss--; // Adjust for ther for's final increment.. + // Hard stuff.. is going on and on.. +CSop s = m_Strip[ss]; +int i, len, offsave; +int ssub,esub; +LPCTSTR ssp, dp; + switch(s.m_Operator){ + case CSop::opBackRef0: // The vilest depths they say.. If I only knew what the 'viles' stands for.. + i = s.m_Operand; + ASSERT(0=0); + len = m_Matches[i].GetLength(); + ASSERT((end-m_mBegin)>=len); + if(sp>end-len) + return NULL; // Not enough left to match.. + ssp = m_mBegin+m_Matches[i].m_Begin; + if(memcmp(sp,ssp,len)) + return NULL; + while(m_Strip[ss]!=CSop(CSop::opBackRef1,i)) + ss++; + return MatchBackRef(sp+len,end,ss+1,to,level-1); + break; + case CSop::opQuest0: // to null or not they wonder.. + dp = MatchBackRef(sp,end,ss+1,to,level); + if(dp) + return dp; // not.. + return MatchBackRef(sp,end,ss+s.m_Operand+1,to,level-1); + break; + case CSop::opPlus0: + ASSERT(m_mLastPos.GetSize()); + ASSERT(level+1 <= m_Pluses); + m_mLastPos[level+1]=sp; + return MatchBackRef(sp,end,ss+1,to,level+1); + break; + case CSop::opPlus1: + if(sp == m_mLastPos[level]) // Last pass matched null + return MatchBackRef(sp,end,ss+1,to,level-1); + // Try another pass.. + m_mLastPos[level]=sp; + dp = MatchBackRef(sp,end,ss-s.m_Operand+1,to,level); + if(dp) + return dp; + return MatchBackRef(sp,end,ss+1,to,level-1); + break; + case CSop::opChoice0: // find the right one, ifany + ssub = ss+1; + esub = ss+s.m_Operand-1; + ASSERT(m_Strip[esub].m_Operator==CSop::opOr0); + for(;;){ // Find first matching branch. + dp = MatchBackRef(sp,end,ssub,esub,level); + if(dp) + return dp; + // This one missed, try next one.. + if(m_Strip[esub].m_Operator==CSop::opChoice1) + return NULL; // There is none.. + esub++; + ASSERT(m_Strip[esub].m_Operator==CSop::opOr1); + ssub=esub+1; + esub+=m_Strip[esub].m_Operand; + if(m_Strip[esub].m_Operator==CSop::opOr1) + esub--; + else + ASSERT(m_Strip[esub].m_Operator==CSop::opChoice1); + } + break; + case CSop::opLeftParen: // Must undo assignment if rest fails.. + i = s.m_Operand; + ASSERT(0= m_Matches.GetSize()) + return rv; + if(m_Matches[match].m_Begin<0 || m_Matches[match].m_End<0) + return rv; + ASSERT(m_Matches[match].m_Begin