author | Michael Krelin <hacker@klever.net> | 2004-07-05 01:53:09 (UTC) |
---|---|---|
committer | Michael Krelin <hacker@klever.net> | 2004-07-05 01:53:09 (UTC) |
commit | a1487b3fc0313408525cd5b2f3bc4a462df351f7 (patch) (side-by-side diff) | |
tree | 010aec6c2abd5ada2f909ac6798c93bf22508bf5 /shared-code/RegEx.cpp | |
download | klog-a1487b3fc0313408525cd5b2f3bc4a462df351f7.zip klog-a1487b3fc0313408525cd5b2f3bc4a462df351f7.tar.gz klog-a1487b3fc0313408525cd5b2f3bc4a462df351f7.tar.bz2 |
initial commit into svn repository
git-svn-id: http://svn.klever.net/kin/klog/trunk@1 fe716a7a-6dde-0310-88d9-d003556173a8
-rw-r--r-- | shared-code/RegEx.cpp | 1697 |
1 files changed, 1697 insertions, 0 deletions
diff --git a/shared-code/RegEx.cpp b/shared-code/RegEx.cpp new file mode 100644 index 0000000..b7bab62 --- a/dev/null +++ b/shared-code/RegEx.cpp @@ -0,0 +1,1697 @@ +#include "../stdafx.h"
+#include "RegEx.h"
+
+#define isWordableChar(c) (isalnum(c) || (c)=='_')
+
+BOOL CRegEx::Compile(LPCTSTR regex,int flags)
+{
+ ASSERT(!((flags®Extended) && (flags®Literal)));
+ m_Matches.RemoveAll();
+ m_Strip.RemoveAll();
+ m_Strip.SetSize(0,15);
+ m_Pattern=regex;
+ m_ParsePointer=0;
+ m_Error=0;
+ m_Sets.RemoveAll();
+ m_Flags=flags;
+ m_iFlags=0;
+ m_BOLs=m_EOLs=0;
+ m_Subexps = 0;
+ m_Categories=1; // 0 is 'everything else'
+ m_bBackRefs=FALSE;
+ memset(m_Category,0,sizeof(m_Category));
+
+ // Go ahead.
+ m_Error || m_Strip.Add(CSop(CSop::opEnd));
+ if(flags®Extended){
+ ParseERE();
+ }else if(flags®Literal){
+ ParseLiteral();
+ }else{
+ ParseBRE();
+ }
+ m_Error || m_Strip.Add(CSop(CSop::opEnd));
+ Categorize();
+ m_Strip.FreeExtra();
+ FigureMust();
+ m_Pluses=CountPluses();
+ if(m_iFlags&iflagsBad){
+ m_Error || (m_Error=regeAssert);
+ // ??? point to nuls? ;-)
+ }
+ // We may wish to free some memory here if we're erroneous (ie. m_Error..)
+ m_ParseParens.RemoveAll();
+#ifdef _DEBUG
+ if(m_Error){
+ CString tmp;
+ tmp.Format("RE: ParseError: %d\n",m_Error);
+ TRACE0(tmp);
+ }
+// DumpStrip(afxDump);
+#endif
+ return (m_bCompiled=(!m_Error));
+}
+
+BOOL CRegEx::Match(LPCTSTR src,int flags)
+{
+ if(!m_bCompiled)
+ return FALSE;
+ if(m_iFlags&iflagsBad)
+ return FALSE;
+ m_Input=src;
+ m_mFlags=flags;
+ m_mPointer=m_Input;
+ m_mBegin=m_Input;
+ m_mEnd=&m_mBegin[m_Input.GetLength()];
+ ASSERT(m_mPointer<=m_mEnd);
+ m_Matches.RemoveAll();
+ if(!m_Must.IsEmpty()){
+ if(m_Input.Find(m_Must)<0)
+ return FALSE;
+ }
+ // Go ahead..
+int stripLen = m_Strip.GetSize();
+ m_mLastPos.SetSize(0);
+ for(int tmp=0;tmp<stripLen;tmp++)
+ m_Strip[tmp].m_MatchData=0;
+LPCTSTR beginp = m_mBegin;
+LPCTSTR endp;
+ for(;;){
+ endp = MatchFast(beginp);
+ if(!endp)
+ return FALSE;
+ if((m_mFlags®NoSubExpressions) && !m_bBackRefs)
+ break;
+ ASSERT(m_cOldP);
+ for(;;){
+ endp = MatchSlow(m_cOldP,m_mEnd,1,stripLen-1);
+ if(endp)
+ break;
+ ASSERT(m_cOldP<m_mEnd);
+ m_cOldP++;
+ }
+ if((m_mFlags®OneMatch) && !m_bBackRefs)
+ break;
+ // Oh, his, we want the subexpression..
+ m_Matches.SetSize(m_Subexps+1);
+ LPCTSTR dp;
+ if(!m_bBackRefs && !(m_mFlags®BackRefs)){
+ dp = MatchDissect(m_cOldP,endp,1,stripLen-1);
+ }else{
+ if(m_Pluses>0 && !m_mLastPos.GetSize())
+ m_mLastPos.SetSize(m_Pluses);
+ dp = MatchBackRef(m_cOldP,endp,1,stripLen-1,0);
+ }
+ if(dp)
+ break;
+ // Uh.. oh.. we couldn't find a subexpression-level match
+ ASSERT(m_bBackRefs);
+ ASSERT(m_Pluses==0 || m_mLastPos.GetSize());
+ for(;;){
+ if(dp || endp <= m_cOldP)
+ break; // defeat.. ?
+ endp = MatchSlow(m_cOldP,endp-1,1,stripLen-1);
+ if(!endp)
+ break; // defeat.. ?
+ // Try it on a shorter possibility..
+#ifdef _DEBUG
+ for(tmp=1;tmp<=m_Subexps;tmp++)
+ ASSERT(m_Matches[tmp].m_Begin<0 && m_Matches[tmp].m_End<0);
+#endif
+ dp = MatchBackRef(m_cOldP,endp,1,stripLen-1,0);
+ }
+ ASSERT((!dp) || dp==endp);
+ if(dp) // Found a shorter one..
+ break;
+ // Despite initial appearances, there is no match here
+ beginp = m_cOldP+1;
+ ASSERT(beginp<=m_mEnd);
+ }
+ // Fill in the detail if so requested..
+ if(!(m_mFlags®NoSubExpressions)){
+ if(!m_Matches.GetSize())
+ m_Matches.SetSize(1);
+ m_Matches[0].m_Begin=m_cOldP-m_mBegin;
+ m_Matches[0].m_End=endp-m_mBegin;
+ }
+ m_mLastPos.RemoveAll();
+ return TRUE;
+}
+
+CString CRegEx::Replace(LPCTSTR src,LPCTSTR rep,int flags)
+{
+ // ***
+ return CString();
+}
+
+void CRegEx::ParseERE(int stop)
+{
+UCHAR c;
+BOOL first=TRUE;
+int prevF, prevB;
+ for(;;){
+ int co = m_Strip.GetSize();
+ while(m_ParsePointer < m_Pattern.GetLength() && ((c=m_Pattern[m_ParsePointer])!='|') && c!=stop)
+ ParseEREexp();
+ if(m_Strip.GetSize()==co){
+ m_Error || (m_Error=regeEmpty);
+ // ??? point to nuls?
+ }
+ if(m_ParsePointer>=m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!='|')
+ break;
+ else
+ m_ParsePointer++;
+ if(first){
+ StripInsert(co,CSop(CSop::opChoice0,m_Strip.GetSize()-co+1));
+ prevF = prevB = co;
+ first=FALSE;
+ }
+ m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-prevB));
+ prevB = m_Strip.GetSize()-1;
+ m_Error || (m_Strip[prevF].m_Operand=m_Strip.GetSize()-prevF);
+ prevF = m_Strip.GetSize();
+ m_Error || m_Strip.Add(CSop(CSop::opOr1,0)); // offset isn't really right.. very so..
+ }
+ if(!first){
+ m_Error || (m_Strip[prevF].m_Operand=m_Strip.GetSize()-prevF);
+ m_Error || m_Strip.Add(CSop(CSop::opChoice1,m_Strip.GetSize()-prevB));
+ }
+ ASSERT(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer]==stop);
+}
+
+void CRegEx::ParseEREexp()
+{
+ ASSERT(m_ParsePointer < m_Pattern.GetLength());
+UCHAR c = m_Pattern[m_ParsePointer++];
+int pos = m_Strip.GetSize();
+int subno;
+int count, count2;
+BOOL wascaret=FALSE;
+ switch(c){
+ case '(':
+ if(!(m_ParsePointer<m_Pattern.GetLength())){
+ TRACE0("RE: '(' at the end of the pattern\n");
+ if(!m_Error)
+ m_Error = regeParen;
+ // ??? point to nuls?
+ }
+ m_Subexps++;
+ subno=m_Subexps;
+ m_ParseParens.SetAtGrow(m_Subexps,CParenthesis(m_Strip.GetSize()));
+ m_Error || m_Strip.Add(CSop(CSop::opLeftParen,subno));
+ if(m_ParsePointer>=m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!=')')
+ ParseERE(')');
+ VERIFY(m_ParseParens[m_Subexps].m_End = m_Strip.GetSize());
+ m_Error || m_Strip.Add(CSop(CSop::opRightParen,subno));
+ if(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer++]!=')'){
+ TRACE0("RE: No matching ')'\n");
+ if(!m_Error)
+ m_Error = regeParen;
+ // ??? point to nuls?
+ }
+ break;
+ case '^':
+ m_Error || m_Strip.Add(CSop(CSop::opBOL));
+ m_iFlags|=iflagsUseBOL;
+ m_BOLs++;
+ wascaret=TRUE;
+ break;
+ case '$':
+ m_Error || m_Strip.Add(CSop(CSop::opEOL));
+ m_iFlags|=iflagsUseEOL;
+ m_EOLs++;
+ break;
+ case '|':
+ TRACE0("RE: '|' outside of expression\n");
+ if(!m_Error)
+ m_Error = regeEmpty;
+ // ??? point to nuls?
+ break;
+ case '*':
+ case '+':
+ case '?':
+ TRACE0("RE: '*'/'+'/'?' with no previous expression\n");
+ if(!m_Error)
+ m_Error = regeBadRepeat;
+ // ??? point to nuls?
+ break;
+ case '.':
+ if(m_Flags®NewLine)
+ EmitNonNewLineAny();
+ else
+ m_Error || m_Strip.Add(CSop(CSop::opAny));
+ break;
+ case '[':
+ ParseBracket();
+ break;
+ case '\\':
+ if(m_ParsePointer >= m_Pattern.GetLength()){
+ TRACE0("RE: '\\' at the end of the pattern\n");
+ if(!m_Error)
+ m_Error = regeEscape;
+ // ??? point to nuls?
+ }else{
+ c = m_Pattern[m_ParsePointer++];
+ EmitOrdinary(c);
+ }
+ break;
+ case '{':
+ if(m_ParsePointer >= m_Pattern.GetLength() || !isdigit(m_Pattern[m_ParsePointer])){
+ TRACE0("RE: '{' with no repeat count\n");
+ if(!m_Error)
+ m_Error = regeBadRepeat;
+ // ??? point to nuls?
+ }
+ // Fallthrough..
+ default:
+ EmitOrdinary(c);
+ break;
+ }
+ if(m_ParsePointer >= m_Pattern.GetLength())
+ return;
+ c = m_Pattern[m_ParsePointer];
+ // Call a '{' repetition if followed by a digit
+ if (!(c=='*' || c=='+' || c=='?' || ( c=='{' && (m_ParsePointer+1) < m_Pattern.GetLength() && isdigit(m_Pattern[m_ParsePointer+1])) ))
+ return; // No repetitor - done.
+ m_ParsePointer++;
+ if(wascaret){
+ TRACE0("RE: repetitive '^' detected\n");
+ if(!m_Error)
+ m_Error = regeBadRepeat;
+ // ??? point to nuls?
+ }
+ switch(c){
+ case '*': // Implemeted as +?
+ // + expression
+ StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1));
+ m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos));
+ // ? expression
+ StripInsert(pos,CSop(CSop::opQuest0,m_Strip.GetSize()-pos+1));
+ m_Error || m_Strip.Add(CSop(CSop::opQuest1,m_Strip.GetSize()-pos));
+ break;
+ case '+':
+ // + expression
+ StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1));
+ m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos));
+ break;
+ case '?':
+ // Kludge - emit y? as (y|) until subtle bug gets fixed :-)
+ StripInsert(pos,CSop(CSop::opChoice0,m_Strip.GetSize()-pos+1));
+ m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-pos));
+ m_Error || (m_Strip[pos].m_Operand=m_Strip.GetSize()-pos);
+ m_Error || m_Strip.Add(CSop(CSop::opOr1,1));
+ m_Error || m_Strip.Add(CSop(CSop::opChoice1,2));
+ break;
+ case '{':
+ count = ParseCount();
+ if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]==','){
+ m_ParsePointer++;
+ if(isdigit(m_Pattern[m_ParsePointer])){ // HHH Personally, I doubt it is always available
+ count2=ParseCount();
+ if(!(count<=count2)){
+ TRACE0("RE: Disbalanced counts in '{}' repeater\n");
+ m_Error || (m_Error=regeBadBrace);
+ // ??? point to nuls?
+ }
+ }else // Single number with comma
+ count2=256; // Infinity
+ }else // Single number
+ count2=count;
+ EmitRepeat(pos,count,count2);
+ if(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!='}'){
+ // No '}'..
+ TRACE0("RE: No immediately following '}' of '{' expression\n");
+ while(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]!='}')
+ m_ParsePointer++;
+ if(m_ParsePointer >= m_Pattern.GetLength()){
+ TRACE0("RE: No closing '}' found\n");
+ m_Error || (m_Error=regeBrace);
+ }else
+ m_Error || (m_Error=regeBadBrace);
+ // ??? point to nuls?
+ }else
+ m_ParsePointer++;
+ break;
+ }
+ if(m_ParsePointer >= m_Pattern.GetLength())
+ return;
+ c = m_Pattern[m_ParsePointer];
+ if(!(c=='*' || c=='+' || c=='?' || (c=='{' && (m_ParsePointer+1)<m_Pattern.GetLength() && isdigit(m_Pattern[m_ParsePointer+1]))))
+ return;
+ TRACE0("RE: Double repeater\n");
+ m_Error || (m_Error=regeBadRepeat);
+ // ??? point to nuls?
+}
+
+void CRegEx::StripInsert(int pos,CSop& sop)
+{
+ if(m_Error)
+ return;
+int sn = m_Strip.GetSize();
+ m_Strip.InsertAt(pos,sop);
+ for(int tmp=1;tmp<m_ParseParens.GetSize();tmp++){
+ if(m_ParseParens[tmp].m_Begin>=pos)
+ m_ParseParens[tmp].m_Begin++;
+ if(m_ParseParens[tmp].m_End>=pos)
+ m_ParseParens[tmp].m_End++;
+ }
+}
+
+void CRegEx::EmitOrdinary(UCHAR c)
+{
+ if(m_Flags®IgnoreCase && isalpha(c) && (tolower(c) !=toupper(c))){
+ // Emit both cases
+ CString savePattern = m_Pattern;
+ int savePointer = m_ParsePointer;
+ m_Pattern=c;
+ m_Pattern+=']';
+ m_ParsePointer=0;
+ ParseBracket();
+ m_Pattern=savePattern;
+ m_ParsePointer=savePointer;
+ }else{
+ m_Error || m_Strip.Add(CSop(CSop::opChar,c));
+ if(!m_Category[(BYTE)c])
+ m_Category[(BYTE)c]=m_Categories++;
+ }
+}
+
+void CRegEx::EmitNonNewLineAny()
+{
+ // Kludges're going on and on..
+CString savePattern = m_Pattern;
+int savePointer = m_ParsePointer;
+ m_Pattern="^\n]";
+ m_ParsePointer=0;
+ ParseBracket();
+ m_Pattern=savePattern;
+ m_ParsePointer=savePointer;
+}
+
+int CRegEx::ParseCount()
+{
+BOOL nonEmpty=FALSE;
+int rv = 0;
+UCHAR c;
+ while(m_ParsePointer < m_Pattern.GetLength() && isdigit(c=m_Pattern[m_ParsePointer]) && rv <=255){
+ rv = rv*10 + c-'0';
+ nonEmpty=TRUE;
+ m_ParsePointer++;
+ }
+ if(rv>255 || !nonEmpty){
+ m_Error || (m_Error=regeBadBrace);
+ // ??? point to nuls?
+ }
+ return rv;
+}
+
+void CRegEx::ParseBracket()
+{
+ // Dept. of truly sickening special case kludges
+ if((m_ParsePointer+5) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,6).Compare("[:<]]")){
+ m_Error || m_Strip.Add(CSop(CSop::opBOW));
+ m_ParsePointer+=6;
+ return;
+ }
+ if((m_ParsePointer+5) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,6).Compare("[:>]]")){
+ m_Error || m_Strip.Add(CSop(CSop::opEOW));
+ m_ParsePointer+=6;
+ return;
+ }
+BOOL invert=TRUE;
+ if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='^')
+ m_ParsePointer++;
+ else
+ invert=FALSE;
+CSet cset;
+ if(m_ParsePointer < m_Pattern.GetLength()){
+ switch(m_Pattern[m_ParsePointer]){
+ case ']':
+ case '-':
+ cset.Add(m_Pattern[m_ParsePointer]);
+ m_ParsePointer++;
+ break;
+ }
+ }
+ while(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]!=']' && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("-]")))
+ ParseBracketTerm(cset);
+ if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='-'){
+ m_ParsePointer++;
+ cset.Add('-');
+ }
+ if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]==']')
+ m_ParsePointer++;
+ else{
+ m_Error || (m_Error=regeBracket);
+ // ??? point to nuls?
+ return;
+ }
+ if(m_Flags®IgnoreCase){
+ for(int tmp=CSet::size-1;tmp>=0;tmp--){
+ if(cset.IsIn(tmp) && isalpha(tmp) && (toupper(tmp)!=tolower(tmp)))
+ cset.Add(isupper(tmp)?tolower(tmp):toupper(tmp));
+ }
+ /*
+ if(!cset->m_Multis.IsEmpty())
+ cset.CollatingCase();
+ */
+ }
+ if(invert){
+ for(int tmp=CSet::size-1;tmp>=0;tmp--)
+ if(cset.IsIn(tmp))
+ cset.Sub(tmp);
+ else
+ cset.Add(tmp);
+ if(m_Flags®NewLine)
+ cset.Sub('\n');
+ /*
+ if(!cset.m_Multis.IsEmpty())
+ cset.CollatingInvert();
+ */
+ }
+UCHAR c = cset.GetOnly();
+ if(c){
+ EmitOrdinary(c);
+ }else
+ m_Error || m_Strip.Add(CSop(CSop::opAnyOf,StoreSet(cset)));
+}
+
+void CRegEx::CSet::Add(UCHAR c)
+{
+ m_Set[(BYTE)c]=TRUE;
+ m_Hash+=c;
+}
+
+BOOL CRegEx::CSet::IsIn(UCHAR c)
+{
+ return m_Set[(BYTE)c];
+}
+
+void CRegEx::CSet::Sub(UCHAR c)
+{
+ m_Set[(BYTE)c]=FALSE;
+ m_Hash-=c;
+}
+
+UCHAR CRegEx::CSet::GetOnly()
+{
+int rv = 0;
+UCHAR only = 0;
+ for(int tmp=0;tmp<size;tmp++){
+ rv+=m_Set[tmp]?(only=tmp,1):0;
+ }
+ return (rv==1)?only:0;
+}
+
+int CRegEx::StoreSet(CSet& cset)
+{
+ for(int tmp=0;tmp<m_Sets.GetSize();tmp++)
+ if(m_Sets[tmp]==cset)
+ return tmp;
+ return m_Sets.Add(cset);
+}
+
+void CRegEx::ParseBracketTerm(CSet& cset)
+{
+UCHAR c;
+ switch((m_ParsePointer<m_Pattern.GetLength())?m_Pattern[m_ParsePointer]:0){
+ case '[':
+ c = ((m_ParsePointer+1)<m_Pattern.GetLength())?m_Pattern[m_ParsePointer+1]:0;
+ break;
+ case '-':
+ m_Error || (m_Error=regeRange);
+ // ??? point to nuls?
+ return;
+ default:
+ c = 0;
+ break;
+ }
+ switch(c){
+ case ':': // Character class
+ m_ParsePointer+=2;
+ if(m_ParsePointer>=m_Pattern.GetLength()){
+ m_Error || (m_Error=regeBracket);
+ // ??? point to nuls?
+ }
+ c = m_Pattern[m_ParsePointer];
+ if(c== '-' || c==']'){
+ m_Error || (m_Error=regeCType);
+ // ??? point to nuls?
+ }
+ ParseBracketCClass(cset);
+ if(m_ParsePointer>=m_Pattern.GetLength()){
+ m_Error || (m_Error=regeBracket);
+ // ??? point to nuls?
+ }
+ if((m_ParsePointer+1)>=m_Pattern.GetLength() || (m_Pattern.Mid(m_ParsePointer,2).Compare(":]"))){
+ m_Error || (m_Error=regeCType);
+ // ??? point to nuls?
+ }else
+ m_ParsePointer+=2;
+ break;
+ case '=': // Equivalence class
+ m_ParsePointer+=2;
+ if(m_ParsePointer >= m_Pattern.GetLength()){
+ m_Error || (m_Error=regeBracket);
+ // ??? point to nuls?
+ }
+ c = m_Pattern[m_ParsePointer];
+ if(c== '-' || c==']'){
+ m_Error || (m_Error=regeCollate);
+ // ??? point to nuls?
+ }
+ ParseBracketEClass(cset);
+ if((m_ParsePointer+1)>=m_Pattern.GetLength() || (m_Pattern.Mid(m_ParsePointer,2).Compare("=]"))){
+ m_Error || (m_Error=regeCollate);
+ // ??? point to nuls?
+ }else
+ m_ParsePointer+=2;
+ break;
+ default: // Symbol, character or range
+ {
+ UCHAR start, finish;
+ start = ParseBracketSymbol();
+ if((m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='-') /*&& (m_ParsePointer+1)<m_Pattern.GetLength() && m_Pattern[m_ParsePointer+1]==']'*/){
+ // I believe the expression above is seetwo..
+ // range.
+ m_ParsePointer++;
+ if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='-'){
+ m_ParsePointer++;
+ finish='-';
+ }else
+ finish=ParseBracketSymbol();
+ }else
+ finish=start;
+ if(((BYTE)start)>((BYTE)finish)){
+ m_Error || (m_Error=regeRange);
+ // ??? point to nuls?
+ }
+ for(int tmp=start;tmp<=(BYTE)finish;tmp++)
+ cset.Add(tmp);
+ }
+ break;
+ }
+}
+
+void CRegEx::ParseBracketCClass(CSet& cset)
+{
+static struct {
+ char *className;
+ char *classChars;
+} cc[] = {
+ {"alnum","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"},
+ {"alpha","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"},
+ {"blank"," \t"},
+ {"cntrl","\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\25\26\27\30\31\32\33\34\35\36\37\177"},
+ {"digit","0123456789"},
+ {"graph","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"},
+ {"lower","abcdefghijklmnopqrstuvwxyz"},
+ {"print","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ "},
+ {"punct","!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"},
+ {"space","\t\n\v\f\r "},
+ {"upper","ABCDEFGHIJKLMNOPQRSTUVWXYZ"},
+ {"xdigit","0123456789ABCDEFabcdef"}
+};
+CString cclass;
+UCHAR c;
+ while(m_ParsePointer < m_Pattern.GetLength() && isalpha(c=m_Pattern[m_ParsePointer])){
+ cclass+=c;
+ m_ParsePointer++;
+ }
+char *classChars = NULL;
+ for(int tmp=0;tmp<(sizeof(cc)/sizeof(cc[0]));tmp++){
+ if(!cclass.CompareNoCase(cc[tmp].className)){
+ classChars=cc[tmp].classChars;
+ break;
+ }
+ }
+ if(!classChars){
+ m_Error || (m_Error=regeCType);
+ // ??? point to nuls?
+ return;
+ }
+ while(*classChars)
+ cset.Add(*(classChars++));
+ // --- multis
+}
+
+void CRegEx::ParseBracketEClass(CSet& cset)
+{
+ cset.Add(ParseBracketCollatingElement('='));;
+}
+
+UCHAR CRegEx::ParseBracketCollatingElement(UCHAR term)
+{
+static struct {
+ char *entityName;
+ char entity;
+} cc[] = { {"NUL",'\0'},{"SOH",'\001'},{"STX",'\002'},{"ETX",'\003'},{"EOT",'\004'},{"ENQ",'\005'},{"ACK",'\006'},{"BEL",'\007'},{"alert",'\007'},{"BS",'\010'},{"backspace",'\b'},{"HT",'\011'},{"tab",'\t'},{"LF",'\012'},{"newline",'\n'},{"VT",'\013'},{"vertical-tab",'\v'},{"FF",'\014'},{"form-feed",'\f'},{"CR",'\015'},{"carriage-return",'\r'},{"SO",'\016'},{"SI",'\017'},{"DLE",'\020'},{"DC1",'\021'},{"DC2",'\022'},{"DC3",'\023'},{"DC4",'\024'},{"NAK",'\025'},{"SYN",'\026'},{"ETB",'\027'},{"CAN",'\030'},{"EM",'\031'},{"SUB",'\032'},{"ESC",'\033'},{"IS4",'\034'},{"FS",'\034'},{"IS3",'\035'},{"GS",'\035'},{"IS2",'\036'},{"RS",'\036'},{"IS1",'\037'},{"US",'\037'},{"space",' '},{"exclamation-mark",'!'},{"quotation-mark",'"'},{"number-sign",'#'},{"dollar-sign",'$'},{"percent-sign",'%'},{"ampersand",'&'},{"apostrophe",'\''},{"left-parenthesis",'('},{"right-parenthesis",')'},{"asterisk",'*'},{"plus-sign",'+'},{"comma",','},{"hyphen",'-'},{"hyphen-minus",'-'},{"period",'.'},{"full-stop",'.'},{"slash",'/'},{"solidus",'/'},{"zero",'0'},{"one",'1'},{"two",'2'},{"three",'3'},{"four",'4'},{"five",'5'},{"six",'6'},{"seven",'7'},{"eight",'8'},{"nine",'9'},{"colon",':'},{"semicolon",';'},{"less-than-sign",'<'},{"equals-sign",'='},{"greater-than-sign",'>'},{"question-mark",'?'},{"commercial-at",'@'},{"left-square-bracket",'['},{"backslash",'\\'},{"reverse-solidus",'\\'},{"right-square-bracket",']'},{"circumflex",'^'},{"circumflex-accent",'^'},{"underscore",'_'},{"low-line",'_'},{"grave-accent",'`'},{"left-brace",'{'},{"left-curly-bracket",'{'},{"vertical-line",'|'},{"right-brace",'}'},{"right-curly-bracket",'}'},{"tilde",'~'},{"DEL",'\177'} };
+CString seeTwo;
+ seeTwo=term;
+ seeTwo+=']';
+CString entityName;
+ while(m_ParsePointer<m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare(seeTwo)))
+ entityName+=m_Pattern[m_ParsePointer++];
+ if(m_ParsePointer>=m_Pattern.GetLength()){
+ m_Error || (m_Error=regeBracket);
+ // ??? point to nuls?
+ return 0;
+ }
+ for(int tmp=0;tmp<(sizeof(cc)/sizeof(cc[0]));tmp++)
+ if(!entityName.CompareNoCase(cc[tmp].entityName))
+ return cc[tmp].entity;
+ if(entityName.GetLength()==1)
+ return entityName[0];
+ m_Error || (m_Error=regeCollate);
+ // ??? point to nuls?
+ return 0;
+}
+
+UCHAR CRegEx::ParseBracketSymbol()
+{
+ if(m_ParsePointer>=m_Pattern.GetLength()){
+ m_Error || (m_Error=regeBracket);
+ // ??? point to nuls?
+ }
+ if((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("[."))
+ m_ParsePointer+=2;
+ else
+ return m_Pattern[m_ParsePointer++];
+ // Collating symbol
+UCHAR rv = ParseBracketCollatingElement('.');
+ if((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("[."))
+ m_ParsePointer+=2;
+ else{
+ m_Error || (m_Error=regeCollate);
+ // ??? point to nuls?
+ }
+ return rv;
+}
+
+void CRegEx::EmitRepeat(int pos,int from,int to)
+{
+ if(m_Error)
+ return;
+ ASSERT(from<=to);
+int finish = m_Strip.GetSize();
+int copy;
+#define N 2
+#define INF 3
+#define REP(f,t) ((f)*8+(t))
+#define MAP(n) (((n)<=1)?(n):((n)==256)?INF:N)
+ switch(REP(MAP(from),MAP(to))){
+ case REP(0,0): // must be user doing ths??
+ m_Strip.SetSize(pos);
+ break;
+ case REP(0,1): // as in '?'
+ case REP(0,N): // as in '{1,n}?'
+ case REP(0,INF): // as in '{1,}?'
+ // Kludge - emit y? as (y|) until something gets fixed..
+ StripInsert(pos,CSop(CSop::opChoice0,pos));
+ EmitRepeat(pos+1,1,to);
+ m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-pos));
+ m_Error || (m_Strip[pos].m_Operand=m_Strip.GetSize()-pos);
+ m_Error || m_Strip.Add(CSop(CSop::opOr1,1));
+ m_Error || m_Strip.Add(CSop(CSop::opChoice1,2));
+ break;
+ case REP(1,1):
+ break;
+ case REP(1,N): // as in 'x?x{1,n-1}'
+ // Kludge again..
+ StripInsert(pos,CSop(CSop::opChoice0,pos));
+ m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-pos));
+ m_Error || (m_Strip[pos].m_Operand=m_Strip.GetSize()-pos);
+ m_Error || m_Strip.Add(CSop(CSop::opOr1,1));
+ m_Error || m_Strip.Add(CSop(CSop::opChoice1,2));
+ copy = StripDuplicate(pos+1,finish+1);
+ ASSERT(copy==(finish+4));
+ EmitRepeat(copy,1,to-1);
+ break;
+ case REP(1,INF): // as in '+'
+ StripInsert(pos,CSop(CSop::opPlus0,pos));
+ m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos));
+ break;
+ case REP(N,N): // as in 'xx{from-1,to-1}'
+ copy = StripDuplicate(pos,finish);
+ EmitRepeat(copy,from-1,to-1);
+ break;
+ case REP(N,INF): // as in 'xx{n-1,}'
+ copy = StripDuplicate(pos,finish);
+ EmitRepeat(copy,from-1,to);
+ break;
+#ifndef NDEBUG
+ default:
+ ASSERT(FALSE);
+ break;
+#endif
+ }
+#undef MAP
+#undef REP
+#undef INF
+#undef N
+}
+
+int CRegEx::StripDuplicate(int from,int to)
+{
+int rv = m_Strip.GetSize();
+ ASSERT(from<=to);
+ if(from==to)
+ return rv;
+ // Maybe should be optimized for copying the whole thing.
+ for(int tmp=from;tmp<to;tmp++)
+ m_Strip.Add(m_Strip[tmp]);
+ return rv;
+}
+
+void CRegEx::Categorize()
+{
+ if(m_Error)
+ return;
+ for(int tmp=0;tmp<(sizeof(m_Category)/sizeof(m_Category[0]));tmp++)
+ if((!m_Category[tmp]) && IsInSets(tmp)){
+ int cat = m_Categories++;
+ m_Category[tmp]=cat;
+ for(int c=tmp+1;c<(sizeof(m_Category)/sizeof(m_Category[0]));c++)
+ if((!m_Category[c]) && IsInSameSets(tmp,c))
+ m_Category[c]=cat;
+ }
+}
+
+BOOL CRegEx::IsInSets(UCHAR c)
+{
+ for(int tmp=0;tmp<m_Sets.GetSize();tmp++)
+ if(m_Sets[tmp].IsIn(c))
+ return TRUE;
+ return FALSE;
+}
+
+BOOL CRegEx::IsInSameSets(UCHAR c1,UCHAR c2)
+{
+ for(int tmp=0;tmp<m_Sets.GetSize();tmp++)
+ if(m_Sets[tmp].IsIn(c1)!=m_Sets[tmp].IsIn(c2))
+ return FALSE;
+ return TRUE;
+}
+
+void CRegEx::FigureMust()
+{
+ if(m_Error)
+ return;
+ m_Must.Empty();
+int stripLen = m_Strip.GetSize();
+int seqStart, seqLength = 0;
+int mustStart, mustLength = 0;
+ for(int tmp=1;tmp<stripLen;tmp++){
+ switch(m_Strip[tmp].m_Operator){
+ case CSop::opChar:
+ if(!seqLength)
+ seqStart=tmp;
+ seqLength++;
+ break;
+ case CSop::opPlus0:
+ case CSop::opLeftParen:
+ case CSop::opRightParen:
+ break; // Break, since they don't break the sequence
+ case CSop::opQuest0:
+ case CSop::opChoice0:
+ // These ones we skip.
+ do{
+ tmp+=m_Strip[tmp].m_Operand;
+ // I still think it could be ASSERTed..
+ if(m_Strip[tmp].m_Operator!=CSop::opQuest1 && m_Strip[tmp].m_Operator!=CSop::opChoice1 && m_Strip[tmp].m_Operator!=CSop::opOr1){
+ m_iFlags|=iflagsBad;
+ return;
+ }
+ }while(m_Strip[tmp].m_Operator!=CSop::opQuest1 && m_Strip[tmp].m_Operator!=CSop::opChoice1);
+ // Fallthrough..
+ default:
+ // End of sequence
+ if(seqLength>mustLength){
+ mustStart=seqStart;
+ mustLength=seqLength;
+ }
+ seqLength=0;
+ break;
+ }
+ } // Hmm.. originally it's meant to be do while not opEnd..
+ if(!mustLength)
+ return;
+ // Turn into string, but, wait, personally I'm sure it could be put in the main loop.. or maybe not..
+ for(tmp=0;tmp<seqLength;tmp++){
+ while(m_Strip[seqStart+tmp].m_Operator!=CSop::opChar)
+ ASSERT(tmp<seqLength);
+ m_Must+=m_Strip[tmp].m_Operand;
+ }
+}
+
+int CRegEx::CountPluses()
+{
+ if(m_Error)
+ return 0;
+int stripLen = m_Strip.GetSize();
+int rv = 0;
+int nest = 0;
+ for(int tmp=0;tmp<stripLen;tmp++){
+ switch(m_Strip[tmp].m_Operator){
+ case CSop::opPlus0:
+ nest++;
+ break;
+ case CSop::opPlus1:
+ if(nest>rv)
+ rv = nest;
+ nest--;
+ break;
+ }
+ } // Again, originally we were supposed to scan till opEnd..
+ if(nest)
+ m_iFlags|=iflagsBad; // Could this be an ASSERTion?
+ return rv;
+}
+
+void CRegEx::ParseLiteral()
+{
+ if(!m_Pattern.GetLength()){
+ m_Error || (m_Error=regeEmpty);
+ // ??? point to nuls?
+ }
+ while(m_ParsePointer < m_Pattern.GetLength())
+ EmitOrdinary(m_Pattern[m_ParsePointer++]);
+}
+
+void CRegEx::ParseBRE(int stopa,int stopb)
+{
+int start = m_Strip.GetSize();
+BOOL first=TRUE;
+BOOL wasDollar=FALSE;
+ if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='^'){
+ m_ParsePointer++;
+ m_Error || m_Strip.Add(CSop(CSop::opBOL,0));
+ m_iFlags|=iflagsUseBOL;
+ m_BOLs++;
+ }
+CString stopstr;
+ if(stopa){
+ stopstr+=stopa;
+ if(stopb)
+ stopstr+=stopb;
+ }
+ while(m_ParsePointer < m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare(stopstr))){
+ wasDollar = ParseBREexp(first);
+ first=FALSE;
+ }
+ if(wasDollar){ // Trailing anchor that was..
+ m_Strip.SetSize(m_Strip.GetSize()-1);
+ m_Error || m_Strip.Add(CSop(CSop::opEOL,0));
+ m_iFlags|=iflagsUseEOL;
+ m_EOLs++;
+ }
+ if(m_Strip.GetSize()==start){
+ m_Error || (m_Error=regeEmpty);
+ // ??? point to nuls?
+ }
+}
+
+BOOL CRegEx::ParseBREexp(BOOL ordinaryStar)
+{
+int subno;
+int pos = m_Strip.GetSize();
+ ASSERT(m_ParsePointer<m_Pattern.GetLength());
+int c = m_Pattern[m_ParsePointer++];
+ if(c=='\\'){
+ if(m_ParsePointer>=m_Pattern.GetLength()){
+ m_Error || (m_Error=regeEscape);
+ // ??? point to nuls
+ }else
+ c = 0x100|m_Pattern[m_ParsePointer++];
+ }
+ switch(c){
+ case '.':
+ if(m_Flags®NewLine)
+ EmitNonNewLineAny();
+ else
+ m_Error || m_Strip.Add(CSop(CSop::opAny,0));
+ break;
+ case '[':
+ ParseBracket();
+ break;
+ case 0x100|'{':
+ m_Error || (m_Error=regeBadRepeat);
+ // ??? point to nuls?
+ break;
+ case 0x100|'(':
+ m_Subexps++;
+ subno=m_Subexps;
+ m_ParseParens.SetAtGrow(m_Subexps,CParenthesis(m_Strip.GetSize()));
+ m_Error || m_Strip.Add(CSop(CSop::opLeftParen,subno));
+ if(m_ParsePointer<m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\)")))
+ ParseBRE('\\',')');
+ VERIFY(m_ParseParens[m_Subexps].m_End = m_Strip.GetSize());
+ m_Error || m_Strip.Add(CSop(CSop::opRightParen,subno));
+ if((m_ParsePointer+1) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\)"))
+ m_ParsePointer+=2;
+ else{
+ m_Error || (m_Error=regeParen);
+ // ??? point to nuls?
+ }
+ break;
+ case 0x100|')':
+ case 0x100|'}':
+ // Can this possibly happen?!
+ m_Error || (m_Error=regeParen);
+ // ??? point to nuls?
+ break;
+ case 0x100|'1':
+ case 0x100|'2':
+ case 0x100|'3':
+ case 0x100|'4':
+ case 0x100|'5':
+ case 0x100|'6':
+ case 0x100|'7':
+ case 0x100|'8':
+ case 0x100|'9':
+ {
+ int i = (c&0xFF)-'0';
+ if(i < m_ParseParens.GetSize() && m_ParseParens[i].m_End){
+ m_Error || m_Strip.Add(CSop(CSop::opBackRef0,i));
+ ASSERT(m_ParseParens[i].m_Begin);
+ ASSERT(m_Strip[m_ParseParens[i].m_Begin].m_Operator==CSop::opLeftParen);
+ ASSERT(m_Strip[m_ParseParens[i].m_End].m_Operator==CSop::opRightParen);
+ StripDuplicate(m_ParseParens[i].m_Begin+1,m_ParseParens[i].m_End);
+ m_Error || m_Strip.Add(CSop(CSop::opBackRef1,i));
+ }else{
+ m_Error || (m_Error=regeSubReg);
+ // ??? point to nuls?
+ }
+ m_bBackRefs=TRUE;
+ }
+ break;
+ case '*':
+ if(!ordinaryStar){
+ m_Error || (m_Error=regeBadRepeat);
+ // ??? point to nuls?
+ }
+ // Fallthrough..
+ default:
+ EmitOrdinary(c&0xFF);
+ break;
+ }
+ if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='*'){
+ m_ParsePointer++;
+ // as in '+?'
+ StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1));
+ m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos));
+ StripInsert(pos,CSop(CSop::opQuest0,m_Strip.GetSize()-pos+1));
+ m_Error || m_Strip.Add(CSop(CSop::opQuest1,m_Strip.GetSize()-pos));
+ }else if ((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\{")){
+ m_ParsePointer+=2;
+ int count = ParseCount();
+ int count2;
+ if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]==','){
+ m_ParsePointer++;
+ if(m_ParsePointer<m_Pattern.GetLength() && isdigit(m_Pattern[m_ParsePointer])){
+ count2=ParseCount();
+ if(count>count2){
+ m_Error || (m_Error=regeBadBrace);
+ // ??? poin to nuls?
+ }
+ }else // Single number with comma
+ count2=256;
+ }else // Single number
+ count2=count;
+ EmitRepeat(pos,count,count2);
+ if((m_ParsePointer+1)>=m_Pattern.GetLength() || m_Pattern.Mid(m_ParsePointer,2).Compare("\\}")){
+ while(m_ParsePointer<m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\}")))
+ m_ParsePointer++;
+ if(m_ParsePointer>=m_Pattern.GetLength()){
+ m_Error || (m_Error=regeBrace);
+ // ??? point to nuls?
+ }
+ m_Error || (m_Error=regeBadBrace);
+ }else
+ m_ParsePointer+=2;
+ }else if(c=='$')
+ return TRUE;
+ return FALSE;
+}
+
+CRegEx::CRegEx()
+{
+ m_bCompiled=FALSE;
+}
+
+LPCTSTR CRegEx::MatchFast(LPCTSTR begin)
+{
+ MatchStatesClear(CSop::stCurrent);
+ m_Strip[1].m_MatchData|=CSop::stCurrent;
+int stripLen = m_Strip.GetSize();
+ MatchStep(1,stripLen-1,CSop::stCurrent,charNothing,CSop::stCurrent);
+ MatchStatesCopy(CSop::stFresh,CSop::stCurrent);
+LPCTSTR coldp = NULL;
+LPCTSTR p = begin;
+int c = (begin==m_mBegin)?charOut:((int)(BYTE)m_mPointer[-1]);
+ for(;;){
+ // next character..
+ int lastc = c;
+ c = (p==m_mEnd)?charOut:(int)*(BYTE*)p;
+ if(MatchStatesEqual(CSop::stCurrent,CSop::stFresh))
+ coldp=p;
+ // Is there an EOL and/or BOL between lastc and c? - they ask..
+ int flagc=0;
+ int i = 0;
+ if((lastc=='\n' && m_Flags®NewLine) || (lastc==charOut && !(m_mFlags®NotBOL))){
+ flagc=charBOL;
+ i=m_BOLs;
+ }
+ if((c=='\n' && m_Flags®NewLine) || (c==charOut && !(m_mFlags®NotEOL))){
+ flagc=(flagc==charBOL)?charBOLEOL:charEOL;
+ i+=m_EOLs;
+ }
+ if(i){
+ for(;i>0;i--)
+ MatchStep(1,stripLen-1,CSop::stCurrent,flagc,CSop::stCurrent);
+ }
+ // What about a word boundary? - they wonder..
+ if((flagc==charBOL || (lastc!=charOut && !isWordableChar(c))) && (c!=charOut && isWordableChar(c)))
+ flagc = charBOW;
+ if((lastc!=charOut && isWordableChar(lastc)) && (flagc==charEOL || (c!=charOut && !isWordableChar(c))))
+ flagc = charEOW;
+ if(flagc==charBOW || flagc==charEOW){
+ MatchStep(1,stripLen-1,CSop::stCurrent,flagc,CSop::stCurrent);
+ }
+ // Are we done? Now WE wonder..
+ if((m_Strip[stripLen-1].m_MatchData&CSop::stCurrent) || p==m_mEnd)
+ break; // They insist I need to note break out.. Okay, I do..
+ // Nope, we're not done. We have to face this character..
+ MatchStatesCopy(CSop::stTemp,CSop::stCurrent);
+ MatchStatesCopy(CSop::stCurrent,CSop::stFresh);
+ ASSERT(c!=charOut);
+ MatchStep(1,stripLen-1,CSop::stTemp,c,CSop::stCurrent);
+ p++;
+ }
+ ASSERT(coldp);
+ m_cOldP=coldp; // *** I believe this variable can be changed 'in-place'
+ if(m_Strip[stripLen-1].m_MatchData&CSop::stCurrent)
+ return &p[1];
+ else
+ return NULL;
+}
+
+void CRegEx::MatchStatesClear(BYTE mask)
+{
+int stripLen = m_Strip.GetSize();
+ for(int tmp=0;tmp<stripLen;tmp++)
+ m_Strip[tmp].m_MatchData&=~mask;
+}
+
+void CRegEx::MatchStep(int from,int to,BYTE maskBefore,int charCode,BYTE maskAfter)
+{
+BOOL i;
+int look;
+int here = from;
+ for(int pc=from;pc!=to;pc++,here++){
+ CSop s=m_Strip[pc];
+ switch(s.m_Operator){
+ case CSop::opEnd:
+ ASSERT(pc==(to-1));
+ break;
+ case CSop::opChar:
+ // Only characters can match..
+ ASSERT((charCode<charOut) || charCode!=s.m_Operand);
+ if(charCode==s.m_Operand)
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
+ break;
+ case CSop::opBOL:
+ if(charCode==charBOL || charCode==charBOLEOL)
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
+ break;
+ case CSop::opEOL:
+ if(charCode==charEOL || charCode==charBOLEOL)
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
+ break;
+ case CSop::opBOW:
+ if(charCode==charBOW)
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
+ break;
+ case CSop::opEOW:
+ if(charCode==charEOW)
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
+ break;
+ case CSop::opAny:
+ if(charCode<charOut)
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
+ break;
+ case CSop::opAnyOf:
+ if(charCode<charOut && m_Sets[s.m_Operand].m_Set[charCode])
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
+ break;
+ case CSop::opBackRef0: // Ignored here..
+ case CSop::opBackRef1:
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
+ break;
+ case CSop::opPlus0:
+ // Forward, this is just an empty, comment says..
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
+ break;
+ case CSop::opPlus1:
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
+ i =(m_Strip[here-s.m_Operand].m_MatchData&maskAfter)!=0;
+ m_Strip[here-s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
+ if(!i && (m_Strip[here-s.m_Operand].m_MatchData&maskAfter)){
+ // oho, must reconsider loop body, comment says.. what's so 'oho' about it?
+ pc-=s.m_Operand+1;
+ here=pc;
+ }
+ break;
+ case CSop::opQuest0:
+ // two branches, both forward..
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
+ m_Strip[here+s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
+ break;
+ case CSop::opQuest1:
+ // just an empty.. aren't we tired of justanempties?
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
+ break;
+ case CSop::opLeftParen: // they say it's not significan there..
+ case CSop::opRightParen:
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
+ break;
+ case CSop::opChoice0: // mark the first two branches..
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
+ ASSERT(m_Strip[pc+s.m_Operand].m_Operator==CSop::opOr1);
+ m_Strip[here+s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
+ break;
+ case CSop::opOr0: // done a branch, find the end of choice..
+ if(m_Strip[here].m_MatchData&maskAfter){
+ for(look=1;(s=m_Strip[pc+look]).m_Operator!=CSop::opChoice1;look+=s.m_Operand)
+ ASSERT(s.m_Operator==CSop::opOr1);
+ m_Strip[here+look].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
+ }
+ break;
+ case CSop::opOr1: // Propagate Choice's marking..
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
+ if(m_Strip[pc+s.m_Operand].m_Operator!=CSop::opChoice1){
+ ASSERT(m_Strip[pc+s.m_Operand].m_Operator==CSop::opOr1);
+ m_Strip[here+s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
+ }
+ break;
+ case CSop::opChoice1: // Just empty.. :-)..
+ m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
+ break;
+#ifdef _DEBUG
+ default:
+ ASSERT(FALSE);
+ break;
+#endif
+ }
+ }
+}
+
+void CRegEx::MatchStatesCopy(BYTE dst,BYTE src)
+{
+int stripLen = m_Strip.GetSize();
+ for(int tmp=0;tmp<stripLen;tmp++){
+ // I believe this can be optimized, easily..
+ m_Strip[tmp].m_MatchData&=~dst;
+ m_Strip[tmp].m_MatchData|=(m_Strip[tmp].m_MatchData&src)?dst:0;
+ }
+}
+
+BOOL CRegEx::MatchStatesEqual(BYTE m1,BYTE m2)
+{
+int stripLen = m_Strip.GetSize();
+ for(int tmp=0;tmp<stripLen;tmp++){
+ BYTE mm = m_Strip[tmp].m_MatchData;
+ if(((mm&m1) && (mm&m2)) || !(mm&(m1|m2)))
+ continue;
+ return FALSE;
+ }
+ return TRUE;
+}
+
+LPCTSTR CRegEx::MatchSlow(LPCTSTR begin,LPCTSTR end,int from,int to)
+{
+ MatchStatesClear(CSop::stCurrent);
+ m_Strip[from].m_MatchData|=CSop::stCurrent;
+ MatchStep(from,to,CSop::stCurrent,charNothing,CSop::stCurrent);
+LPCTSTR mp = NULL;
+int c = (m_mBegin==m_mPointer)?charOut:(int)(BYTE)begin[-1];
+LPCTSTR p = begin;
+ for(;;){
+ // next character..
+ int lastc = c;
+ c = (p==m_mEnd)?charOut:(int)*(BYTE*)p;
+ // Now we start to wonder if there is an EOL and/or BOL between lastc and c
+ int flagc = 0;
+ int i = 0;
+ if((lastc=='\n' && m_Flags®NewLine) || (lastc==charOut && !(m_mFlags®NotBOL))){
+ flagc = charBOL;
+ i = m_BOLs;
+ }
+ if((c=='\n' && m_Flags®NewLine) || (c==charOut && !(m_mFlags®NotEOL))){
+ flagc = (flagc==charBOL)?charBOLEOL:charEOL;
+ i+=m_EOLs;
+ }
+ if(i){
+ for(;i>0;i--)
+ MatchStep(from,to,CSop::stCurrent,flagc,CSop::stCurrent);
+ }
+ // Now we wonder about word boundaries..
+ if((flagc==charBOL || (lastc!=charOut && !isWordableChar(lastc))) && (c!=charOut && isWordableChar(c)))
+ flagc=charBOW;
+ if((lastc!=charOut && isWordableChar(lastc)) && (flagc==charEOL || (c!=charOut && !isWordableChar(c))))
+ flagc=charEOW;
+ if(flagc==charBOW || flagc==charEOW){
+ MatchStep(from,to,CSop::stCurrent,flagc,CSop::stCurrent);
+ }
+ // Are we done we all wonder??
+ if(m_Strip[to].m_MatchData&CSop::stCurrent)
+ mp=p;
+ if(MatchStatesEqual(CSop::stCurrent,CSop::stEmpty) || p==end)
+ break; // Again, we're obliged to note break out. We do.
+ // Sucks.. we have to face this character..
+ MatchStatesCopy(CSop::stTemp,CSop::stCurrent);
+ MatchStatesCopy(CSop::stCurrent,CSop::stEmpty);
+ ASSERT(c!=charOut);
+ MatchStep(from,to,CSop::stTemp,c,CSop::stCurrent);
+ MatchStep(from,to,CSop::stCurrent,charNothing,CSop::stCurrent);
+ p++;
+ }
+ return mp;
+}
+
+LPCTSTR CRegEx::MatchDissect(LPCTSTR begin,LPCTSTR end,int from,int to)
+{
+LPCTSTR sp = begin, dp;
+LPCTSTR stp, rest, tail, ssp, oldssp, sep;
+int ssub, esub;
+int es;
+int i;
+ for(int ss=from;ss<to;ss = es){
+ // Identify end of SubRE
+ es = ss;
+ switch(m_Strip[es].m_Operator){
+ case CSop::opPlus0:
+ case CSop::opQuest0:
+ es+=m_Strip[es].m_Operand;
+ break;
+ case CSop::opChoice0:
+ while(m_Strip[es].m_Operator!=CSop::opChoice1)
+ es+=m_Strip[es].m_Operand;
+ break;
+ }
+ es++;
+ // Figure out what it matched
+ switch(m_Strip[ss].m_Operator){
+ case CSop::opEnd:
+ ASSERT(FALSE);
+ break;
+ case CSop::opChar:
+ sp++;
+ break;
+ case CSop::opBOL:
+ case CSop::opEOL:
+ case CSop::opBOW:
+ case CSop::opEOW:
+ break;
+ case CSop::opAny:
+ case CSop::opAnyOf:
+ sp++;
+ break;
+ case CSop::opBackRef0:
+ case CSop::opBackRef1:
+ ASSERT(FALSE);
+ break;
+ // Cases where lenght of match is hard to find..
+ case CSop::opQuest0:
+ stp=end;
+ for(;;){
+ // How long could this one be??
+ rest = MatchSlow(sp,stp,ss,es);
+ ASSERT(rest); // It did match.. It should've..
+ // Could the rest match the rest? (good question)
+ tail = MatchSlow(rest,end,es,to);
+ if(tail==end)
+ break; // yup.
+ // nope, try a shorter match for this one..
+ stp=rest-1;
+ ASSERT(stp>=sp); // It did work.. It should've..
+ }
+ ssub=ss+1;
+ esub=es-1;
+ // Did innards match?
+ if(MatchSlow(sp,rest,ssub,esub)){
+ dp = MatchDissect(sp,rest,ssub,esub);
+ ASSERT(dp==rest);
+ }else // nope..
+ ASSERT(sp==rest);
+ sp = rest;
+ break;
+ case CSop::opPlus0:
+ stp=end;
+ for(;;){
+ // How long could this one be??
+ rest = MatchSlow(sp,stp,ss,es);
+ ASSERT(rest); // It did.. It should've..
+ // Could the rest match the rest?
+ tail = MatchSlow(rest,end,es,to);
+ if(tail==end)
+ break; // yup.
+ // nope..
+ stp=rest-1;
+ ASSERT(stp>=sp); // It should've worked, we believe..
+ }
+ ssub=ss+1;
+ esub=es-1;
+ ssp=sp;
+ oldssp=ssp;
+ for(;;){ // Find last match of innards..
+ sep = MatchSlow(ssp,rest,ssub,esub);
+ if((!sep) || sep==ssp)
+ break; // Failed or matched nothin'
+ oldssp=ssp;
+ ssp=sep;
+ }
+ if(!sep){
+ // Last successfull..
+ sep=ssp;
+ ssp=oldssp;
+ }
+ ASSERT(sep=rest); // Must exhaust substring they say..
+ VERIFY(MatchSlow(ssp,sep,ssub,esub)==rest); // Assert or verify - that is the question..
+ dp = MatchDissect(ssp,sep,ssub,esub);
+ ASSERT(dp==sep);
+ sp=rest;
+ break;
+ case CSop::opChoice0:
+ stp = end;
+ for(;;){
+ // how long..
+ rest = MatchSlow(sp,stp,ss,es);
+ ASSERT(rest);
+ // Could it..
+ tail = MatchSlow(rest,end,es,to);
+ if(tail==end)
+ break; // y
+ // n
+ stp = rest-1;
+ ASSERT(stp>=sp);
+ }
+ ssub=ss+1;
+ esub=ss+m_Strip[ss].m_Operand-1;
+ ASSERT(m_Strip[esub].m_Operator==CSop::opOr0);
+ for(;;){ // Find first matching branch..
+ if(MatchSlow(sp,rest,ssub,esub)==rest)
+ break;
+ // this one missed, try next..
+ ASSERT(m_Strip[esub].m_Operator==CSop::opOr0);
+ esub++;
+ ASSERT(m_Strip[esub].m_Operator==CSop::opOr1);
+ ssub=esub+1;
+ esub+=m_Strip[esub].m_Operand;
+ if(m_Strip[esub].m_Operator==CSop::opOr1)
+ esub--;
+ else
+ ASSERT(m_Strip[esub].m_Operator==CSop::opChoice1);
+ }
+ dp = MatchDissect(sp,rest,ssub,esub);
+ ASSERT(dp==rest);
+ sp=rest;
+ break;
+ case CSop::opPlus1:
+ case CSop::opQuest1:
+ case CSop::opOr0:
+ case CSop::opOr1:
+ case CSop::opChoice1:
+ ASSERT(FALSE);
+ break;
+ case CSop::opLeftParen:
+ i = m_Strip[ss].m_Operand;
+ ASSERT(0<i && i<=m_Subexps);
+ m_Matches[i].m_Begin=sp-m_mBegin;
+ break;
+ case CSop::opRightParen:
+ i = m_Strip[ss].m_Operand;
+ ASSERT(0<i && i<=m_Subexps);
+ m_Matches[i].m_End=sp-m_mBegin;
+ break;
+#ifdef _DEBUG
+ default: // Uh.. oh..
+ ASSERT(FALSE);
+ break;
+#endif
+ }
+ }
+ ASSERT(sp==end);
+ return sp;
+}
+
+LPCTSTR CRegEx::MatchBackRef(LPCTSTR begin,LPCTSTR end,int from,int to,int level)
+{
+LPCTSTR sp = begin;
+BOOL hard = FALSE;
+ // Get as far as we can as long as it's easy
+ for(int ss=from;!hard && ss<to;ss++){
+ CSop s = m_Strip[ss];
+ switch(s.m_Operator){
+ case CSop::opChar:
+ if(sp==end || *sp++ != s.m_Operand)
+ return NULL;
+ break;
+ case CSop::opAny:
+ if(sp==end)
+ return NULL;
+ sp++; // I'm sure this ++ could be embedded in previous expression..
+ break;
+ case CSop::opAnyOf:
+ if(sp==end || !m_Sets[s.m_Operand].IsIn(*sp++))
+ return NULL;
+ break;
+ case CSop::opBOL:
+ if(!((sp==m_mBegin && !(m_mFlags®NotBOL)) || (sp<m_mEnd && *(sp-1)=='\n' && (m_Flags®NewLine))))
+ return NULL;
+ break;
+ case CSop::opEOL:
+ if(!((sp==m_mEnd && !(m_mFlags®NotEOL)) || (sp<m_mEnd && *sp=='\n' && (m_Flags®NewLine))))
+ return NULL;
+ break;
+ case CSop::opBOW:
+ if(!(((sp==m_mBegin && !(m_mFlags®NotBOL)) || (sp<m_mEnd && *(sp-1)=='\n' && (m_Flags®NewLine)) || (sp>m_mBegin && !isWordableChar(*(sp-1)))) && (sp<m_mEnd && isWordableChar(*sp))))
+ return NULL;
+ break;
+ case CSop::opEOW:
+ if(!(((sp==m_mEnd && !(m_mFlags®NotEOL)) || (sp<m_mEnd && *sp=='\n' && (m_Flags®NewLine)) || (sp<m_mEnd && !isWordableChar(*sp))) && (sp>m_mBegin && isWordableChar(*(sp-1)))))
+ return NULL;
+ break;
+ case CSop::opQuest1:
+ break;
+ case CSop::opOr0: // Matches null, but needs to skip
+ ss++;
+ s = m_Strip[ss];
+ do{
+ ASSERT(s.m_Operator==CSop::opOr1);
+ ss+=s.m_Operand;
+ }while((s=m_Strip[ss]).m_Operator!=CSop::opChoice1);
+ // Now we should note that ss++ gets us past the Choice1..
+ break;
+ default:
+ // Have to make a choice..
+ hard=TRUE;
+ break;
+ }
+ }
+ if(!hard){ // That was it..
+ if(sp!=end)
+ return NULL;
+ return sp;
+ }
+ ss--; // Adjust for ther for's final increment..
+ // Hard stuff.. is going on and on..
+CSop s = m_Strip[ss];
+int i, len, offsave;
+int ssub,esub;
+LPCTSTR ssp, dp;
+ switch(s.m_Operator){
+ case CSop::opBackRef0: // The vilest depths they say.. If I only knew what the 'viles' stands for..
+ i = s.m_Operand;
+ ASSERT(0<i && i<=m_Subexps);
+ if(m_Matches[i].m_End<0)
+ return NULL;
+ ASSERT(m_Matches[i].m_Begin>=0);
+ len = m_Matches[i].GetLength();
+ ASSERT((end-m_mBegin)>=len);
+ if(sp>end-len)
+ return NULL; // Not enough left to match..
+ ssp = m_mBegin+m_Matches[i].m_Begin;
+ if(memcmp(sp,ssp,len))
+ return NULL;
+ while(m_Strip[ss]!=CSop(CSop::opBackRef1,i))
+ ss++;
+ return MatchBackRef(sp+len,end,ss+1,to,level-1);
+ break;
+ case CSop::opQuest0: // to null or not they wonder..
+ dp = MatchBackRef(sp,end,ss+1,to,level);
+ if(dp)
+ return dp; // not..
+ return MatchBackRef(sp,end,ss+s.m_Operand+1,to,level-1);
+ break;
+ case CSop::opPlus0:
+ ASSERT(m_mLastPos.GetSize());
+ ASSERT(level+1 <= m_Pluses);
+ m_mLastPos[level+1]=sp;
+ return MatchBackRef(sp,end,ss+1,to,level+1);
+ break;
+ case CSop::opPlus1:
+ if(sp == m_mLastPos[level]) // Last pass matched null
+ return MatchBackRef(sp,end,ss+1,to,level-1);
+ // Try another pass..
+ m_mLastPos[level]=sp;
+ dp = MatchBackRef(sp,end,ss-s.m_Operand+1,to,level);
+ if(dp)
+ return dp;
+ return MatchBackRef(sp,end,ss+1,to,level-1);
+ break;
+ case CSop::opChoice0: // find the right one, ifany
+ ssub = ss+1;
+ esub = ss+s.m_Operand-1;
+ ASSERT(m_Strip[esub].m_Operator==CSop::opOr0);
+ for(;;){ // Find first matching branch.
+ dp = MatchBackRef(sp,end,ssub,esub,level);
+ if(dp)
+ return dp;
+ // This one missed, try next one..
+ if(m_Strip[esub].m_Operator==CSop::opChoice1)
+ return NULL; // There is none..
+ esub++;
+ ASSERT(m_Strip[esub].m_Operator==CSop::opOr1);
+ ssub=esub+1;
+ esub+=m_Strip[esub].m_Operand;
+ if(m_Strip[esub].m_Operator==CSop::opOr1)
+ esub--;
+ else
+ ASSERT(m_Strip[esub].m_Operator==CSop::opChoice1);
+ }
+ break;
+ case CSop::opLeftParen: // Must undo assignment if rest fails..
+ i = s.m_Operand;
+ ASSERT(0<i && i<=m_Subexps);
+ offsave = m_Matches[i].m_Begin;
+ m_Matches[i].m_Begin = sp-m_mBegin;
+ dp = MatchBackRef(sp,end,ss+1,to,level);
+ if(dp)
+ return dp;
+ m_Matches[i].m_Begin=offsave;
+ return NULL;
+ break;
+ case CSop::opRightParen: // Must undo assignment if rest fails..
+ i = s.m_Operand;
+ ASSERT(0<i && i<=m_Subexps);
+ offsave = m_Matches[i].m_End;
+ m_Matches[i].m_End = sp-m_mBegin;
+ dp = MatchBackRef(sp,end,ss+1,to,level);
+ if(dp)
+ return dp;
+ m_Matches[i].m_End = offsave;
+ return NULL;
+ break;
+#ifdef _DEBUG
+ default:
+ ASSERT(FALSE);
+ break;
+#endif
+ }
+ ASSERT(FALSE);
+ return NULL; // Anyway - we can never get here..
+}
+
+#ifdef _DEBUG
+void CRegEx::CSop::Dump(CDumpContext& dc)
+{
+ switch(m_Operator){
+ case opEnd:
+ dc << "end";
+ break;
+ case opChar:
+ dc << "char('" << (char)m_Operand << "')";
+ break;
+ case opBOL:
+ dc << "BOL";
+ break;
+ case opEOL:
+ dc << "EOL";
+ break;
+ case opAny:
+ dc << "any";
+ break;
+ case opAnyOf:
+ dc << "anyOf(" << m_Operand << ")";
+ break;
+ case opBackRef0:
+ dc << "[ backref(" << m_Operand << ")";
+ break;
+ case opBackRef1:
+ dc << "] backref(" << m_Operand << ")";
+ break;
+ case opPlus0:
+ dc << "[ + (" << m_Operand << ")";
+ break;
+ case opPlus1:
+ dc << "] + (" << m_Operand << ")";
+ break;
+ case opQuest0:
+ dc << "[ ? (" << m_Operand << ")";
+ break;
+ case opQuest1:
+ dc << "] ? (" << m_Operand << ")";
+ break;
+ case opLeftParen:
+ dc << "[ ( (" << m_Operand << ")";
+ break;
+ case opRightParen:
+ dc << "] ) (" << m_Operand << ")";
+ break;
+ case opChoice0:
+ dc << "[ choice (" << m_Operand << ")";
+ break;
+ case opOr0:
+ dc << "[ | (" << m_Operand << ")";
+ break;
+ case opOr1:
+ dc << "] | (" << m_Operand << ")";
+ break;
+ case opChoice1:
+ dc << "] choice (" << m_Operand << ")";
+ break;
+ case opBOW:
+ dc << "BOW";
+ break;
+ case opEOW:
+ dc << "EOW";
+ break;
+ }
+}
+void CRegEx::DumpStrip(CDumpContext& dc)
+{
+ for(int tmp=0;tmp<m_Strip.GetSize();tmp++)
+ dc << tmp << ": " << m_Strip[tmp] << ";\n";
+}
+#endif
+
+
+CString CRegEx::GetMatch(int match)
+{
+CString rv;
+ if(!m_Matches.GetSize())
+ return rv;
+ ASSERT(m_Matches[0].m_Begin<m_Input.GetLength() && m_Matches[0].m_End<=m_Input.GetLength());
+ if(match==matchPreMatch)
+ return m_Input.Left(m_Matches[0].m_Begin);
+ if(match==matchPostMatch)
+ return m_Input.Mid(m_Matches[0].m_End);
+ if(match<0 || match >= m_Matches.GetSize())
+ return rv;
+ if(m_Matches[match].m_Begin<0 || m_Matches[match].m_End<0)
+ return rv;
+ ASSERT(m_Matches[match].m_Begin<m_Input.GetLength() && m_Matches[match].m_End<=m_Input.GetLength());
+ rv = m_Input.Mid(m_Matches[match].m_Begin,m_Matches[match].m_End-m_Matches[match].m_Begin);
+ return rv;
+}
|