summaryrefslogtreecommitdiffabout
path: root/shared-code/RegEx.cpp
authorMichael Krelin <hacker@klever.net>2012-12-08 21:19:17 (UTC)
committer Michael Krelin <hacker@klever.net>2012-12-11 21:59:29 (UTC)
commit8808689fe340bec6e90ab13dd502292b0579cf1f (patch) (side-by-side diff)
tree45b7c863151341f687b74e40bffcbd7ab5468783 /shared-code/RegEx.cpp
parent6e7e413ca364d79673e523c09767c18e7cff1bec (diff)
downloadpumpkin-osx/0.0.zip
pumpkin-osx/0.0.tar.gz
pumpkin-osx/0.0.tar.bz2
initial osx portosx/0.0
Signed-off-by: Michael Krelin <hacker@klever.net>
Diffstat (limited to 'shared-code/RegEx.cpp') (more/less context) (ignore whitespace changes)
-rw-r--r--shared-code/RegEx.cpp1697
1 files changed, 0 insertions, 1697 deletions
diff --git a/shared-code/RegEx.cpp b/shared-code/RegEx.cpp
deleted file mode 100644
index b7bab62..0000000
--- a/shared-code/RegEx.cpp
+++ b/dev/null
@@ -1,1697 +0,0 @@
-#include "../stdafx.h"
-#include "RegEx.h"
-
-#define isWordableChar(c) (isalnum(c) || (c)=='_')
-
-BOOL CRegEx::Compile(LPCTSTR regex,int flags)
-{
- ASSERT(!((flags&regExtended) && (flags&regLiteral)));
- m_Matches.RemoveAll();
- m_Strip.RemoveAll();
- m_Strip.SetSize(0,15);
- m_Pattern=regex;
- m_ParsePointer=0;
- m_Error=0;
- m_Sets.RemoveAll();
- m_Flags=flags;
- m_iFlags=0;
- m_BOLs=m_EOLs=0;
- m_Subexps = 0;
- m_Categories=1; // 0 is 'everything else'
- m_bBackRefs=FALSE;
- memset(m_Category,0,sizeof(m_Category));
-
- // Go ahead.
- m_Error || m_Strip.Add(CSop(CSop::opEnd));
- if(flags&regExtended){
- ParseERE();
- }else if(flags&regLiteral){
- ParseLiteral();
- }else{
- ParseBRE();
- }
- m_Error || m_Strip.Add(CSop(CSop::opEnd));
- Categorize();
- m_Strip.FreeExtra();
- FigureMust();
- m_Pluses=CountPluses();
- if(m_iFlags&iflagsBad){
- m_Error || (m_Error=regeAssert);
- // ??? point to nuls? ;-)
- }
- // We may wish to free some memory here if we're erroneous (ie. m_Error..)
- m_ParseParens.RemoveAll();
-#ifdef _DEBUG
- if(m_Error){
- CString tmp;
- tmp.Format("RE: ParseError: %d\n",m_Error);
- TRACE0(tmp);
- }
-// DumpStrip(afxDump);
-#endif
- return (m_bCompiled=(!m_Error));
-}
-
-BOOL CRegEx::Match(LPCTSTR src,int flags)
-{
- if(!m_bCompiled)
- return FALSE;
- if(m_iFlags&iflagsBad)
- return FALSE;
- m_Input=src;
- m_mFlags=flags;
- m_mPointer=m_Input;
- m_mBegin=m_Input;
- m_mEnd=&m_mBegin[m_Input.GetLength()];
- ASSERT(m_mPointer<=m_mEnd);
- m_Matches.RemoveAll();
- if(!m_Must.IsEmpty()){
- if(m_Input.Find(m_Must)<0)
- return FALSE;
- }
- // Go ahead..
-int stripLen = m_Strip.GetSize();
- m_mLastPos.SetSize(0);
- for(int tmp=0;tmp<stripLen;tmp++)
- m_Strip[tmp].m_MatchData=0;
-LPCTSTR beginp = m_mBegin;
-LPCTSTR endp;
- for(;;){
- endp = MatchFast(beginp);
- if(!endp)
- return FALSE;
- if((m_mFlags&regNoSubExpressions) && !m_bBackRefs)
- break;
- ASSERT(m_cOldP);
- for(;;){
- endp = MatchSlow(m_cOldP,m_mEnd,1,stripLen-1);
- if(endp)
- break;
- ASSERT(m_cOldP<m_mEnd);
- m_cOldP++;
- }
- if((m_mFlags&regOneMatch) && !m_bBackRefs)
- break;
- // Oh, his, we want the subexpression..
- m_Matches.SetSize(m_Subexps+1);
- LPCTSTR dp;
- if(!m_bBackRefs && !(m_mFlags&regBackRefs)){
- dp = MatchDissect(m_cOldP,endp,1,stripLen-1);
- }else{
- if(m_Pluses>0 && !m_mLastPos.GetSize())
- m_mLastPos.SetSize(m_Pluses);
- dp = MatchBackRef(m_cOldP,endp,1,stripLen-1,0);
- }
- if(dp)
- break;
- // Uh.. oh.. we couldn't find a subexpression-level match
- ASSERT(m_bBackRefs);
- ASSERT(m_Pluses==0 || m_mLastPos.GetSize());
- for(;;){
- if(dp || endp <= m_cOldP)
- break; // defeat.. ?
- endp = MatchSlow(m_cOldP,endp-1,1,stripLen-1);
- if(!endp)
- break; // defeat.. ?
- // Try it on a shorter possibility..
-#ifdef _DEBUG
- for(tmp=1;tmp<=m_Subexps;tmp++)
- ASSERT(m_Matches[tmp].m_Begin<0 && m_Matches[tmp].m_End<0);
-#endif
- dp = MatchBackRef(m_cOldP,endp,1,stripLen-1,0);
- }
- ASSERT((!dp) || dp==endp);
- if(dp) // Found a shorter one..
- break;
- // Despite initial appearances, there is no match here
- beginp = m_cOldP+1;
- ASSERT(beginp<=m_mEnd);
- }
- // Fill in the detail if so requested..
- if(!(m_mFlags&regNoSubExpressions)){
- if(!m_Matches.GetSize())
- m_Matches.SetSize(1);
- m_Matches[0].m_Begin=m_cOldP-m_mBegin;
- m_Matches[0].m_End=endp-m_mBegin;
- }
- m_mLastPos.RemoveAll();
- return TRUE;
-}
-
-CString CRegEx::Replace(LPCTSTR src,LPCTSTR rep,int flags)
-{
- // ***
- return CString();
-}
-
-void CRegEx::ParseERE(int stop)
-{
-UCHAR c;
-BOOL first=TRUE;
-int prevF, prevB;
- for(;;){
- int co = m_Strip.GetSize();
- while(m_ParsePointer < m_Pattern.GetLength() && ((c=m_Pattern[m_ParsePointer])!='|') && c!=stop)
- ParseEREexp();
- if(m_Strip.GetSize()==co){
- m_Error || (m_Error=regeEmpty);
- // ??? point to nuls?
- }
- if(m_ParsePointer>=m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!='|')
- break;
- else
- m_ParsePointer++;
- if(first){
- StripInsert(co,CSop(CSop::opChoice0,m_Strip.GetSize()-co+1));
- prevF = prevB = co;
- first=FALSE;
- }
- m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-prevB));
- prevB = m_Strip.GetSize()-1;
- m_Error || (m_Strip[prevF].m_Operand=m_Strip.GetSize()-prevF);
- prevF = m_Strip.GetSize();
- m_Error || m_Strip.Add(CSop(CSop::opOr1,0)); // offset isn't really right.. very so..
- }
- if(!first){
- m_Error || (m_Strip[prevF].m_Operand=m_Strip.GetSize()-prevF);
- m_Error || m_Strip.Add(CSop(CSop::opChoice1,m_Strip.GetSize()-prevB));
- }
- ASSERT(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer]==stop);
-}
-
-void CRegEx::ParseEREexp()
-{
- ASSERT(m_ParsePointer < m_Pattern.GetLength());
-UCHAR c = m_Pattern[m_ParsePointer++];
-int pos = m_Strip.GetSize();
-int subno;
-int count, count2;
-BOOL wascaret=FALSE;
- switch(c){
- case '(':
- if(!(m_ParsePointer<m_Pattern.GetLength())){
- TRACE0("RE: '(' at the end of the pattern\n");
- if(!m_Error)
- m_Error = regeParen;
- // ??? point to nuls?
- }
- m_Subexps++;
- subno=m_Subexps;
- m_ParseParens.SetAtGrow(m_Subexps,CParenthesis(m_Strip.GetSize()));
- m_Error || m_Strip.Add(CSop(CSop::opLeftParen,subno));
- if(m_ParsePointer>=m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!=')')
- ParseERE(')');
- VERIFY(m_ParseParens[m_Subexps].m_End = m_Strip.GetSize());
- m_Error || m_Strip.Add(CSop(CSop::opRightParen,subno));
- if(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer++]!=')'){
- TRACE0("RE: No matching ')'\n");
- if(!m_Error)
- m_Error = regeParen;
- // ??? point to nuls?
- }
- break;
- case '^':
- m_Error || m_Strip.Add(CSop(CSop::opBOL));
- m_iFlags|=iflagsUseBOL;
- m_BOLs++;
- wascaret=TRUE;
- break;
- case '$':
- m_Error || m_Strip.Add(CSop(CSop::opEOL));
- m_iFlags|=iflagsUseEOL;
- m_EOLs++;
- break;
- case '|':
- TRACE0("RE: '|' outside of expression\n");
- if(!m_Error)
- m_Error = regeEmpty;
- // ??? point to nuls?
- break;
- case '*':
- case '+':
- case '?':
- TRACE0("RE: '*'/'+'/'?' with no previous expression\n");
- if(!m_Error)
- m_Error = regeBadRepeat;
- // ??? point to nuls?
- break;
- case '.':
- if(m_Flags&regNewLine)
- EmitNonNewLineAny();
- else
- m_Error || m_Strip.Add(CSop(CSop::opAny));
- break;
- case '[':
- ParseBracket();
- break;
- case '\\':
- if(m_ParsePointer >= m_Pattern.GetLength()){
- TRACE0("RE: '\\' at the end of the pattern\n");
- if(!m_Error)
- m_Error = regeEscape;
- // ??? point to nuls?
- }else{
- c = m_Pattern[m_ParsePointer++];
- EmitOrdinary(c);
- }
- break;
- case '{':
- if(m_ParsePointer >= m_Pattern.GetLength() || !isdigit(m_Pattern[m_ParsePointer])){
- TRACE0("RE: '{' with no repeat count\n");
- if(!m_Error)
- m_Error = regeBadRepeat;
- // ??? point to nuls?
- }
- // Fallthrough..
- default:
- EmitOrdinary(c);
- break;
- }
- if(m_ParsePointer >= m_Pattern.GetLength())
- return;
- c = m_Pattern[m_ParsePointer];
- // Call a '{' repetition if followed by a digit
- if (!(c=='*' || c=='+' || c=='?' || ( c=='{' && (m_ParsePointer+1) < m_Pattern.GetLength() && isdigit(m_Pattern[m_ParsePointer+1])) ))
- return; // No repetitor - done.
- m_ParsePointer++;
- if(wascaret){
- TRACE0("RE: repetitive '^' detected\n");
- if(!m_Error)
- m_Error = regeBadRepeat;
- // ??? point to nuls?
- }
- switch(c){
- case '*': // Implemeted as +?
- // + expression
- StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1));
- m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos));
- // ? expression
- StripInsert(pos,CSop(CSop::opQuest0,m_Strip.GetSize()-pos+1));
- m_Error || m_Strip.Add(CSop(CSop::opQuest1,m_Strip.GetSize()-pos));
- break;
- case '+':
- // + expression
- StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1));
- m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos));
- break;
- case '?':
- // Kludge - emit y? as (y|) until subtle bug gets fixed :-)
- StripInsert(pos,CSop(CSop::opChoice0,m_Strip.GetSize()-pos+1));
- m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-pos));
- m_Error || (m_Strip[pos].m_Operand=m_Strip.GetSize()-pos);
- m_Error || m_Strip.Add(CSop(CSop::opOr1,1));
- m_Error || m_Strip.Add(CSop(CSop::opChoice1,2));
- break;
- case '{':
- count = ParseCount();
- if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]==','){
- m_ParsePointer++;
- if(isdigit(m_Pattern[m_ParsePointer])){ // HHH Personally, I doubt it is always available
- count2=ParseCount();
- if(!(count<=count2)){
- TRACE0("RE: Disbalanced counts in '{}' repeater\n");
- m_Error || (m_Error=regeBadBrace);
- // ??? point to nuls?
- }
- }else // Single number with comma
- count2=256; // Infinity
- }else // Single number
- count2=count;
- EmitRepeat(pos,count,count2);
- if(m_ParsePointer >= m_Pattern.GetLength() || m_Pattern[m_ParsePointer]!='}'){
- // No '}'..
- TRACE0("RE: No immediately following '}' of '{' expression\n");
- while(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]!='}')
- m_ParsePointer++;
- if(m_ParsePointer >= m_Pattern.GetLength()){
- TRACE0("RE: No closing '}' found\n");
- m_Error || (m_Error=regeBrace);
- }else
- m_Error || (m_Error=regeBadBrace);
- // ??? point to nuls?
- }else
- m_ParsePointer++;
- break;
- }
- if(m_ParsePointer >= m_Pattern.GetLength())
- return;
- c = m_Pattern[m_ParsePointer];
- if(!(c=='*' || c=='+' || c=='?' || (c=='{' && (m_ParsePointer+1)<m_Pattern.GetLength() && isdigit(m_Pattern[m_ParsePointer+1]))))
- return;
- TRACE0("RE: Double repeater\n");
- m_Error || (m_Error=regeBadRepeat);
- // ??? point to nuls?
-}
-
-void CRegEx::StripInsert(int pos,CSop& sop)
-{
- if(m_Error)
- return;
-int sn = m_Strip.GetSize();
- m_Strip.InsertAt(pos,sop);
- for(int tmp=1;tmp<m_ParseParens.GetSize();tmp++){
- if(m_ParseParens[tmp].m_Begin>=pos)
- m_ParseParens[tmp].m_Begin++;
- if(m_ParseParens[tmp].m_End>=pos)
- m_ParseParens[tmp].m_End++;
- }
-}
-
-void CRegEx::EmitOrdinary(UCHAR c)
-{
- if(m_Flags&regIgnoreCase && isalpha(c) && (tolower(c) !=toupper(c))){
- // Emit both cases
- CString savePattern = m_Pattern;
- int savePointer = m_ParsePointer;
- m_Pattern=c;
- m_Pattern+=']';
- m_ParsePointer=0;
- ParseBracket();
- m_Pattern=savePattern;
- m_ParsePointer=savePointer;
- }else{
- m_Error || m_Strip.Add(CSop(CSop::opChar,c));
- if(!m_Category[(BYTE)c])
- m_Category[(BYTE)c]=m_Categories++;
- }
-}
-
-void CRegEx::EmitNonNewLineAny()
-{
- // Kludges're going on and on..
-CString savePattern = m_Pattern;
-int savePointer = m_ParsePointer;
- m_Pattern="^\n]";
- m_ParsePointer=0;
- ParseBracket();
- m_Pattern=savePattern;
- m_ParsePointer=savePointer;
-}
-
-int CRegEx::ParseCount()
-{
-BOOL nonEmpty=FALSE;
-int rv = 0;
-UCHAR c;
- while(m_ParsePointer < m_Pattern.GetLength() && isdigit(c=m_Pattern[m_ParsePointer]) && rv <=255){
- rv = rv*10 + c-'0';
- nonEmpty=TRUE;
- m_ParsePointer++;
- }
- if(rv>255 || !nonEmpty){
- m_Error || (m_Error=regeBadBrace);
- // ??? point to nuls?
- }
- return rv;
-}
-
-void CRegEx::ParseBracket()
-{
- // Dept. of truly sickening special case kludges
- if((m_ParsePointer+5) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,6).Compare("[:<]]")){
- m_Error || m_Strip.Add(CSop(CSop::opBOW));
- m_ParsePointer+=6;
- return;
- }
- if((m_ParsePointer+5) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,6).Compare("[:>]]")){
- m_Error || m_Strip.Add(CSop(CSop::opEOW));
- m_ParsePointer+=6;
- return;
- }
-BOOL invert=TRUE;
- if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='^')
- m_ParsePointer++;
- else
- invert=FALSE;
-CSet cset;
- if(m_ParsePointer < m_Pattern.GetLength()){
- switch(m_Pattern[m_ParsePointer]){
- case ']':
- case '-':
- cset.Add(m_Pattern[m_ParsePointer]);
- m_ParsePointer++;
- break;
- }
- }
- while(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]!=']' && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("-]")))
- ParseBracketTerm(cset);
- if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='-'){
- m_ParsePointer++;
- cset.Add('-');
- }
- if(m_ParsePointer < m_Pattern.GetLength() && m_Pattern[m_ParsePointer]==']')
- m_ParsePointer++;
- else{
- m_Error || (m_Error=regeBracket);
- // ??? point to nuls?
- return;
- }
- if(m_Flags&regIgnoreCase){
- for(int tmp=CSet::size-1;tmp>=0;tmp--){
- if(cset.IsIn(tmp) && isalpha(tmp) && (toupper(tmp)!=tolower(tmp)))
- cset.Add(isupper(tmp)?tolower(tmp):toupper(tmp));
- }
- /*
- if(!cset->m_Multis.IsEmpty())
- cset.CollatingCase();
- */
- }
- if(invert){
- for(int tmp=CSet::size-1;tmp>=0;tmp--)
- if(cset.IsIn(tmp))
- cset.Sub(tmp);
- else
- cset.Add(tmp);
- if(m_Flags&regNewLine)
- cset.Sub('\n');
- /*
- if(!cset.m_Multis.IsEmpty())
- cset.CollatingInvert();
- */
- }
-UCHAR c = cset.GetOnly();
- if(c){
- EmitOrdinary(c);
- }else
- m_Error || m_Strip.Add(CSop(CSop::opAnyOf,StoreSet(cset)));
-}
-
-void CRegEx::CSet::Add(UCHAR c)
-{
- m_Set[(BYTE)c]=TRUE;
- m_Hash+=c;
-}
-
-BOOL CRegEx::CSet::IsIn(UCHAR c)
-{
- return m_Set[(BYTE)c];
-}
-
-void CRegEx::CSet::Sub(UCHAR c)
-{
- m_Set[(BYTE)c]=FALSE;
- m_Hash-=c;
-}
-
-UCHAR CRegEx::CSet::GetOnly()
-{
-int rv = 0;
-UCHAR only = 0;
- for(int tmp=0;tmp<size;tmp++){
- rv+=m_Set[tmp]?(only=tmp,1):0;
- }
- return (rv==1)?only:0;
-}
-
-int CRegEx::StoreSet(CSet& cset)
-{
- for(int tmp=0;tmp<m_Sets.GetSize();tmp++)
- if(m_Sets[tmp]==cset)
- return tmp;
- return m_Sets.Add(cset);
-}
-
-void CRegEx::ParseBracketTerm(CSet& cset)
-{
-UCHAR c;
- switch((m_ParsePointer<m_Pattern.GetLength())?m_Pattern[m_ParsePointer]:0){
- case '[':
- c = ((m_ParsePointer+1)<m_Pattern.GetLength())?m_Pattern[m_ParsePointer+1]:0;
- break;
- case '-':
- m_Error || (m_Error=regeRange);
- // ??? point to nuls?
- return;
- default:
- c = 0;
- break;
- }
- switch(c){
- case ':': // Character class
- m_ParsePointer+=2;
- if(m_ParsePointer>=m_Pattern.GetLength()){
- m_Error || (m_Error=regeBracket);
- // ??? point to nuls?
- }
- c = m_Pattern[m_ParsePointer];
- if(c== '-' || c==']'){
- m_Error || (m_Error=regeCType);
- // ??? point to nuls?
- }
- ParseBracketCClass(cset);
- if(m_ParsePointer>=m_Pattern.GetLength()){
- m_Error || (m_Error=regeBracket);
- // ??? point to nuls?
- }
- if((m_ParsePointer+1)>=m_Pattern.GetLength() || (m_Pattern.Mid(m_ParsePointer,2).Compare(":]"))){
- m_Error || (m_Error=regeCType);
- // ??? point to nuls?
- }else
- m_ParsePointer+=2;
- break;
- case '=': // Equivalence class
- m_ParsePointer+=2;
- if(m_ParsePointer >= m_Pattern.GetLength()){
- m_Error || (m_Error=regeBracket);
- // ??? point to nuls?
- }
- c = m_Pattern[m_ParsePointer];
- if(c== '-' || c==']'){
- m_Error || (m_Error=regeCollate);
- // ??? point to nuls?
- }
- ParseBracketEClass(cset);
- if((m_ParsePointer+1)>=m_Pattern.GetLength() || (m_Pattern.Mid(m_ParsePointer,2).Compare("=]"))){
- m_Error || (m_Error=regeCollate);
- // ??? point to nuls?
- }else
- m_ParsePointer+=2;
- break;
- default: // Symbol, character or range
- {
- UCHAR start, finish;
- start = ParseBracketSymbol();
- if((m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='-') /*&& (m_ParsePointer+1)<m_Pattern.GetLength() && m_Pattern[m_ParsePointer+1]==']'*/){
- // I believe the expression above is seetwo..
- // range.
- m_ParsePointer++;
- if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='-'){
- m_ParsePointer++;
- finish='-';
- }else
- finish=ParseBracketSymbol();
- }else
- finish=start;
- if(((BYTE)start)>((BYTE)finish)){
- m_Error || (m_Error=regeRange);
- // ??? point to nuls?
- }
- for(int tmp=start;tmp<=(BYTE)finish;tmp++)
- cset.Add(tmp);
- }
- break;
- }
-}
-
-void CRegEx::ParseBracketCClass(CSet& cset)
-{
-static struct {
- char *className;
- char *classChars;
-} cc[] = {
- {"alnum","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"},
- {"alpha","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"},
- {"blank"," \t"},
- {"cntrl","\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\25\26\27\30\31\32\33\34\35\36\37\177"},
- {"digit","0123456789"},
- {"graph","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"},
- {"lower","abcdefghijklmnopqrstuvwxyz"},
- {"print","ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ "},
- {"punct","!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"},
- {"space","\t\n\v\f\r "},
- {"upper","ABCDEFGHIJKLMNOPQRSTUVWXYZ"},
- {"xdigit","0123456789ABCDEFabcdef"}
-};
-CString cclass;
-UCHAR c;
- while(m_ParsePointer < m_Pattern.GetLength() && isalpha(c=m_Pattern[m_ParsePointer])){
- cclass+=c;
- m_ParsePointer++;
- }
-char *classChars = NULL;
- for(int tmp=0;tmp<(sizeof(cc)/sizeof(cc[0]));tmp++){
- if(!cclass.CompareNoCase(cc[tmp].className)){
- classChars=cc[tmp].classChars;
- break;
- }
- }
- if(!classChars){
- m_Error || (m_Error=regeCType);
- // ??? point to nuls?
- return;
- }
- while(*classChars)
- cset.Add(*(classChars++));
- // --- multis
-}
-
-void CRegEx::ParseBracketEClass(CSet& cset)
-{
- cset.Add(ParseBracketCollatingElement('='));;
-}
-
-UCHAR CRegEx::ParseBracketCollatingElement(UCHAR term)
-{
-static struct {
- char *entityName;
- char entity;
-} cc[] = { {"NUL",'\0'},{"SOH",'\001'},{"STX",'\002'},{"ETX",'\003'},{"EOT",'\004'},{"ENQ",'\005'},{"ACK",'\006'},{"BEL",'\007'},{"alert",'\007'},{"BS",'\010'},{"backspace",'\b'},{"HT",'\011'},{"tab",'\t'},{"LF",'\012'},{"newline",'\n'},{"VT",'\013'},{"vertical-tab",'\v'},{"FF",'\014'},{"form-feed",'\f'},{"CR",'\015'},{"carriage-return",'\r'},{"SO",'\016'},{"SI",'\017'},{"DLE",'\020'},{"DC1",'\021'},{"DC2",'\022'},{"DC3",'\023'},{"DC4",'\024'},{"NAK",'\025'},{"SYN",'\026'},{"ETB",'\027'},{"CAN",'\030'},{"EM",'\031'},{"SUB",'\032'},{"ESC",'\033'},{"IS4",'\034'},{"FS",'\034'},{"IS3",'\035'},{"GS",'\035'},{"IS2",'\036'},{"RS",'\036'},{"IS1",'\037'},{"US",'\037'},{"space",' '},{"exclamation-mark",'!'},{"quotation-mark",'"'},{"number-sign",'#'},{"dollar-sign",'$'},{"percent-sign",'%'},{"ampersand",'&'},{"apostrophe",'\''},{"left-parenthesis",'('},{"right-parenthesis",')'},{"asterisk",'*'},{"plus-sign",'+'},{"comma",','},{"hyphen",'-'},{"hyphen-minus",'-'},{"period",'.'},{"full-stop",'.'},{"slash",'/'},{"solidus",'/'},{"zero",'0'},{"one",'1'},{"two",'2'},{"three",'3'},{"four",'4'},{"five",'5'},{"six",'6'},{"seven",'7'},{"eight",'8'},{"nine",'9'},{"colon",':'},{"semicolon",';'},{"less-than-sign",'<'},{"equals-sign",'='},{"greater-than-sign",'>'},{"question-mark",'?'},{"commercial-at",'@'},{"left-square-bracket",'['},{"backslash",'\\'},{"reverse-solidus",'\\'},{"right-square-bracket",']'},{"circumflex",'^'},{"circumflex-accent",'^'},{"underscore",'_'},{"low-line",'_'},{"grave-accent",'`'},{"left-brace",'{'},{"left-curly-bracket",'{'},{"vertical-line",'|'},{"right-brace",'}'},{"right-curly-bracket",'}'},{"tilde",'~'},{"DEL",'\177'} };
-CString seeTwo;
- seeTwo=term;
- seeTwo+=']';
-CString entityName;
- while(m_ParsePointer<m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare(seeTwo)))
- entityName+=m_Pattern[m_ParsePointer++];
- if(m_ParsePointer>=m_Pattern.GetLength()){
- m_Error || (m_Error=regeBracket);
- // ??? point to nuls?
- return 0;
- }
- for(int tmp=0;tmp<(sizeof(cc)/sizeof(cc[0]));tmp++)
- if(!entityName.CompareNoCase(cc[tmp].entityName))
- return cc[tmp].entity;
- if(entityName.GetLength()==1)
- return entityName[0];
- m_Error || (m_Error=regeCollate);
- // ??? point to nuls?
- return 0;
-}
-
-UCHAR CRegEx::ParseBracketSymbol()
-{
- if(m_ParsePointer>=m_Pattern.GetLength()){
- m_Error || (m_Error=regeBracket);
- // ??? point to nuls?
- }
- if((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("[."))
- m_ParsePointer+=2;
- else
- return m_Pattern[m_ParsePointer++];
- // Collating symbol
-UCHAR rv = ParseBracketCollatingElement('.');
- if((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("[."))
- m_ParsePointer+=2;
- else{
- m_Error || (m_Error=regeCollate);
- // ??? point to nuls?
- }
- return rv;
-}
-
-void CRegEx::EmitRepeat(int pos,int from,int to)
-{
- if(m_Error)
- return;
- ASSERT(from<=to);
-int finish = m_Strip.GetSize();
-int copy;
-#define N 2
-#define INF 3
-#define REP(f,t) ((f)*8+(t))
-#define MAP(n) (((n)<=1)?(n):((n)==256)?INF:N)
- switch(REP(MAP(from),MAP(to))){
- case REP(0,0): // must be user doing ths??
- m_Strip.SetSize(pos);
- break;
- case REP(0,1): // as in '?'
- case REP(0,N): // as in '{1,n}?'
- case REP(0,INF): // as in '{1,}?'
- // Kludge - emit y? as (y|) until something gets fixed..
- StripInsert(pos,CSop(CSop::opChoice0,pos));
- EmitRepeat(pos+1,1,to);
- m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-pos));
- m_Error || (m_Strip[pos].m_Operand=m_Strip.GetSize()-pos);
- m_Error || m_Strip.Add(CSop(CSop::opOr1,1));
- m_Error || m_Strip.Add(CSop(CSop::opChoice1,2));
- break;
- case REP(1,1):
- break;
- case REP(1,N): // as in 'x?x{1,n-1}'
- // Kludge again..
- StripInsert(pos,CSop(CSop::opChoice0,pos));
- m_Error || m_Strip.Add(CSop(CSop::opOr0,m_Strip.GetSize()-pos));
- m_Error || (m_Strip[pos].m_Operand=m_Strip.GetSize()-pos);
- m_Error || m_Strip.Add(CSop(CSop::opOr1,1));
- m_Error || m_Strip.Add(CSop(CSop::opChoice1,2));
- copy = StripDuplicate(pos+1,finish+1);
- ASSERT(copy==(finish+4));
- EmitRepeat(copy,1,to-1);
- break;
- case REP(1,INF): // as in '+'
- StripInsert(pos,CSop(CSop::opPlus0,pos));
- m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos));
- break;
- case REP(N,N): // as in 'xx{from-1,to-1}'
- copy = StripDuplicate(pos,finish);
- EmitRepeat(copy,from-1,to-1);
- break;
- case REP(N,INF): // as in 'xx{n-1,}'
- copy = StripDuplicate(pos,finish);
- EmitRepeat(copy,from-1,to);
- break;
-#ifndef NDEBUG
- default:
- ASSERT(FALSE);
- break;
-#endif
- }
-#undef MAP
-#undef REP
-#undef INF
-#undef N
-}
-
-int CRegEx::StripDuplicate(int from,int to)
-{
-int rv = m_Strip.GetSize();
- ASSERT(from<=to);
- if(from==to)
- return rv;
- // Maybe should be optimized for copying the whole thing.
- for(int tmp=from;tmp<to;tmp++)
- m_Strip.Add(m_Strip[tmp]);
- return rv;
-}
-
-void CRegEx::Categorize()
-{
- if(m_Error)
- return;
- for(int tmp=0;tmp<(sizeof(m_Category)/sizeof(m_Category[0]));tmp++)
- if((!m_Category[tmp]) && IsInSets(tmp)){
- int cat = m_Categories++;
- m_Category[tmp]=cat;
- for(int c=tmp+1;c<(sizeof(m_Category)/sizeof(m_Category[0]));c++)
- if((!m_Category[c]) && IsInSameSets(tmp,c))
- m_Category[c]=cat;
- }
-}
-
-BOOL CRegEx::IsInSets(UCHAR c)
-{
- for(int tmp=0;tmp<m_Sets.GetSize();tmp++)
- if(m_Sets[tmp].IsIn(c))
- return TRUE;
- return FALSE;
-}
-
-BOOL CRegEx::IsInSameSets(UCHAR c1,UCHAR c2)
-{
- for(int tmp=0;tmp<m_Sets.GetSize();tmp++)
- if(m_Sets[tmp].IsIn(c1)!=m_Sets[tmp].IsIn(c2))
- return FALSE;
- return TRUE;
-}
-
-void CRegEx::FigureMust()
-{
- if(m_Error)
- return;
- m_Must.Empty();
-int stripLen = m_Strip.GetSize();
-int seqStart, seqLength = 0;
-int mustStart, mustLength = 0;
- for(int tmp=1;tmp<stripLen;tmp++){
- switch(m_Strip[tmp].m_Operator){
- case CSop::opChar:
- if(!seqLength)
- seqStart=tmp;
- seqLength++;
- break;
- case CSop::opPlus0:
- case CSop::opLeftParen:
- case CSop::opRightParen:
- break; // Break, since they don't break the sequence
- case CSop::opQuest0:
- case CSop::opChoice0:
- // These ones we skip.
- do{
- tmp+=m_Strip[tmp].m_Operand;
- // I still think it could be ASSERTed..
- if(m_Strip[tmp].m_Operator!=CSop::opQuest1 && m_Strip[tmp].m_Operator!=CSop::opChoice1 && m_Strip[tmp].m_Operator!=CSop::opOr1){
- m_iFlags|=iflagsBad;
- return;
- }
- }while(m_Strip[tmp].m_Operator!=CSop::opQuest1 && m_Strip[tmp].m_Operator!=CSop::opChoice1);
- // Fallthrough..
- default:
- // End of sequence
- if(seqLength>mustLength){
- mustStart=seqStart;
- mustLength=seqLength;
- }
- seqLength=0;
- break;
- }
- } // Hmm.. originally it's meant to be do while not opEnd..
- if(!mustLength)
- return;
- // Turn into string, but, wait, personally I'm sure it could be put in the main loop.. or maybe not..
- for(tmp=0;tmp<seqLength;tmp++){
- while(m_Strip[seqStart+tmp].m_Operator!=CSop::opChar)
- ASSERT(tmp<seqLength);
- m_Must+=m_Strip[tmp].m_Operand;
- }
-}
-
-int CRegEx::CountPluses()
-{
- if(m_Error)
- return 0;
-int stripLen = m_Strip.GetSize();
-int rv = 0;
-int nest = 0;
- for(int tmp=0;tmp<stripLen;tmp++){
- switch(m_Strip[tmp].m_Operator){
- case CSop::opPlus0:
- nest++;
- break;
- case CSop::opPlus1:
- if(nest>rv)
- rv = nest;
- nest--;
- break;
- }
- } // Again, originally we were supposed to scan till opEnd..
- if(nest)
- m_iFlags|=iflagsBad; // Could this be an ASSERTion?
- return rv;
-}
-
-void CRegEx::ParseLiteral()
-{
- if(!m_Pattern.GetLength()){
- m_Error || (m_Error=regeEmpty);
- // ??? point to nuls?
- }
- while(m_ParsePointer < m_Pattern.GetLength())
- EmitOrdinary(m_Pattern[m_ParsePointer++]);
-}
-
-void CRegEx::ParseBRE(int stopa,int stopb)
-{
-int start = m_Strip.GetSize();
-BOOL first=TRUE;
-BOOL wasDollar=FALSE;
- if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='^'){
- m_ParsePointer++;
- m_Error || m_Strip.Add(CSop(CSop::opBOL,0));
- m_iFlags|=iflagsUseBOL;
- m_BOLs++;
- }
-CString stopstr;
- if(stopa){
- stopstr+=stopa;
- if(stopb)
- stopstr+=stopb;
- }
- while(m_ParsePointer < m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare(stopstr))){
- wasDollar = ParseBREexp(first);
- first=FALSE;
- }
- if(wasDollar){ // Trailing anchor that was..
- m_Strip.SetSize(m_Strip.GetSize()-1);
- m_Error || m_Strip.Add(CSop(CSop::opEOL,0));
- m_iFlags|=iflagsUseEOL;
- m_EOLs++;
- }
- if(m_Strip.GetSize()==start){
- m_Error || (m_Error=regeEmpty);
- // ??? point to nuls?
- }
-}
-
-BOOL CRegEx::ParseBREexp(BOOL ordinaryStar)
-{
-int subno;
-int pos = m_Strip.GetSize();
- ASSERT(m_ParsePointer<m_Pattern.GetLength());
-int c = m_Pattern[m_ParsePointer++];
- if(c=='\\'){
- if(m_ParsePointer>=m_Pattern.GetLength()){
- m_Error || (m_Error=regeEscape);
- // ??? point to nuls
- }else
- c = 0x100|m_Pattern[m_ParsePointer++];
- }
- switch(c){
- case '.':
- if(m_Flags&regNewLine)
- EmitNonNewLineAny();
- else
- m_Error || m_Strip.Add(CSop(CSop::opAny,0));
- break;
- case '[':
- ParseBracket();
- break;
- case 0x100|'{':
- m_Error || (m_Error=regeBadRepeat);
- // ??? point to nuls?
- break;
- case 0x100|'(':
- m_Subexps++;
- subno=m_Subexps;
- m_ParseParens.SetAtGrow(m_Subexps,CParenthesis(m_Strip.GetSize()));
- m_Error || m_Strip.Add(CSop(CSop::opLeftParen,subno));
- if(m_ParsePointer<m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\)")))
- ParseBRE('\\',')');
- VERIFY(m_ParseParens[m_Subexps].m_End = m_Strip.GetSize());
- m_Error || m_Strip.Add(CSop(CSop::opRightParen,subno));
- if((m_ParsePointer+1) < m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\)"))
- m_ParsePointer+=2;
- else{
- m_Error || (m_Error=regeParen);
- // ??? point to nuls?
- }
- break;
- case 0x100|')':
- case 0x100|'}':
- // Can this possibly happen?!
- m_Error || (m_Error=regeParen);
- // ??? point to nuls?
- break;
- case 0x100|'1':
- case 0x100|'2':
- case 0x100|'3':
- case 0x100|'4':
- case 0x100|'5':
- case 0x100|'6':
- case 0x100|'7':
- case 0x100|'8':
- case 0x100|'9':
- {
- int i = (c&0xFF)-'0';
- if(i < m_ParseParens.GetSize() && m_ParseParens[i].m_End){
- m_Error || m_Strip.Add(CSop(CSop::opBackRef0,i));
- ASSERT(m_ParseParens[i].m_Begin);
- ASSERT(m_Strip[m_ParseParens[i].m_Begin].m_Operator==CSop::opLeftParen);
- ASSERT(m_Strip[m_ParseParens[i].m_End].m_Operator==CSop::opRightParen);
- StripDuplicate(m_ParseParens[i].m_Begin+1,m_ParseParens[i].m_End);
- m_Error || m_Strip.Add(CSop(CSop::opBackRef1,i));
- }else{
- m_Error || (m_Error=regeSubReg);
- // ??? point to nuls?
- }
- m_bBackRefs=TRUE;
- }
- break;
- case '*':
- if(!ordinaryStar){
- m_Error || (m_Error=regeBadRepeat);
- // ??? point to nuls?
- }
- // Fallthrough..
- default:
- EmitOrdinary(c&0xFF);
- break;
- }
- if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]=='*'){
- m_ParsePointer++;
- // as in '+?'
- StripInsert(pos,CSop(CSop::opPlus0,m_Strip.GetSize()-pos+1));
- m_Error || m_Strip.Add(CSop(CSop::opPlus1,m_Strip.GetSize()-pos));
- StripInsert(pos,CSop(CSop::opQuest0,m_Strip.GetSize()-pos+1));
- m_Error || m_Strip.Add(CSop(CSop::opQuest1,m_Strip.GetSize()-pos));
- }else if ((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\{")){
- m_ParsePointer+=2;
- int count = ParseCount();
- int count2;
- if(m_ParsePointer<m_Pattern.GetLength() && m_Pattern[m_ParsePointer]==','){
- m_ParsePointer++;
- if(m_ParsePointer<m_Pattern.GetLength() && isdigit(m_Pattern[m_ParsePointer])){
- count2=ParseCount();
- if(count>count2){
- m_Error || (m_Error=regeBadBrace);
- // ??? poin to nuls?
- }
- }else // Single number with comma
- count2=256;
- }else // Single number
- count2=count;
- EmitRepeat(pos,count,count2);
- if((m_ParsePointer+1)>=m_Pattern.GetLength() || m_Pattern.Mid(m_ParsePointer,2).Compare("\\}")){
- while(m_ParsePointer<m_Pattern.GetLength() && !((m_ParsePointer+1)<m_Pattern.GetLength() && !m_Pattern.Mid(m_ParsePointer,2).Compare("\\}")))
- m_ParsePointer++;
- if(m_ParsePointer>=m_Pattern.GetLength()){
- m_Error || (m_Error=regeBrace);
- // ??? point to nuls?
- }
- m_Error || (m_Error=regeBadBrace);
- }else
- m_ParsePointer+=2;
- }else if(c=='$')
- return TRUE;
- return FALSE;
-}
-
-CRegEx::CRegEx()
-{
- m_bCompiled=FALSE;
-}
-
-LPCTSTR CRegEx::MatchFast(LPCTSTR begin)
-{
- MatchStatesClear(CSop::stCurrent);
- m_Strip[1].m_MatchData|=CSop::stCurrent;
-int stripLen = m_Strip.GetSize();
- MatchStep(1,stripLen-1,CSop::stCurrent,charNothing,CSop::stCurrent);
- MatchStatesCopy(CSop::stFresh,CSop::stCurrent);
-LPCTSTR coldp = NULL;
-LPCTSTR p = begin;
-int c = (begin==m_mBegin)?charOut:((int)(BYTE)m_mPointer[-1]);
- for(;;){
- // next character..
- int lastc = c;
- c = (p==m_mEnd)?charOut:(int)*(BYTE*)p;
- if(MatchStatesEqual(CSop::stCurrent,CSop::stFresh))
- coldp=p;
- // Is there an EOL and/or BOL between lastc and c? - they ask..
- int flagc=0;
- int i = 0;
- if((lastc=='\n' && m_Flags&regNewLine) || (lastc==charOut && !(m_mFlags&regNotBOL))){
- flagc=charBOL;
- i=m_BOLs;
- }
- if((c=='\n' && m_Flags&regNewLine) || (c==charOut && !(m_mFlags&regNotEOL))){
- flagc=(flagc==charBOL)?charBOLEOL:charEOL;
- i+=m_EOLs;
- }
- if(i){
- for(;i>0;i--)
- MatchStep(1,stripLen-1,CSop::stCurrent,flagc,CSop::stCurrent);
- }
- // What about a word boundary? - they wonder..
- if((flagc==charBOL || (lastc!=charOut && !isWordableChar(c))) && (c!=charOut && isWordableChar(c)))
- flagc = charBOW;
- if((lastc!=charOut && isWordableChar(lastc)) && (flagc==charEOL || (c!=charOut && !isWordableChar(c))))
- flagc = charEOW;
- if(flagc==charBOW || flagc==charEOW){
- MatchStep(1,stripLen-1,CSop::stCurrent,flagc,CSop::stCurrent);
- }
- // Are we done? Now WE wonder..
- if((m_Strip[stripLen-1].m_MatchData&CSop::stCurrent) || p==m_mEnd)
- break; // They insist I need to note break out.. Okay, I do..
- // Nope, we're not done. We have to face this character..
- MatchStatesCopy(CSop::stTemp,CSop::stCurrent);
- MatchStatesCopy(CSop::stCurrent,CSop::stFresh);
- ASSERT(c!=charOut);
- MatchStep(1,stripLen-1,CSop::stTemp,c,CSop::stCurrent);
- p++;
- }
- ASSERT(coldp);
- m_cOldP=coldp; // *** I believe this variable can be changed 'in-place'
- if(m_Strip[stripLen-1].m_MatchData&CSop::stCurrent)
- return &p[1];
- else
- return NULL;
-}
-
-void CRegEx::MatchStatesClear(BYTE mask)
-{
-int stripLen = m_Strip.GetSize();
- for(int tmp=0;tmp<stripLen;tmp++)
- m_Strip[tmp].m_MatchData&=~mask;
-}
-
-void CRegEx::MatchStep(int from,int to,BYTE maskBefore,int charCode,BYTE maskAfter)
-{
-BOOL i;
-int look;
-int here = from;
- for(int pc=from;pc!=to;pc++,here++){
- CSop s=m_Strip[pc];
- switch(s.m_Operator){
- case CSop::opEnd:
- ASSERT(pc==(to-1));
- break;
- case CSop::opChar:
- // Only characters can match..
- ASSERT((charCode<charOut) || charCode!=s.m_Operand);
- if(charCode==s.m_Operand)
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
- break;
- case CSop::opBOL:
- if(charCode==charBOL || charCode==charBOLEOL)
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
- break;
- case CSop::opEOL:
- if(charCode==charEOL || charCode==charBOLEOL)
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
- break;
- case CSop::opBOW:
- if(charCode==charBOW)
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
- break;
- case CSop::opEOW:
- if(charCode==charEOW)
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
- break;
- case CSop::opAny:
- if(charCode<charOut)
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
- break;
- case CSop::opAnyOf:
- if(charCode<charOut && m_Sets[s.m_Operand].m_Set[charCode])
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskBefore)?maskAfter:0;
- break;
- case CSop::opBackRef0: // Ignored here..
- case CSop::opBackRef1:
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
- break;
- case CSop::opPlus0:
- // Forward, this is just an empty, comment says..
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
- break;
- case CSop::opPlus1:
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
- i =(m_Strip[here-s.m_Operand].m_MatchData&maskAfter)!=0;
- m_Strip[here-s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
- if(!i && (m_Strip[here-s.m_Operand].m_MatchData&maskAfter)){
- // oho, must reconsider loop body, comment says.. what's so 'oho' about it?
- pc-=s.m_Operand+1;
- here=pc;
- }
- break;
- case CSop::opQuest0:
- // two branches, both forward..
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
- m_Strip[here+s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
- break;
- case CSop::opQuest1:
- // just an empty.. aren't we tired of justanempties?
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
- break;
- case CSop::opLeftParen: // they say it's not significan there..
- case CSop::opRightParen:
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
- break;
- case CSop::opChoice0: // mark the first two branches..
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
- ASSERT(m_Strip[pc+s.m_Operand].m_Operator==CSop::opOr1);
- m_Strip[here+s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
- break;
- case CSop::opOr0: // done a branch, find the end of choice..
- if(m_Strip[here].m_MatchData&maskAfter){
- for(look=1;(s=m_Strip[pc+look]).m_Operator!=CSop::opChoice1;look+=s.m_Operand)
- ASSERT(s.m_Operator==CSop::opOr1);
- m_Strip[here+look].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
- }
- break;
- case CSop::opOr1: // Propagate Choice's marking..
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
- if(m_Strip[pc+s.m_Operand].m_Operator!=CSop::opChoice1){
- ASSERT(m_Strip[pc+s.m_Operand].m_Operator==CSop::opOr1);
- m_Strip[here+s.m_Operand].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
- }
- break;
- case CSop::opChoice1: // Just empty.. :-)..
- m_Strip[here+1].m_MatchData|=(m_Strip[here].m_MatchData&maskAfter)?maskAfter:0;
- break;
-#ifdef _DEBUG
- default:
- ASSERT(FALSE);
- break;
-#endif
- }
- }
-}
-
-void CRegEx::MatchStatesCopy(BYTE dst,BYTE src)
-{
-int stripLen = m_Strip.GetSize();
- for(int tmp=0;tmp<stripLen;tmp++){
- // I believe this can be optimized, easily..
- m_Strip[tmp].m_MatchData&=~dst;
- m_Strip[tmp].m_MatchData|=(m_Strip[tmp].m_MatchData&src)?dst:0;
- }
-}
-
-BOOL CRegEx::MatchStatesEqual(BYTE m1,BYTE m2)
-{
-int stripLen = m_Strip.GetSize();
- for(int tmp=0;tmp<stripLen;tmp++){
- BYTE mm = m_Strip[tmp].m_MatchData;
- if(((mm&m1) && (mm&m2)) || !(mm&(m1|m2)))
- continue;
- return FALSE;
- }
- return TRUE;
-}
-
-LPCTSTR CRegEx::MatchSlow(LPCTSTR begin,LPCTSTR end,int from,int to)
-{
- MatchStatesClear(CSop::stCurrent);
- m_Strip[from].m_MatchData|=CSop::stCurrent;
- MatchStep(from,to,CSop::stCurrent,charNothing,CSop::stCurrent);
-LPCTSTR mp = NULL;
-int c = (m_mBegin==m_mPointer)?charOut:(int)(BYTE)begin[-1];
-LPCTSTR p = begin;
- for(;;){
- // next character..
- int lastc = c;
- c = (p==m_mEnd)?charOut:(int)*(BYTE*)p;
- // Now we start to wonder if there is an EOL and/or BOL between lastc and c
- int flagc = 0;
- int i = 0;
- if((lastc=='\n' && m_Flags&regNewLine) || (lastc==charOut && !(m_mFlags&regNotBOL))){
- flagc = charBOL;
- i = m_BOLs;
- }
- if((c=='\n' && m_Flags&regNewLine) || (c==charOut && !(m_mFlags&regNotEOL))){
- flagc = (flagc==charBOL)?charBOLEOL:charEOL;
- i+=m_EOLs;
- }
- if(i){
- for(;i>0;i--)
- MatchStep(from,to,CSop::stCurrent,flagc,CSop::stCurrent);
- }
- // Now we wonder about word boundaries..
- if((flagc==charBOL || (lastc!=charOut && !isWordableChar(lastc))) && (c!=charOut && isWordableChar(c)))
- flagc=charBOW;
- if((lastc!=charOut && isWordableChar(lastc)) && (flagc==charEOL || (c!=charOut && !isWordableChar(c))))
- flagc=charEOW;
- if(flagc==charBOW || flagc==charEOW){
- MatchStep(from,to,CSop::stCurrent,flagc,CSop::stCurrent);
- }
- // Are we done we all wonder??
- if(m_Strip[to].m_MatchData&CSop::stCurrent)
- mp=p;
- if(MatchStatesEqual(CSop::stCurrent,CSop::stEmpty) || p==end)
- break; // Again, we're obliged to note break out. We do.
- // Sucks.. we have to face this character..
- MatchStatesCopy(CSop::stTemp,CSop::stCurrent);
- MatchStatesCopy(CSop::stCurrent,CSop::stEmpty);
- ASSERT(c!=charOut);
- MatchStep(from,to,CSop::stTemp,c,CSop::stCurrent);
- MatchStep(from,to,CSop::stCurrent,charNothing,CSop::stCurrent);
- p++;
- }
- return mp;
-}
-
-LPCTSTR CRegEx::MatchDissect(LPCTSTR begin,LPCTSTR end,int from,int to)
-{
-LPCTSTR sp = begin, dp;
-LPCTSTR stp, rest, tail, ssp, oldssp, sep;
-int ssub, esub;
-int es;
-int i;
- for(int ss=from;ss<to;ss = es){
- // Identify end of SubRE
- es = ss;
- switch(m_Strip[es].m_Operator){
- case CSop::opPlus0:
- case CSop::opQuest0:
- es+=m_Strip[es].m_Operand;
- break;
- case CSop::opChoice0:
- while(m_Strip[es].m_Operator!=CSop::opChoice1)
- es+=m_Strip[es].m_Operand;
- break;
- }
- es++;
- // Figure out what it matched
- switch(m_Strip[ss].m_Operator){
- case CSop::opEnd:
- ASSERT(FALSE);
- break;
- case CSop::opChar:
- sp++;
- break;
- case CSop::opBOL:
- case CSop::opEOL:
- case CSop::opBOW:
- case CSop::opEOW:
- break;
- case CSop::opAny:
- case CSop::opAnyOf:
- sp++;
- break;
- case CSop::opBackRef0:
- case CSop::opBackRef1:
- ASSERT(FALSE);
- break;
- // Cases where lenght of match is hard to find..
- case CSop::opQuest0:
- stp=end;
- for(;;){
- // How long could this one be??
- rest = MatchSlow(sp,stp,ss,es);
- ASSERT(rest); // It did match.. It should've..
- // Could the rest match the rest? (good question)
- tail = MatchSlow(rest,end,es,to);
- if(tail==end)
- break; // yup.
- // nope, try a shorter match for this one..
- stp=rest-1;
- ASSERT(stp>=sp); // It did work.. It should've..
- }
- ssub=ss+1;
- esub=es-1;
- // Did innards match?
- if(MatchSlow(sp,rest,ssub,esub)){
- dp = MatchDissect(sp,rest,ssub,esub);
- ASSERT(dp==rest);
- }else // nope..
- ASSERT(sp==rest);
- sp = rest;
- break;
- case CSop::opPlus0:
- stp=end;
- for(;;){
- // How long could this one be??
- rest = MatchSlow(sp,stp,ss,es);
- ASSERT(rest); // It did.. It should've..
- // Could the rest match the rest?
- tail = MatchSlow(rest,end,es,to);
- if(tail==end)
- break; // yup.
- // nope..
- stp=rest-1;
- ASSERT(stp>=sp); // It should've worked, we believe..
- }
- ssub=ss+1;
- esub=es-1;
- ssp=sp;
- oldssp=ssp;
- for(;;){ // Find last match of innards..
- sep = MatchSlow(ssp,rest,ssub,esub);
- if((!sep) || sep==ssp)
- break; // Failed or matched nothin'
- oldssp=ssp;
- ssp=sep;
- }
- if(!sep){
- // Last successfull..
- sep=ssp;
- ssp=oldssp;
- }
- ASSERT(sep=rest); // Must exhaust substring they say..
- VERIFY(MatchSlow(ssp,sep,ssub,esub)==rest); // Assert or verify - that is the question..
- dp = MatchDissect(ssp,sep,ssub,esub);
- ASSERT(dp==sep);
- sp=rest;
- break;
- case CSop::opChoice0:
- stp = end;
- for(;;){
- // how long..
- rest = MatchSlow(sp,stp,ss,es);
- ASSERT(rest);
- // Could it..
- tail = MatchSlow(rest,end,es,to);
- if(tail==end)
- break; // y
- // n
- stp = rest-1;
- ASSERT(stp>=sp);
- }
- ssub=ss+1;
- esub=ss+m_Strip[ss].m_Operand-1;
- ASSERT(m_Strip[esub].m_Operator==CSop::opOr0);
- for(;;){ // Find first matching branch..
- if(MatchSlow(sp,rest,ssub,esub)==rest)
- break;
- // this one missed, try next..
- ASSERT(m_Strip[esub].m_Operator==CSop::opOr0);
- esub++;
- ASSERT(m_Strip[esub].m_Operator==CSop::opOr1);
- ssub=esub+1;
- esub+=m_Strip[esub].m_Operand;
- if(m_Strip[esub].m_Operator==CSop::opOr1)
- esub--;
- else
- ASSERT(m_Strip[esub].m_Operator==CSop::opChoice1);
- }
- dp = MatchDissect(sp,rest,ssub,esub);
- ASSERT(dp==rest);
- sp=rest;
- break;
- case CSop::opPlus1:
- case CSop::opQuest1:
- case CSop::opOr0:
- case CSop::opOr1:
- case CSop::opChoice1:
- ASSERT(FALSE);
- break;
- case CSop::opLeftParen:
- i = m_Strip[ss].m_Operand;
- ASSERT(0<i && i<=m_Subexps);
- m_Matches[i].m_Begin=sp-m_mBegin;
- break;
- case CSop::opRightParen:
- i = m_Strip[ss].m_Operand;
- ASSERT(0<i && i<=m_Subexps);
- m_Matches[i].m_End=sp-m_mBegin;
- break;
-#ifdef _DEBUG
- default: // Uh.. oh..
- ASSERT(FALSE);
- break;
-#endif
- }
- }
- ASSERT(sp==end);
- return sp;
-}
-
-LPCTSTR CRegEx::MatchBackRef(LPCTSTR begin,LPCTSTR end,int from,int to,int level)
-{
-LPCTSTR sp = begin;
-BOOL hard = FALSE;
- // Get as far as we can as long as it's easy
- for(int ss=from;!hard && ss<to;ss++){
- CSop s = m_Strip[ss];
- switch(s.m_Operator){
- case CSop::opChar:
- if(sp==end || *sp++ != s.m_Operand)
- return NULL;
- break;
- case CSop::opAny:
- if(sp==end)
- return NULL;
- sp++; // I'm sure this ++ could be embedded in previous expression..
- break;
- case CSop::opAnyOf:
- if(sp==end || !m_Sets[s.m_Operand].IsIn(*sp++))
- return NULL;
- break;
- case CSop::opBOL:
- if(!((sp==m_mBegin && !(m_mFlags&regNotBOL)) || (sp<m_mEnd && *(sp-1)=='\n' && (m_Flags&regNewLine))))
- return NULL;
- break;
- case CSop::opEOL:
- if(!((sp==m_mEnd && !(m_mFlags&regNotEOL)) || (sp<m_mEnd && *sp=='\n' && (m_Flags&regNewLine))))
- return NULL;
- break;
- case CSop::opBOW:
- if(!(((sp==m_mBegin && !(m_mFlags&regNotBOL)) || (sp<m_mEnd && *(sp-1)=='\n' && (m_Flags&regNewLine)) || (sp>m_mBegin && !isWordableChar(*(sp-1)))) && (sp<m_mEnd && isWordableChar(*sp))))
- return NULL;
- break;
- case CSop::opEOW:
- if(!(((sp==m_mEnd && !(m_mFlags&regNotEOL)) || (sp<m_mEnd && *sp=='\n' && (m_Flags&regNewLine)) || (sp<m_mEnd && !isWordableChar(*sp))) && (sp>m_mBegin && isWordableChar(*(sp-1)))))
- return NULL;
- break;
- case CSop::opQuest1:
- break;
- case CSop::opOr0: // Matches null, but needs to skip
- ss++;
- s = m_Strip[ss];
- do{
- ASSERT(s.m_Operator==CSop::opOr1);
- ss+=s.m_Operand;
- }while((s=m_Strip[ss]).m_Operator!=CSop::opChoice1);
- // Now we should note that ss++ gets us past the Choice1..
- break;
- default:
- // Have to make a choice..
- hard=TRUE;
- break;
- }
- }
- if(!hard){ // That was it..
- if(sp!=end)
- return NULL;
- return sp;
- }
- ss--; // Adjust for ther for's final increment..
- // Hard stuff.. is going on and on..
-CSop s = m_Strip[ss];
-int i, len, offsave;
-int ssub,esub;
-LPCTSTR ssp, dp;
- switch(s.m_Operator){
- case CSop::opBackRef0: // The vilest depths they say.. If I only knew what the 'viles' stands for..
- i = s.m_Operand;
- ASSERT(0<i && i<=m_Subexps);
- if(m_Matches[i].m_End<0)
- return NULL;
- ASSERT(m_Matches[i].m_Begin>=0);
- len = m_Matches[i].GetLength();
- ASSERT((end-m_mBegin)>=len);
- if(sp>end-len)
- return NULL; // Not enough left to match..
- ssp = m_mBegin+m_Matches[i].m_Begin;
- if(memcmp(sp,ssp,len))
- return NULL;
- while(m_Strip[ss]!=CSop(CSop::opBackRef1,i))
- ss++;
- return MatchBackRef(sp+len,end,ss+1,to,level-1);
- break;
- case CSop::opQuest0: // to null or not they wonder..
- dp = MatchBackRef(sp,end,ss+1,to,level);
- if(dp)
- return dp; // not..
- return MatchBackRef(sp,end,ss+s.m_Operand+1,to,level-1);
- break;
- case CSop::opPlus0:
- ASSERT(m_mLastPos.GetSize());
- ASSERT(level+1 <= m_Pluses);
- m_mLastPos[level+1]=sp;
- return MatchBackRef(sp,end,ss+1,to,level+1);
- break;
- case CSop::opPlus1:
- if(sp == m_mLastPos[level]) // Last pass matched null
- return MatchBackRef(sp,end,ss+1,to,level-1);
- // Try another pass..
- m_mLastPos[level]=sp;
- dp = MatchBackRef(sp,end,ss-s.m_Operand+1,to,level);
- if(dp)
- return dp;
- return MatchBackRef(sp,end,ss+1,to,level-1);
- break;
- case CSop::opChoice0: // find the right one, ifany
- ssub = ss+1;
- esub = ss+s.m_Operand-1;
- ASSERT(m_Strip[esub].m_Operator==CSop::opOr0);
- for(;;){ // Find first matching branch.
- dp = MatchBackRef(sp,end,ssub,esub,level);
- if(dp)
- return dp;
- // This one missed, try next one..
- if(m_Strip[esub].m_Operator==CSop::opChoice1)
- return NULL; // There is none..
- esub++;
- ASSERT(m_Strip[esub].m_Operator==CSop::opOr1);
- ssub=esub+1;
- esub+=m_Strip[esub].m_Operand;
- if(m_Strip[esub].m_Operator==CSop::opOr1)
- esub--;
- else
- ASSERT(m_Strip[esub].m_Operator==CSop::opChoice1);
- }
- break;
- case CSop::opLeftParen: // Must undo assignment if rest fails..
- i = s.m_Operand;
- ASSERT(0<i && i<=m_Subexps);
- offsave = m_Matches[i].m_Begin;
- m_Matches[i].m_Begin = sp-m_mBegin;
- dp = MatchBackRef(sp,end,ss+1,to,level);
- if(dp)
- return dp;
- m_Matches[i].m_Begin=offsave;
- return NULL;
- break;
- case CSop::opRightParen: // Must undo assignment if rest fails..
- i = s.m_Operand;
- ASSERT(0<i && i<=m_Subexps);
- offsave = m_Matches[i].m_End;
- m_Matches[i].m_End = sp-m_mBegin;
- dp = MatchBackRef(sp,end,ss+1,to,level);
- if(dp)
- return dp;
- m_Matches[i].m_End = offsave;
- return NULL;
- break;
-#ifdef _DEBUG
- default:
- ASSERT(FALSE);
- break;
-#endif
- }
- ASSERT(FALSE);
- return NULL; // Anyway - we can never get here..
-}
-
-#ifdef _DEBUG
-void CRegEx::CSop::Dump(CDumpContext& dc)
-{
- switch(m_Operator){
- case opEnd:
- dc << "end";
- break;
- case opChar:
- dc << "char('" << (char)m_Operand << "')";
- break;
- case opBOL:
- dc << "BOL";
- break;
- case opEOL:
- dc << "EOL";
- break;
- case opAny:
- dc << "any";
- break;
- case opAnyOf:
- dc << "anyOf(" << m_Operand << ")";
- break;
- case opBackRef0:
- dc << "[ backref(" << m_Operand << ")";
- break;
- case opBackRef1:
- dc << "] backref(" << m_Operand << ")";
- break;
- case opPlus0:
- dc << "[ + (" << m_Operand << ")";
- break;
- case opPlus1:
- dc << "] + (" << m_Operand << ")";
- break;
- case opQuest0:
- dc << "[ ? (" << m_Operand << ")";
- break;
- case opQuest1:
- dc << "] ? (" << m_Operand << ")";
- break;
- case opLeftParen:
- dc << "[ ( (" << m_Operand << ")";
- break;
- case opRightParen:
- dc << "] ) (" << m_Operand << ")";
- break;
- case opChoice0:
- dc << "[ choice (" << m_Operand << ")";
- break;
- case opOr0:
- dc << "[ | (" << m_Operand << ")";
- break;
- case opOr1:
- dc << "] | (" << m_Operand << ")";
- break;
- case opChoice1:
- dc << "] choice (" << m_Operand << ")";
- break;
- case opBOW:
- dc << "BOW";
- break;
- case opEOW:
- dc << "EOW";
- break;
- }
-}
-void CRegEx::DumpStrip(CDumpContext& dc)
-{
- for(int tmp=0;tmp<m_Strip.GetSize();tmp++)
- dc << tmp << ": " << m_Strip[tmp] << ";\n";
-}
-#endif
-
-
-CString CRegEx::GetMatch(int match)
-{
-CString rv;
- if(!m_Matches.GetSize())
- return rv;
- ASSERT(m_Matches[0].m_Begin<m_Input.GetLength() && m_Matches[0].m_End<=m_Input.GetLength());
- if(match==matchPreMatch)
- return m_Input.Left(m_Matches[0].m_Begin);
- if(match==matchPostMatch)
- return m_Input.Mid(m_Matches[0].m_End);
- if(match<0 || match >= m_Matches.GetSize())
- return rv;
- if(m_Matches[match].m_Begin<0 || m_Matches[match].m_End<0)
- return rv;
- ASSERT(m_Matches[match].m_Begin<m_Input.GetLength() && m_Matches[match].m_End<=m_Input.GetLength());
- rv = m_Input.Mid(m_Matches[match].m_Begin,m_Matches[match].m_End-m_Matches[match].m_Begin);
- return rv;
-}