author | Michael Krelin <hacker@klever.net> | 2007-11-23 22:18:54 (UTC) |
---|---|---|
committer | Michael Krelin <hacker@klever.net> | 2007-11-24 07:09:40 (UTC) |
commit | 70f85314fcd188a182aae3a4291c0cd95ba16ee2 (patch) (side-by-side diff) | |
tree | cad4bf188542d4319280a3c30757e72ec2d20bf6 | |
parent | 2589c69c4a909563098365fba141082db4657353 (diff) | |
download | libopkele-70f85314fcd188a182aae3a4291c0cd95ba16ee2.zip libopkele-70f85314fcd188a182aae3a4291c0cd95ba16ee2.tar.gz libopkele-70f85314fcd188a182aae3a4291c0cd95ba16ee2.tar.bz2 |
added URI normalization procedure to opkele::util
as specified in RFC3986, section 6
Signed-off-by: Michael Krelin <hacker@klever.net>
-rw-r--r-- | include/opkele/util.h | 10 | ||||
-rw-r--r-- | lib/util.cc | 131 | ||||
-rw-r--r-- | test/test.cc | 67 |
3 files changed, 208 insertions, 0 deletions
diff --git a/include/opkele/util.h b/include/opkele/util.h index edc1859..085c9e6 100644 --- a/include/opkele/util.h +++ b/include/opkele/util.h @@ -128,2 +128,12 @@ namespace opkele { void decode_base64(const string& data,vector<unsigned char>& rv); + + /** + * Normalize http(s) URI according to RFC3986, section 6. URI is + * expected to have scheme: in front of it. + * @param uri URI + * @return normalized URI + * @throw not_implemented in case of non-http(s) URI + * @throw bad_input in case of malformed URI + */ + string rfc_3986_normalize_uri(const string& uri); } diff --git a/lib/util.cc b/lib/util.cc index 26be66a..eacf6d7 100644 --- a/lib/util.cc +++ b/lib/util.cc @@ -2,2 +2,3 @@ #include <cassert> +#include <cctype> #include <cstring> @@ -5,2 +6,3 @@ #include <string> +#include <stack> #include <openssl/bio.h> @@ -161,2 +163,131 @@ namespace opkele { + /* + * Normalize URL according to the rules, described in rfc 3986, section 6 + * + * - uppercase hex triplets (e.g. %ab -> %AB) + * - lowercase scheme and host + * - decode %-encoded characters, specified as unreserved in rfc 3986, section 2.3, + * that is - [:alpha:][:digit:]._~- + * - remove dot segments + * - remove empty and default ports + * - if there's no path component, add '/' + */ + string rfc_3986_normalize_uri(const string& uri) { + string rv; + string::size_type colon = uri.find(':'); + if(colon==string::npos) + throw bad_input(OPKELE_CP_ "No scheme specified in URI"); + transform( + uri.begin(), uri.begin()+colon+1, + back_inserter(rv), ::tolower ); + bool s; /* s: false for http, true for https; selects the default port (80 or 443) compared against below */ + if(rv=="http:") + s = false; + else if(rv=="https:") + s = true; + else + throw not_implemented(OPKELE_CP_ "Only http(s) URIs can be normalized here"); + string::size_type ul = uri.length(); + if(ul <= (colon+3)) + throw bad_input(OPKELE_CP_ "Unexpected end of URI being normalized encountered"); + if(uri[colon+1]!='/' || uri[colon+2]!='/') + throw bad_input(OPKELE_CP_ "Unexpected input in URI being normalized after scheme component"); 
+ rv += "//"; + string::size_type interesting = uri.find_first_of(":/#?",colon+3); + if(interesting==string::npos) { + transform( + uri.begin()+colon+3,uri.end(), + back_inserter(rv), ::tolower ); + rv += '/'; return rv; + } + transform( + uri.begin()+colon+3,uri.begin()+interesting, + back_inserter(rv), ::tolower ); + bool qf = false; /* qf: while true, the scan loop below stops only at a percent sign instead of at percent, slash, question mark and hash */ + char ic = uri[interesting]; + if(ic==':') { + string::size_type ni = uri.find_first_of("/#?%",interesting+1); + const char *nptr = uri.data()+interesting+1; + char *eptr = 0; /* NOTE(review): eptr is filled in by strtol but never inspected afterwards, so non-digit characters in the port are not rejected */ + long port = strtol(nptr,&eptr,10); + if( (port>0) && (port<65535) && port!=(s?443:80) ) { /* NOTE(review): port<65535 excludes 65535 itself, so an explicit :65535 is dropped from the result */ + char tmp[6]; + snprintf(tmp,sizeof(tmp),"%d",port); /* NOTE(review): port is long, but %d is the int conversion */ + rv += ':'; rv += tmp; + } + if(ni==string::npos) { + rv += '/'; return rv; + } + interesting = ni; + }else if(ic!='/') { + rv += '/'; rv += ic; + qf = true; + ++interesting; + } + string::size_type n = interesting; + char tmp[3] = { 0,0,0 }; + stack<string::size_type> psegs; psegs.push(rv.length()); + string pseg; + for(;n<ul;) { + string::size_type unsafe = uri.find_first_of(qf?"%":"%/?#",n); + if(unsafe==string::npos) { + pseg.append(uri,n,ul-n-1); n = ul-1; + }else{ + pseg.append(uri,n,unsafe-n); + n = unsafe; + } + char c = uri[n++]; + if(c=='%') { + if((n+1)>=ul) + throw bad_input(OPKELE_CP_ "Unexpected end of URI encountered while parsing percent-encoded character"); + tmp[0] = uri[n++]; + tmp[1] = uri[n++]; + if(!( isxdigit(tmp[0]) && isxdigit(tmp[1]) )) + throw bad_input(OPKELE_CP_ "Invalid percent-encoded character in URI being normalized"); + int cc = strtol(tmp,0,16); + if( isalpha(cc) || isdigit(cc) || strchr("._~-",cc) ) /* NOTE(review): strchr also matches the terminating NUL, so cc==0 (from %00) takes this branch and is appended as a raw NUL byte */ + pseg += cc; + else{ + pseg += '%'; + pseg += toupper(tmp[0]); pseg += toupper(tmp[1]); + } + }else if(qf) { + rv += pseg; rv += c; + pseg.clear(); + }else if(n>=ul || strchr("?/#",c)) { + if(pseg.empty() || pseg==".") { /* NOTE(review): when this is reached via n>=ul, c has not been appended to pseg yet, so a final segment such as a or .a is tested as empty or as a single dot and is dropped */ + }else if(pseg=="..") { + if(psegs.size()>1) { + rv.resize(psegs.top()); psegs.pop(); + } + }else{ + psegs.push(rv.length()); + if(c!='/') { + pseg += c; + qf = 
true; + } + rv += '/'; rv += pseg; + } + if(c=='/' && (n>=ul || strchr("?#",uri[n])) ) { + rv += '/'; + if(n<ul) + qf = true; + }else if(strchr("?#",c)) { + if(psegs.size()==1 && psegs.top()==rv.length()) + rv += '/'; + if(pseg.empty()) + rv += c; + qf = true; + } + pseg.clear(); + }else{ + pseg += c; + } + } + if(!pseg.empty()) { + rv += '/'; rv += pseg; + } + return rv; + } + } diff --git a/test/test.cc b/test/test.cc index f92284c..1a012b5 100644 --- a/test/test.cc +++ b/test/test.cc @@ -5,2 +5,3 @@ using namespace std; #include <opkele/consumer.h> +#include <opkele/util.h> @@ -61,4 +62,70 @@ void test_retrieve_links() { +/* Runs the normalizer on ouri and throws failed_test when it returns although success is false, when the result differs from nuri, or when bad_input or not_implemented is raised although success is true. */ void test_rfc_3986_normalize_uri(const string &ouri,bool success,const string& nuri="") { + try { + string n = opkele::util::rfc_3986_normalize_uri(ouri); + if(!success) + throw failed_test(OPKELE_CP_ "Normalized URI when it shouldn't"); + if(n!=nuri) + throw failed_test(OPKELE_CP_ "rfc_3986_test_failed for '"+ouri+"' failed, expected '"+nuri+"', got '"+n+"'"); + }catch(opkele::bad_input& obi) { + if(success) + throw failed_test(OPKELE_CP_ "Test '"+ouri+"' failed due to 'bad_input'["+obi.what()+"]"); + }catch(opkele::not_implemented& oni) { + if(success) + throw failed_test(OPKELE_CP_ "Test '"+ouri+"' failed due to 'not_implemented'["+oni.what()+"]"); + } +} + +void test_rfc_3986_normalize_uri() { + test_rfc_3986_normalize_uri( + "invalid", false ); + test_rfc_3986_normalize_uri( + "ftp://hacker.klever.net/", false ); + test_rfc_3986_normalize_uri( + "http://", false ); + test_rfc_3986_normalize_uri( + "http:/hacker.klever.net/", false ); + test_rfc_3986_normalize_uri( + "hTTp://hacker.klever.net#uh?oh", true, "http://hacker.klever.net/#uh?oh" ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net?uh#oh", true, "http://hacker.klever.net/?uh#oh" ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net:80/", true, "http://hacker.klever.net/" ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net:80?uh", true, 
"http://hacker.klever.net/?uh" ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net:80#uh", true, "http://hacker.klever.net/#uh" ); + test_rfc_3986_normalize_uri( + "https://hacker.klever.net:443", true, "https://hacker.klever.net/" ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net:?oh", true, "http://hacker.klever.net/?oh" ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net/ah%2E", true, "http://hacker.klever.net/ah." ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net/ah/%2E/", true, "http://hacker.klever.net/ah/" ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net/ah/%2b/", true, "http://hacker.klever.net/ah/%2B/" ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net/ah/./oh?eh", true, "http://hacker.klever.net/ah/oh?eh" ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net/ah/../oh?", true, "http://hacker.klever.net/oh?" ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net/ah//oh?", true, "http://hacker.klever.net/ah/oh?" ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net/ah/?", true, "http://hacker.klever.net/ah/?" ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net/ah/%", false ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net/ah/%a", false ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net/ah/%zx", false ); + test_rfc_3986_normalize_uri( + "http://hacker.klever.net/ah/%5x", false ); + test_rfc_3986_normalize_uri( + "Http://Hacker.Klever.Net:", true, "http://hacker.klever.net/" ); +} + int main() { try { + test_rfc_3986_normalize_uri(); test_retrieve_links(); |