3 files changed, 49 insertions, 12 deletions
diff --git a/lib/data.cc b/lib/data.cc
index c040430..f71788f 100644
--- a/lib/data.cc
+++ b/lib/data.cc
@@ -1,11 +1,14 @@
 #include <opkele/data.h>
 
 namespace opkele {
 
     namespace data {
 
 	const char *_default_p = "155172898181473697471232257763715539915724801966915404479707795314057629378541917580651227423698188993727816152646631438561595825688188889951272158842675419950341258706556549803580104870537681476726513255747040765857479291291572334510643245094715007229621094194349783925984760375594985848253359305585439638443";
 	const char *_default_g = "2";
 
+	const char *_iname_leaders = "=@+$!("; /* an identifier whose first character is in this set is handled as an XRI */
+	const char *_whitespace_chars = " \t\r\n"; /* characters treated as whitespace when trimming/splitting input */
+
     }
 }
diff --git a/lib/discovery.cc b/lib/discovery.cc
index b7f2db6..5913ad4 100644
--- a/lib/discovery.cc
+++ b/lib/discovery.cc
@@ -1,49 +1,48 @@
 #include <list>
 #include <opkele/curl.h>
 #include <opkele/expat.h>
 #include <opkele/uris.h>
 #include <opkele/discovery.h>
 #include <opkele/exception.h>
 #include <opkele/util.h>
 #include <opkele/tidy.h>
+#include <opkele/data.h>
 #include <opkele/debug.h>
 
 #include "config.h"
 
 #define XRDS_HEADER "X-XRDS-Location"
 #define CT_HEADER "Content-Type"
 
 namespace opkele {
     using std::list;
     using xrd::XRD_t;
     using xrd::service_t;
 
     /* TODO: the whole discovery thing needs cleanup and optimization due to
      * many changes of concept. */
 
-    static const char *whitespace = " \t\r\n";
-    static const char *i_leaders = "=@+$!(";
     static const size_t max_html = 16384;
 
     static const struct service_type_t {
 	const char *uri;
 	const char *forceid;
     } op_service_types[] = {
 	{ STURI_OPENID20_OP, IDURI_SELECT20 },
 	{ STURI_OPENID20, 0 },
 	{ STURI_OPENID11, 0 },
 	{ STURI_OPENID10, 0 }
     };
     enum {
 	st_index_1 = 2, st_index_2 = 1
     };
 
 
     static inline bool is_qelement(const XML_Char *n,const char *qen) {
 	return !strcasecmp(n,qen);
     }
     static inline bool is_element(const XML_Char *n,const char *en) {
 	if(!strcasecmp(n,en)) return true;
 	int nl = strlen(n), enl = strlen(en);
 	if( (nl>=(enl+1)) && n[nl-enl-1]=='\t'
 		&& !strcasecmp(&n[nl-enl],en) )
@@ -107,60 +106,60 @@ namespace opkele {
 			|| (r=set_header())
 			;
 		    if(r)
 			throw exception_curl(OPKELE_CP_ "failed to set curly options",r);
 		}
 	    ~idigger_t() throw() { }
 
 	    void yadiscover(endpoint_discovery_iterator oi,const string& yurl,const char **types,bool redirs) {
 		idiscovery_t idis;
 		idis.xri_identity = false;
 		discover_at(idis,yurl,xmode_html|xmode_xrd|(redirs?0:xmode_noredirs));
 		if(!xrds_location.empty()) {
 		    idis.clear();
 		    discover_at(idis,xrds_location,xmode_xrd);
 		}
 		idis.normalized_id = idis.canonicalized_id = yurl;
 		service_type_t st;
 		for(st.uri=*types;*types;st.uri=*(++types))
 		    queue_endpoints(oi,idis,&st);
 	    }
 
 	    string discover(endpoint_discovery_iterator& oi,const string& identity) {
 		string rv;
 		idiscovery_t idis;
-		string::size_type fsc = identity.find_first_not_of(whitespace);
+		string::size_type fsc = identity.find_first_not_of(data::_whitespace_chars);
 		if(fsc==string::npos)
 		    throw bad_input(OPKELE_CP_ "whitespace-only identity");
-		string::size_type lsc = identity.find_last_not_of(whitespace);
+		string::size_type lsc = identity.find_last_not_of(data::_whitespace_chars);
 		assert(lsc!=string::npos);
 		if(!strncasecmp(identity.c_str()+fsc,"xri://",sizeof("xri://")-1))
 		    fsc += sizeof("xri://")-1;
 		if((fsc+1)>=lsc)
 		    throw bad_input(OPKELE_CP_ "not a character of importance in identity");
 		string id(identity,fsc,lsc-fsc+1);
 		idis.clear();
-		if(strchr(i_leaders,id[0])) {
+		if(strchr(data::_iname_leaders,id[0])) {
 		    /* TODO: further normalize xri identity? Like folding case
 		     * or whatever... */
 		    rv = id;
 		    set<string> cids;
 		    for(const struct service_type_t *st=op_service_types;
 			    st<&op_service_types[sizeof(op_service_types)/sizeof(*op_service_types)];++st) {
 			idis.clear();
 			discover_at( idis,
 				xri_proxy + util::url_encode(id)+
 				"?_xrd_t="+util::url_encode(st->uri)+
 				"&_xrd_r=application/xrd%2Bxml"
 				";sep=true;refs=true",
 				xmode_xrd );
 			if(status_code==241) continue;
 			if(status_code!=100)
 			    throw failed_xri_resolution(OPKELE_CP_
 				    "XRI resolution failed with '"+status_string+"' message"
 				    ", while looking for SEP with type '"+st->uri+"'", status_code);
 			if(idis.xrd.canonical_ids.empty())
 			    throw opkele::failed_discovery(OPKELE_CP_ "No CanonicalID for XRI identity found");
 			string cid = idis.xrd.canonical_ids.begin()->second;
 			if(cids.find(cid)==cids.end()) {
 			    cids.insert(cid);
 			    idis.clear();
@@ -474,55 +473,55 @@ namespace opkele {
 
 	    void html_start_element(const XML_Char *n,const XML_Char **a) {
 		if(is_element(n,"meta")) {
 		    bool heq = false;
 		    string l;
 		    for(;*a;a+=2) {
 			if(!( strcasecmp(a[0],"http-equiv")
 				|| strcasecmp(a[1],XRDS_HEADER) ))
 			    heq = true;
 			else if(!strcasecmp(a[0],"content"))
 			    l.assign(a[1]);
 		    }
 		    if(heq)
 			xrds_location = l;
 		}else if(is_element(n,"link")) {
 		    string rels;
 		    string href;
 		    for(;*a;a+=2) {
 			if( !strcasecmp(a[0],"rel") ) {
 			    rels.assign(a[1]);
 			}else if( !strcasecmp(a[0],"href") ) {
 			    const char *ns = a[1];
 			    for(;*ns && isspace(*ns);++ns);
 			    href.assign(ns);
-			    string::size_type lns=href.find_last_not_of(whitespace);
+			    string::size_type lns=href.find_last_not_of(data::_whitespace_chars);
 			    href.erase(lns+1);
 			}
 		    }
-		    for(string::size_type ns=rels.find_first_not_of(whitespace);
-			    ns!=string::npos; ns=rels.find_first_not_of(whitespace,ns)) {
-			string::size_type s = rels.find_first_of(whitespace,ns);
+		    for(string::size_type ns=rels.find_first_not_of(data::_whitespace_chars);
+			    ns!=string::npos; ns=rels.find_first_not_of(data::_whitespace_chars,ns)) {
+			string::size_type s = rels.find_first_of(data::_whitespace_chars,ns);
 			string rel;
 			if(s==string::npos) {
 			    rel.assign(rels,ns,string::npos);
 			    ns = string::npos;
 			}else{
 			    rel.assign(rels,ns,s-ns);
 			    ns = s;
 			}
 			if(rel=="openid.server")
 			    html_openid1.uris.add(-1,xrd::uri_t(href));
 			else if(rel=="openid.delegate")
 			    html_openid1.local_ids.add(-1,href);
 			else if(rel=="openid2.provider")
 			    html_openid2.uris.add(-1,xrd::uri_t(href));
 			else if(rel=="openid2.local_id")
 			    html_openid2.local_ids.add(-1,href);
 		    }
 		}else if(is_element(n,"body")) {
 		    skipping = -1;
 		}
 	    }
 
 	    void queue_endpoints(endpoint_discovery_iterator& oi,
 		    const idiscovery_t &id,
diff --git a/lib/util.cc b/lib/util.cc
index bb8a2e8..29e6738 100644
--- a/lib/util.cc
+++ b/lib/util.cc
@@ -1,39 +1,40 @@
 #include <errno.h>
 #include <cassert>
 #include <cctype>
 #include <cstring>
 #include <vector>
 #include <string>
 #include <stack>
 #include <algorithm>
 #include <openssl/bio.h>
 #include <openssl/evp.h>
 #include <openssl/sha.h>
 #include <openssl/hmac.h>
 #include <curl/curl.h>
 #include <opkele/util.h>
 #include <opkele/exception.h>
+#include <opkele/data.h>
 #include <opkele/debug.h>
 
 #include <config.h>
 #ifdef HAVE_DEMANGLE
 # include <cxxabi.h>
 #endif
 
 namespace opkele {
     using namespace std;
 
     namespace util {
 
 	/*
 	 * base64
 	 */
 	string encode_base64(const void *data,size_t length) {
 	    BIO *b64 = 0, *bmem = 0;
 	    try {
 		b64 = BIO_new(BIO_f_base64());
 		if(!b64)
 		    throw exception_openssl(OPKELE_CP_ "failed to BIO_new() base64 encoder");
 		BIO_set_flags(b64,BIO_FLAGS_BASE64_NO_NL);
 		bmem = BIO_new(BIO_s_mem());
 		BIO_set_flags(b64,BIO_CLOSE);
@@ -190,61 +191,60 @@ namespace opkele {
 		throw failed_conversion(OPKELE_CP_ "failed to snprintf()");
 	    return rv;
 	}
 
 	long string_to_long(const string& s) {
 	    char *endptr = 0;
 	    long rv = strtol(s.c_str(),&endptr,10);
 	    if((!endptr) || endptr==s.c_str())
 		throw failed_conversion(OPKELE_CP_ "failed to strtol()");
 	    return rv;
 	}
 
 	/*
 	 * Normalize URL according to the rules, described in rfc 3986, section 6
 	 *
 	 * - uppercase hex triplets (e.g. %ab -> %AB)
 	 * - lowercase scheme and host
 	 * - decode %-encoded characters, specified as unreserved in rfc 3986, section 2.3,
 	 *   that is - [:alpha:][:digit:]._~-
 	 * - remove dot segments
 	 * - remove empty and default ports
 	 * - if there's no path component, add '/'
 	 */
 	 string rfc_3986_normalize_uri(const string& uri) {
-	     static const char *whitespace = " \t\r\n";
 	     string rv;
-	     string::size_type ns = uri.find_first_not_of(whitespace);
+	     string::size_type ns = uri.find_first_not_of(data::_whitespace_chars);
 	     if(ns==string::npos)
 		 throw bad_input(OPKELE_CP_ "Can't normalize empty URI");
 	     string::size_type colon = uri.find(':',ns);
 	     if(colon==string::npos)
 		 throw bad_input(OPKELE_CP_ "No scheme specified in URI");
 	     transform(
 		     uri.begin()+ns, uri.begin()+colon+1,
 		     back_inserter(rv), ::tolower );
 	     bool s;
-	     string::size_type ul = uri.find_last_not_of(whitespace)+1;
+	     string::size_type ul = uri.find_last_not_of(data::_whitespace_chars)+1;
 	     if(ul <= (colon+3))
 		 throw bad_input(OPKELE_CP_ "Unexpected end of URI being normalized encountered");
 	     if(uri[colon+1]!='/' || uri[colon+2]!='/')
 		 throw bad_input(OPKELE_CP_ "Unexpected input in URI being normalized after scheme component");
 	     if(rv=="http:")
 		 s = false;
 	     else if(rv=="https:")
 		 s = true;
 	     else{
 		 /* TODO: support more schemes.  e.g. xri. How do we normalize
 		  * xri?
 		  */
 		 rv.append(uri,colon+1,ul-colon-1);
 		 return rv;
 	     }
 	     rv += "//";
 	     string::size_type interesting = uri.find_first_of(":/#?",colon+3);
 	     if(interesting==string::npos) {
 		 transform(
 			 uri.begin()+colon+3,uri.begin()+ul,
 			 back_inserter(rv), ::tolower );
 		 rv += '/'; return rv;
 	     }
 	     transform(
@@ -409,27 +409,62 @@ namespace opkele {
 		kv += ':';
 		kv += om.get_field(f);
 		kv += '\n';
 		if(co==string::npos) break;
 		p = co+1;
 	    }
 	    const secret_t& secret = assoc->secret();
 	    const EVP_MD *evpmd;
 	    const string& at = assoc->assoc_type();
 	    if(at=="HMAC-SHA256")
 		evpmd = EVP_sha256();
 	    else if(at=="HMAC-SHA1")
 		evpmd = EVP_sha1();
 	    else
 		throw unsupported(OPKELE_CP_ "unknown association type");
 	    unsigned int md_len = 0;
 	    unsigned char md[SHA256_DIGEST_LENGTH];
 	    HMAC(evpmd,
 		    &(secret.front()),secret.size(),
 		    (const unsigned char*)kv.data(),kv.length(),
 		    md,&md_len);
 	    return encode_base64(md,md_len);
 	}
 
+	/* Normalize a user-supplied identifier: trim whitespace, drop an
+	 * "xri://" prefix; non-XRI input gets "http://" if it has no scheme,
+	 * optionally loses its fragment, then goes through
+	 * rfc_3986_normalize_uri(). Returns "" for blank or <3-char input. */
+	string normalize_identifier(const string& usi,bool strip_fragment) {
+	    string rv;
+	    string::size_type fsc = usi.find_first_not_of(data::_whitespace_chars);
+	    if(fsc==string::npos) /* empty or whitespace-only input */
+		return rv;
+	    string::size_type lsc = usi.find_last_not_of(data::_whitespace_chars);
+	    assert(lsc!=string::npos);
+	    if(!strncasecmp(usi.c_str()+fsc,"xri://",sizeof("xri://")-1))
+		fsc += sizeof("xri://")-1;
+	    if( (fsc+1) >= lsc )
+		return rv;
+	    rv.assign(usi,fsc,lsc-fsc+1);
+	    if(strchr(data::_iname_leaders,rv[0])) {
+		/* TODO: further normalize xri identity, fold case or
+		 * whatever... */
+	    }else{
+		if(rv.find("://")==string::npos)
+		    rv.insert(0,"http://");
+		if(strip_fragment) {
+		    /* rfc 3986, section 3.5: the fragment runs from the first
+		     * '#' to the end of the URI, so a '?' found after '#' is
+		     * part of the fragment and is removed along with it */
+		    string::size_type fp = rv.find('#');
+		    if(fp!=string::npos)
+			rv.erase(fp);
+		}
+		rv = rfc_3986_normalize_uri(rv); /* may throw bad_input */
+	    }
+	    return rv;
+	}
+
     }
 
 }