summaryrefslogtreecommitdiffabout
path: root/lib/discovery.cc
Side-by-side diff
Diffstat (limited to 'lib/discovery.cc') (more/less context) (ignore whitespace changes)
-rw-r--r--lib/discovery.cc75
1 files changed, 64 insertions, 11 deletions
diff --git a/lib/discovery.cc b/lib/discovery.cc
index 8729cfb..a308b56 100644
--- a/lib/discovery.cc
+++ b/lib/discovery.cc
@@ -1,95 +1,99 @@
#include <list>
#include <opkele/curl.h>
#include <opkele/expat.h>
#include <opkele/uris.h>
#include <opkele/discovery.h>
#include <opkele/exception.h>
#include <opkele/util.h>
+#include <opkele/tidy.h>
#include <opkele/debug.h>
#include "config.h"
#define XRDS_HEADER "X-XRDS-Location"
#define CT_HEADER "Content-Type"
namespace opkele {
using std::list;
using xrd::XRD_t;
using xrd::service_t;
static const char *whitespace = " \t\r\n";
static const char *i_leaders = "=@+$!(";
+ static const size_t max_html = 16384;
static inline bool is_qelement(const XML_Char *n,const char *qen) {
return !strcasecmp(n,qen);
}
static inline bool is_element(const XML_Char *n,const char *en) {
if(!strcasecmp(n,en)) return true;
int nl = strlen(n), enl = strlen(en);
if( (nl>=(enl+1)) && n[nl-enl-1]=='\t'
&& !strcasecmp(&n[nl-enl],en) )
return true;
return false;
}
static long element_priority(const XML_Char **a) {
for(;*a;++a)
if(!strcasecmp(*(a++),"priority")) {
long rv;
return (sscanf(*a,"%ld",&rv)==1)?rv:-1;
}
return -1;
}
class idigger_t : public util::curl_t, public util::expat_t {
public:
string xri_proxy;
enum {
xmode_html = 1, xmode_xrd = 2
};
int xmode;
string xrds_location;
string http_content_type;
service_t html_openid1;
service_t html_openid2;
string cdata_buf;
long status_code;
string status_string;
typedef list<string> pt_stack_t;
pt_stack_t pt_stack;
int skipping;
+ bool parser_choked;
+ string save_html;
XRD_t *xrd;
service_t *xrd_service;
string* cdata;
idigger_t()
: util::curl_t(easy_init()),
util::expat_t(0),
xri_proxy(XRI_PROXY_URL) {
CURLcode r;
(r=misc_sets())
|| (r=set_write())
|| (r=set_header())
;
if(r)
throw exception_curl(OPKELE_CP_ "failed to set curly options",r);
}
~idigger_t() throw() { }
void discover(idiscovery_t& result,const string& identity) {
result.clear();
string::size_type fsc = identity.find_first_not_of(whitespace);
if(fsc==string::npos)
throw bad_input(OPKELE_CP_ "whtiespace-only identity");
string::size_type lsc = identity.find_last_not_of(whitespace);
assert(lsc!=string::npos);
if(!strncasecmp(identity.c_str()+fsc,"xri://",sizeof("xri://")-1))
fsc += sizeof("xri://")-1;
if((fsc+1)>=lsc)
throw bad_input(OPKELE_CP_ "not a character of importance in identity");
string id(identity,fsc,lsc-fsc+1);
if(strchr(i_leaders,id[0])) {
@@ -111,104 +115,153 @@ namespace opkele {
if(id.find("://")==string::npos)
id.insert(0,"http://");
string::size_type fp = id.find('#');
if(fp!=string::npos) {
string::size_type qp = id.find('?');
if(qp==string::npos || qp<fp)
id.erase(fp);
else if(qp>fp)
id.erase(fp,qp-fp);
}
result.normalized_id = util::rfc_3986_normalize_uri(id);
discover_at(result,id,xmode_html|xmode_xrd);
const char * eu = 0;
CURLcode r = easy_getinfo(CURLINFO_EFFECTIVE_URL,&eu);
if(r)
throw exception_curl(OPKELE_CP_ "failed to get CURLINFO_EFFECTIVE_URL",r);
result.canonicalized_id = util::rfc_3986_normalize_uri(eu); /* XXX: strip fragment part? */
if(xrds_location.empty()) {
html2xrd(result.xrd);
}else{
discover_at(result,xrds_location,xmode_xrd);
if(result.xrd.empty())
html2xrd(result.xrd);
}
}
}
void discover_at(idiscovery_t& result,const string& url,int xm) {
CURLcode r = easy_setopt(CURLOPT_URL,url.c_str());
if(r)
throw exception_curl(OPKELE_CP_ "failed to set culry urlie",r);
- (*(expat_t*)this) = parser_create_ns();
- set_user_data(); set_element_handler();
- set_character_data_handler();
-
http_content_type.clear();
xmode = xm;
+ prepare_to_parse();
if(xmode&xmode_html) {
xrds_location.clear();
- html_openid1.clear(); html_openid2.clear();
+ save_html.clear();
+ save_html.reserve(max_html);
}
xrd = &result.xrd;
- cdata = 0; xrd_service = 0; skipping = 0;
- status_code = 100; status_string.clear();
r = easy_perform();
if(r && r!=CURLE_WRITE_ERROR)
throw exception_curl(OPKELE_CP_ "failed to perform curly request",r);
- parse(0,0,true);
+ if(!parser_choked) {
+ parse(0,0,true);
+ }else{
+ /* TODO: do not bother if we've seen xml */
+ try {
+ util::tidy_doc_t td = util::tidy_doc_t::create();
+ if(!td)
+ throw exception_tidy(OPKELE_CP_ "failed to create htmltidy document");
+#ifndef NDEBUG
+ td.opt_set(TidyQuiet,false);
+ td.opt_set(TidyShowWarnings,false);
+ td.opt_set(TidyForceOutput,true);
+ td.opt_set(TidyXhtmlOut,true);
+ td.opt_set(TidyDoctypeMode,TidyDoctypeOmit);
+ td.opt_set(TidyMark,false);
+#endif /* NDEBUG */
+ if(td.parse_string(save_html)<=0)
+ throw exception_tidy(OPKELE_CP_ "tidy failed to parse document");
+ if(td.clean_and_repair()<=0)
+ throw exception_tidy(OPKELE_CP_ "tidy failed to clean and repair");
+ util::tidy_buf_t tide;
+ if(td.save_buffer(tide)<=0)
+ throw exception_tidy(OPKELE_CP_ "tidy failed to save buffer");
+ prepare_to_parse();
+ parse(tide.c_str(),tide.size(),true);
+ }catch(exception_tidy& et) { }
+ }
+ save_html.clear();
+ }
+
+ void prepare_to_parse() {
+ (*(expat_t*)this) = parser_create_ns();
+ set_user_data(); set_element_handler();
+ set_character_data_handler();
+
+ if(xmode&xmode_html) {
+ html_openid1.clear(); html_openid2.clear();
+ parser_choked = false;
+ }
+
+ cdata = 0; xrd_service = 0; skipping = 0;
+ status_code = 100; status_string.clear();
}
void html2xrd(XRD_t& x) {
if(!html_openid1.uris.empty()) {
html_openid1.types.insert(STURI_OPENID11);
x.services.add(-1,html_openid1);
}
if(!html_openid2.uris.empty()) {
html_openid2.types.insert(STURI_OPENID20);
x.services.add(-1,html_openid2);
}
}
size_t write(void *p,size_t s,size_t nm) {
- if(skipping<0) return 0;
/* TODO: limit total size */
size_t bytes = s*nm;
- bool rp = parse((const char *)p,bytes,false);
+ const char *inbuf = (const char*)p;
+ if(xmode&xmode_html) {
+ size_t mbts = save_html.capacity()-save_html.size();
+ size_t bts = 0;
+ if(mbts>0) {
+ bts = (bytes>mbts)?mbts:bytes;
+ save_html.append(inbuf,bts);
+ }
+ if(skipping<0) return bts;
+ }
+ if(skipping<0) return 0;
+ bool rp = parse(inbuf,bytes,false);
if(!rp) {
+ parser_choked = true;
skipping = -1;
- bytes = 0;
+ if(!(xmode&xmode_html))
+ bytes = 0;
}
return bytes;
}
size_t header(void *p,size_t s,size_t nm) {
size_t bytes = s*nm;
const char *h = (const char*)p;
const char *colon = (const char*)memchr(p,':',bytes);
const char *space = (const char*)memchr(p,' ',bytes);
if(space && ( (!colon) || space<colon ) ) {
xrds_location.clear(); http_content_type.clear();
}else if(colon) {
const char *hv = ++colon;
int hnl = colon-h;
int rb;
for(rb = bytes-hnl-1;rb>0 && isspace(*hv);++hv,--rb);
while(rb>0 && isspace(hv[rb-1])) --rb;
if(rb) {
if( (hnl>=sizeof(XRDS_HEADER))
&& !strncasecmp(h,XRDS_HEADER":",
sizeof(XRDS_HEADER)) ) {
xrds_location.assign(hv,rb);
}else if( (hnl>=sizeof(CT_HEADER))
&& !strncasecmp(h,CT_HEADER":",
sizeof(CT_HEADER)) ) {
const char *sc = (const char*)memchr(
hv,';',rb);
http_content_type.assign(hv,sc?(sc-hv):rb);
}
}
}
return curl_t::header(p,s,nm);
}