-rw-r--r-- | lib/discovery.cc | 75 |
1 files changed, 64 insertions, 11 deletions
diff --git a/lib/discovery.cc b/lib/discovery.cc index 8729cfb..a308b56 100644 --- a/lib/discovery.cc +++ b/lib/discovery.cc @@ -1,95 +1,99 @@ #include <list> #include <opkele/curl.h> #include <opkele/expat.h> #include <opkele/uris.h> #include <opkele/discovery.h> #include <opkele/exception.h> #include <opkele/util.h> +#include <opkele/tidy.h> #include <opkele/debug.h> #include "config.h" #define XRDS_HEADER "X-XRDS-Location" #define CT_HEADER "Content-Type" namespace opkele { using std::list; using xrd::XRD_t; using xrd::service_t; static const char *whitespace = " \t\r\n"; static const char *i_leaders = "=@+$!("; + static const size_t max_html = 16384; static inline bool is_qelement(const XML_Char *n,const char *qen) { return !strcasecmp(n,qen); } static inline bool is_element(const XML_Char *n,const char *en) { if(!strcasecmp(n,en)) return true; int nl = strlen(n), enl = strlen(en); if( (nl>=(enl+1)) && n[nl-enl-1]=='\t' && !strcasecmp(&n[nl-enl],en) ) return true; return false; } static long element_priority(const XML_Char **a) { for(;*a;++a) if(!strcasecmp(*(a++),"priority")) { long rv; return (sscanf(*a,"%ld",&rv)==1)?rv:-1; } return -1; } class idigger_t : public util::curl_t, public util::expat_t { public: string xri_proxy; enum { xmode_html = 1, xmode_xrd = 2 }; int xmode; string xrds_location; string http_content_type; service_t html_openid1; service_t html_openid2; string cdata_buf; long status_code; string status_string; typedef list<string> pt_stack_t; pt_stack_t pt_stack; int skipping; + bool parser_choked; + string save_html; XRD_t *xrd; service_t *xrd_service; string* cdata; idigger_t() : util::curl_t(easy_init()), util::expat_t(0), xri_proxy(XRI_PROXY_URL) { CURLcode r; (r=misc_sets()) || (r=set_write()) || (r=set_header()) ; if(r) throw exception_curl(OPKELE_CP_ "failed to set curly options",r); } ~idigger_t() throw() { } void discover(idiscovery_t& result,const string& identity) { result.clear(); string::size_type fsc = identity.find_first_not_of(whitespace); if(fsc==string::npos) throw bad_input(OPKELE_CP_ "whtiespace-only identity"); string::size_type lsc = identity.find_last_not_of(whitespace); assert(lsc!=string::npos); if(!strncasecmp(identity.c_str()+fsc,"xri://",sizeof("xri://")-1)) fsc += sizeof("xri://")-1; if((fsc+1)>=lsc) throw bad_input(OPKELE_CP_ "not a character of importance in identity"); string id(identity,fsc,lsc-fsc+1); if(strchr(i_leaders,id[0])) { @@ -111,104 +115,153 @@ namespace opkele { if(id.find("://")==string::npos) id.insert(0,"http://"); string::size_type fp = id.find('#'); if(fp!=string::npos) { string::size_type qp = id.find('?'); if(qp==string::npos || qp<fp) id.erase(fp); else if(qp>fp) id.erase(fp,qp-fp); } result.normalized_id = util::rfc_3986_normalize_uri(id); discover_at(result,id,xmode_html|xmode_xrd); const char * eu = 0; CURLcode r = easy_getinfo(CURLINFO_EFFECTIVE_URL,&eu); if(r) throw exception_curl(OPKELE_CP_ "failed to get CURLINFO_EFFECTIVE_URL",r); result.canonicalized_id = util::rfc_3986_normalize_uri(eu); /* XXX: strip fragment part? */ if(xrds_location.empty()) { html2xrd(result.xrd); }else{ discover_at(result,xrds_location,xmode_xrd); if(result.xrd.empty()) html2xrd(result.xrd); } } } void discover_at(idiscovery_t& result,const string& url,int xm) { CURLcode r = easy_setopt(CURLOPT_URL,url.c_str()); if(r) throw exception_curl(OPKELE_CP_ "failed to set culry urlie",r); - (*(expat_t*)this) = parser_create_ns(); - set_user_data(); set_element_handler(); - set_character_data_handler(); - http_content_type.clear(); xmode = xm; + prepare_to_parse(); if(xmode&xmode_html) { xrds_location.clear(); - html_openid1.clear(); html_openid2.clear(); + save_html.clear(); + save_html.reserve(max_html); } xrd = &result.xrd; - cdata = 0; xrd_service = 0; skipping = 0; - status_code = 100; status_string.clear(); r = easy_perform(); if(r && r!=CURLE_WRITE_ERROR) throw exception_curl(OPKELE_CP_ "failed to perform curly request",r); - parse(0,0,true); + if(!parser_choked) { + parse(0,0,true); + }else{ + /* TODO: do not bother if we've seen xml */ + try { + util::tidy_doc_t td = util::tidy_doc_t::create(); + if(!td) + throw exception_tidy(OPKELE_CP_ "failed to create htmltidy document"); +#ifndef NDEBUG + td.opt_set(TidyQuiet,false); + td.opt_set(TidyShowWarnings,false); + td.opt_set(TidyForceOutput,true); + td.opt_set(TidyXhtmlOut,true); + td.opt_set(TidyDoctypeMode,TidyDoctypeOmit); + td.opt_set(TidyMark,false); +#endif /* NDEBUG */ + if(td.parse_string(save_html)<=0) + throw exception_tidy(OPKELE_CP_ "tidy failed to parse document"); + if(td.clean_and_repair()<=0) + throw exception_tidy(OPKELE_CP_ "tidy failed to clean and repair"); + util::tidy_buf_t tide; + if(td.save_buffer(tide)<=0) + throw exception_tidy(OPKELE_CP_ "tidy failed to save buffer"); + prepare_to_parse(); + parse(tide.c_str(),tide.size(),true); + }catch(exception_tidy& et) { } + } + save_html.clear(); + } + + void prepare_to_parse() { + (*(expat_t*)this) = parser_create_ns(); + set_user_data(); set_element_handler(); + set_character_data_handler(); + + if(xmode&xmode_html) { + html_openid1.clear(); html_openid2.clear(); + parser_choked = false; + } + + cdata = 0; xrd_service = 0; skipping = 0; + status_code = 100; status_string.clear(); } void html2xrd(XRD_t& x) { if(!html_openid1.uris.empty()) { html_openid1.types.insert(STURI_OPENID11); x.services.add(-1,html_openid1); } if(!html_openid2.uris.empty()) { html_openid2.types.insert(STURI_OPENID20); x.services.add(-1,html_openid2); } } size_t write(void *p,size_t s,size_t nm) { - if(skipping<0) return 0; /* TODO: limit total size */ size_t bytes = s*nm; - bool rp = parse((const char *)p,bytes,false); + const char *inbuf = (const char*)p; + if(xmode&xmode_html) { + size_t mbts = save_html.capacity()-save_html.size(); + size_t bts = 0; + if(mbts>0) { + bts = (bytes>mbts)?mbts:bytes; + save_html.append(inbuf,bts); + } + if(skipping<0) return bts; + } + if(skipping<0) return 0; + bool rp = parse(inbuf,bytes,false); if(!rp) { + parser_choked = true; skipping = -1; - bytes = 0; + if(!(xmode&xmode_html)) + bytes = 0; } return bytes; } size_t header(void *p,size_t s,size_t nm) { size_t bytes = s*nm; const char *h = (const char*)p; const char *colon = (const char*)memchr(p,':',bytes); const char *space = (const char*)memchr(p,' ',bytes); if(space && ( (!colon) || space<colon ) ) { xrds_location.clear(); http_content_type.clear(); }else if(colon) { const char *hv = ++colon; int hnl = colon-h; int rb; for(rb = bytes-hnl-1;rb>0 && isspace(*hv);++hv,--rb); while(rb>0 && isspace(hv[rb-1])) --rb; if(rb) { if( (hnl>=sizeof(XRDS_HEADER)) && !strncasecmp(h,XRDS_HEADER":", sizeof(XRDS_HEADER)) ) { xrds_location.assign(hv,rb); }else if( (hnl>=sizeof(CT_HEADER)) && !strncasecmp(h,CT_HEADER":", sizeof(CT_HEADER)) ) { const char *sc = (const char*)memchr( hv,';',rb); http_content_type.assign(hv,sc?(sc-hv):rb); } } } return curl_t::header(p,s,nm); } |