summaryrefslogtreecommitdiffabout
path: root/lib/discovery.cc
author Michael Krelin <hacker@klever.net>2008-01-05 21:47:04 (UTC)
committer Michael Krelin <hacker@klever.net>2008-01-05 22:03:59 (UTC)
commit 76f52a8fd79dd12680752c017d67d4be01f0afbc (patch) (unidiff)
tree 42d640112a0381707d36ac3d72e48937978d911e /lib/discovery.cc
parent a0719fb611507d8b9962b87c600855d8837fc266 (diff)
download libopkele-76f52a8fd79dd12680752c017d67d4be01f0afbc.zip
libopkele-76f52a8fd79dd12680752c017d67d4be01f0afbc.tar.gz
libopkele-76f52a8fd79dd12680752c017d67d4be01f0afbc.tar.bz2
Made HTML discovery more robust by using htmltidy
Now, when parsing a document that we expect might be HTML, we also save the first 16K of the document to a buffer; if the parser chokes, we run the saved data through htmltidy and feed the output to the parser again. Signed-off-by: Michael Krelin <hacker@klever.net>
Diffstat (limited to 'lib/discovery.cc') (more/less context) (ignore whitespace changes)
-rw-r--r-- lib/discovery.cc 75
1 files changed, 64 insertions, 11 deletions
diff --git a/lib/discovery.cc b/lib/discovery.cc
index 8729cfb..a308b56 100644
--- a/lib/discovery.cc
+++ b/lib/discovery.cc
@@ -1,230 +1,283 @@
1#include <list> 1#include <list>
2#include <opkele/curl.h> 2#include <opkele/curl.h>
3#include <opkele/expat.h> 3#include <opkele/expat.h>
4#include <opkele/uris.h> 4#include <opkele/uris.h>
5#include <opkele/discovery.h> 5#include <opkele/discovery.h>
6#include <opkele/exception.h> 6#include <opkele/exception.h>
7#include <opkele/util.h> 7#include <opkele/util.h>
8#include <opkele/tidy.h>
8#include <opkele/debug.h> 9#include <opkele/debug.h>
9 10
10#include "config.h" 11#include "config.h"
11 12
12#define XRDS_HEADER "X-XRDS-Location" 13#define XRDS_HEADER "X-XRDS-Location"
13#define CT_HEADER "Content-Type" 14#define CT_HEADER "Content-Type"
14 15
15namespace opkele { 16namespace opkele {
16 using std::list; 17 using std::list;
17 using xrd::XRD_t; 18 using xrd::XRD_t;
18 using xrd::service_t; 19 using xrd::service_t;
19 20
20 static const char *whitespace = " \t\r\n"; 21 static const char *whitespace = " \t\r\n";
21 static const char *i_leaders = "=@+$!("; 22 static const char *i_leaders = "=@+$!(";
23 static const size_t max_html = 16384;
22 24
23 static inline bool is_qelement(const XML_Char *n,const char *qen) { 25 static inline bool is_qelement(const XML_Char *n,const char *qen) {
24 return !strcasecmp(n,qen); 26 return !strcasecmp(n,qen);
25 } 27 }
26 static inline bool is_element(const XML_Char *n,const char *en) { 28 static inline bool is_element(const XML_Char *n,const char *en) {
27 if(!strcasecmp(n,en)) return true; 29 if(!strcasecmp(n,en)) return true;
28 int nl = strlen(n), enl = strlen(en); 30 int nl = strlen(n), enl = strlen(en);
29 if( (nl>=(enl+1)) && n[nl-enl-1]=='\t' 31 if( (nl>=(enl+1)) && n[nl-enl-1]=='\t'
30 && !strcasecmp(&n[nl-enl],en) ) 32 && !strcasecmp(&n[nl-enl],en) )
31 return true; 33 return true;
32 return false; 34 return false;
33 } 35 }
34 36
35 static long element_priority(const XML_Char **a) { 37 static long element_priority(const XML_Char **a) {
36 for(;*a;++a) 38 for(;*a;++a)
37 if(!strcasecmp(*(a++),"priority")) { 39 if(!strcasecmp(*(a++),"priority")) {
38 long rv; 40 long rv;
39 return (sscanf(*a,"%ld",&rv)==1)?rv:-1; 41 return (sscanf(*a,"%ld",&rv)==1)?rv:-1;
40 } 42 }
41 return -1; 43 return -1;
42 } 44 }
43 45
44 class idigger_t : public util::curl_t, public util::expat_t { 46 class idigger_t : public util::curl_t, public util::expat_t {
45 public: 47 public:
46 string xri_proxy; 48 string xri_proxy;
47 49
48 enum { 50 enum {
49 xmode_html = 1, xmode_xrd = 2 51 xmode_html = 1, xmode_xrd = 2
50 }; 52 };
51 int xmode; 53 int xmode;
52 54
53 string xrds_location; 55 string xrds_location;
54 string http_content_type; 56 string http_content_type;
55 service_t html_openid1; 57 service_t html_openid1;
56 service_t html_openid2; 58 service_t html_openid2;
57 string cdata_buf; 59 string cdata_buf;
58 long status_code; 60 long status_code;
59 string status_string; 61 string status_string;
60 62
61 typedef list<string> pt_stack_t; 63 typedef list<string> pt_stack_t;
62 pt_stack_t pt_stack; 64 pt_stack_t pt_stack;
63 int skipping; 65 int skipping;
66 bool parser_choked;
67 string save_html;
64 68
65 XRD_t *xrd; 69 XRD_t *xrd;
66 service_t *xrd_service; 70 service_t *xrd_service;
67 string* cdata; 71 string* cdata;
68 72
69 idigger_t() 73 idigger_t()
70 : util::curl_t(easy_init()), 74 : util::curl_t(easy_init()),
71 util::expat_t(0), 75 util::expat_t(0),
72 xri_proxy(XRI_PROXY_URL) { 76 xri_proxy(XRI_PROXY_URL) {
73 CURLcode r; 77 CURLcode r;
74 (r=misc_sets()) 78 (r=misc_sets())
75 || (r=set_write()) 79 || (r=set_write())
76 || (r=set_header()) 80 || (r=set_header())
77 ; 81 ;
78 if(r) 82 if(r)
79 throw exception_curl(OPKELE_CP_ "failed to set curly options",r); 83 throw exception_curl(OPKELE_CP_ "failed to set curly options",r);
80 } 84 }
81 ~idigger_t() throw() { } 85 ~idigger_t() throw() { }
82 86
83 void discover(idiscovery_t& result,const string& identity) { 87 void discover(idiscovery_t& result,const string& identity) {
84 result.clear(); 88 result.clear();
85 string::size_type fsc = identity.find_first_not_of(whitespace); 89 string::size_type fsc = identity.find_first_not_of(whitespace);
86 if(fsc==string::npos) 90 if(fsc==string::npos)
87 throw bad_input(OPKELE_CP_ "whtiespace-only identity"); 91 throw bad_input(OPKELE_CP_ "whtiespace-only identity");
88 string::size_type lsc = identity.find_last_not_of(whitespace); 92 string::size_type lsc = identity.find_last_not_of(whitespace);
89 assert(lsc!=string::npos); 93 assert(lsc!=string::npos);
90 if(!strncasecmp(identity.c_str()+fsc,"xri://",sizeof("xri://")-1)) 94 if(!strncasecmp(identity.c_str()+fsc,"xri://",sizeof("xri://")-1))
91 fsc += sizeof("xri://")-1; 95 fsc += sizeof("xri://")-1;
92 if((fsc+1)>=lsc) 96 if((fsc+1)>=lsc)
93 throw bad_input(OPKELE_CP_ "not a character of importance in identity"); 97 throw bad_input(OPKELE_CP_ "not a character of importance in identity");
94 string id(identity,fsc,lsc-fsc+1); 98 string id(identity,fsc,lsc-fsc+1);
95 if(strchr(i_leaders,id[0])) { 99 if(strchr(i_leaders,id[0])) {
96 result.normalized_id = id; 100 result.normalized_id = id;
97 result.xri_identity = true; 101 result.xri_identity = true;
98 /* TODO: further canonicalize xri identity? Like folding case or whatever... */ 102 /* TODO: further canonicalize xri identity? Like folding case or whatever... */
99 discover_at( 103 discover_at(
100 result, 104 result,
101 xri_proxy + util::url_encode(id)+ 105 xri_proxy + util::url_encode(id)+
102 "?_xrd_r=application/xrd+xml;sep=false", xmode_xrd); 106 "?_xrd_r=application/xrd+xml;sep=false", xmode_xrd);
103 if(status_code!=100) 107 if(status_code!=100)
104 throw failed_xri_resolution(OPKELE_CP_ 108 throw failed_xri_resolution(OPKELE_CP_
105 "XRI resolution failed with '"+status_string+"' message",status_code); 109 "XRI resolution failed with '"+status_string+"' message",status_code);
106 if(result.xrd.canonical_ids.empty()) 110 if(result.xrd.canonical_ids.empty())
107 throw opkele::failed_discovery(OPKELE_CP_ "No CanonicalID for XRI identity found"); 111 throw opkele::failed_discovery(OPKELE_CP_ "No CanonicalID for XRI identity found");
108 result.canonicalized_id = result.xrd.canonical_ids.begin()->second; 112 result.canonicalized_id = result.xrd.canonical_ids.begin()->second;
109 }else{ 113 }else{
110 result.xri_identity = false; 114 result.xri_identity = false;
111 if(id.find("://")==string::npos) 115 if(id.find("://")==string::npos)
112 id.insert(0,"http://"); 116 id.insert(0,"http://");
113 string::size_type fp = id.find('#'); 117 string::size_type fp = id.find('#');
114 if(fp!=string::npos) { 118 if(fp!=string::npos) {
115 string::size_type qp = id.find('?'); 119 string::size_type qp = id.find('?');
116 if(qp==string::npos || qp<fp) 120 if(qp==string::npos || qp<fp)
117 id.erase(fp); 121 id.erase(fp);
118 else if(qp>fp) 122 else if(qp>fp)
119 id.erase(fp,qp-fp); 123 id.erase(fp,qp-fp);
120 } 124 }
121 result.normalized_id = util::rfc_3986_normalize_uri(id); 125 result.normalized_id = util::rfc_3986_normalize_uri(id);
122 discover_at(result,id,xmode_html|xmode_xrd); 126 discover_at(result,id,xmode_html|xmode_xrd);
123 const char * eu = 0; 127 const char * eu = 0;
124 CURLcode r = easy_getinfo(CURLINFO_EFFECTIVE_URL,&eu); 128 CURLcode r = easy_getinfo(CURLINFO_EFFECTIVE_URL,&eu);
125 if(r) 129 if(r)
126 throw exception_curl(OPKELE_CP_ "failed to get CURLINFO_EFFECTIVE_URL",r); 130 throw exception_curl(OPKELE_CP_ "failed to get CURLINFO_EFFECTIVE_URL",r);
127 result.canonicalized_id = util::rfc_3986_normalize_uri(eu); /* XXX: strip fragment part? */ 131 result.canonicalized_id = util::rfc_3986_normalize_uri(eu); /* XXX: strip fragment part? */
128 if(xrds_location.empty()) { 132 if(xrds_location.empty()) {
129 html2xrd(result.xrd); 133 html2xrd(result.xrd);
130 }else{ 134 }else{
131 discover_at(result,xrds_location,xmode_xrd); 135 discover_at(result,xrds_location,xmode_xrd);
132 if(result.xrd.empty()) 136 if(result.xrd.empty())
133 html2xrd(result.xrd); 137 html2xrd(result.xrd);
134 } 138 }
135 } 139 }
136 } 140 }
137 141
138 void discover_at(idiscovery_t& result,const string& url,int xm) { 142 void discover_at(idiscovery_t& result,const string& url,int xm) {
139 CURLcode r = easy_setopt(CURLOPT_URL,url.c_str()); 143 CURLcode r = easy_setopt(CURLOPT_URL,url.c_str());
140 if(r) 144 if(r)
141 throw exception_curl(OPKELE_CP_ "failed to set culry urlie",r); 145 throw exception_curl(OPKELE_CP_ "failed to set culry urlie",r);
142 146
143 (*(expat_t*)this) = parser_create_ns();
144 set_user_data(); set_element_handler();
145 set_character_data_handler();
146
147 http_content_type.clear(); 147 http_content_type.clear();
148 xmode = xm; 148 xmode = xm;
149 prepare_to_parse();
149 if(xmode&xmode_html) { 150 if(xmode&xmode_html) {
150 xrds_location.clear(); 151 xrds_location.clear();
151 html_openid1.clear(); html_openid2.clear(); 152 save_html.clear();
153 save_html.reserve(max_html);
152 } 154 }
153 xrd = &result.xrd; 155 xrd = &result.xrd;
154 cdata = 0; xrd_service = 0; skipping = 0;
155 status_code = 100; status_string.clear();
156 156
157 r = easy_perform(); 157 r = easy_perform();
158 if(r && r!=CURLE_WRITE_ERROR) 158 if(r && r!=CURLE_WRITE_ERROR)
159 throw exception_curl(OPKELE_CP_ "failed to perform curly request",r); 159 throw exception_curl(OPKELE_CP_ "failed to perform curly request",r);
160 160
161 parse(0,0,true); 161 if(!parser_choked) {
162 parse(0,0,true);
163 }else{
164 /* TODO: do not bother if we've seen xml */
165 try {
166 util::tidy_doc_t td = util::tidy_doc_t::create();
167 if(!td)
168 throw exception_tidy(OPKELE_CP_ "failed to create htmltidy document");
169#ifndef NDEBUG
170 td.opt_set(TidyQuiet,false);
171 td.opt_set(TidyShowWarnings,false);
172 td.opt_set(TidyForceOutput,true);
173 td.opt_set(TidyXhtmlOut,true);
174 td.opt_set(TidyDoctypeMode,TidyDoctypeOmit);
175 td.opt_set(TidyMark,false);
176#endif /* NDEBUG */
177 if(td.parse_string(save_html)<=0)
178 throw exception_tidy(OPKELE_CP_ "tidy failed to parse document");
179 if(td.clean_and_repair()<=0)
180 throw exception_tidy(OPKELE_CP_ "tidy failed to clean and repair");
181 util::tidy_buf_t tide;
182 if(td.save_buffer(tide)<=0)
183 throw exception_tidy(OPKELE_CP_ "tidy failed to save buffer");
184 prepare_to_parse();
185 parse(tide.c_str(),tide.size(),true);
186 }catch(exception_tidy& et) { }
187 }
188 save_html.clear();
189 }
190
191 void prepare_to_parse() {
192 (*(expat_t*)this) = parser_create_ns();
193 set_user_data(); set_element_handler();
194 set_character_data_handler();
195
196 if(xmode&xmode_html) {
197 html_openid1.clear(); html_openid2.clear();
198 parser_choked = false;
199 }
200
201 cdata = 0; xrd_service = 0; skipping = 0;
202 status_code = 100; status_string.clear();
162 } 203 }
163 204
164 void html2xrd(XRD_t& x) { 205 void html2xrd(XRD_t& x) {
165 if(!html_openid1.uris.empty()) { 206 if(!html_openid1.uris.empty()) {
166 html_openid1.types.insert(STURI_OPENID11); 207 html_openid1.types.insert(STURI_OPENID11);
167 x.services.add(-1,html_openid1); 208 x.services.add(-1,html_openid1);
168 } 209 }
169 if(!html_openid2.uris.empty()) { 210 if(!html_openid2.uris.empty()) {
170 html_openid2.types.insert(STURI_OPENID20); 211 html_openid2.types.insert(STURI_OPENID20);
171 x.services.add(-1,html_openid2); 212 x.services.add(-1,html_openid2);
172 } 213 }
173 } 214 }
174 215
175 size_t write(void *p,size_t s,size_t nm) { 216 size_t write(void *p,size_t s,size_t nm) {
176 if(skipping<0) return 0;
177 /* TODO: limit total size */ 217 /* TODO: limit total size */
178 size_t bytes = s*nm; 218 size_t bytes = s*nm;
179 bool rp = parse((const char *)p,bytes,false); 219 const char *inbuf = (const char*)p;
220 if(xmode&xmode_html) {
221 size_t mbts = save_html.capacity()-save_html.size();
222 size_t bts = 0;
223 if(mbts>0) {
224 bts = (bytes>mbts)?mbts:bytes;
225 save_html.append(inbuf,bts);
226 }
227 if(skipping<0) return bts;
228 }
229 if(skipping<0) return 0;
230 bool rp = parse(inbuf,bytes,false);
180 if(!rp) { 231 if(!rp) {
232 parser_choked = true;
181 skipping = -1; 233 skipping = -1;
182 bytes = 0; 234 if(!(xmode&xmode_html))
235 bytes = 0;
183 } 236 }
184 return bytes; 237 return bytes;
185 } 238 }
186 size_t header(void *p,size_t s,size_t nm) { 239 size_t header(void *p,size_t s,size_t nm) {
187 size_t bytes = s*nm; 240 size_t bytes = s*nm;
188 const char *h = (const char*)p; 241 const char *h = (const char*)p;
189 const char *colon = (const char*)memchr(p,':',bytes); 242 const char *colon = (const char*)memchr(p,':',bytes);
190 const char *space = (const char*)memchr(p,' ',bytes); 243 const char *space = (const char*)memchr(p,' ',bytes);
191 if(space && ( (!colon) || space<colon ) ) { 244 if(space && ( (!colon) || space<colon ) ) {
192 xrds_location.clear(); http_content_type.clear(); 245 xrds_location.clear(); http_content_type.clear();
193 }else if(colon) { 246 }else if(colon) {
194 const char *hv = ++colon; 247 const char *hv = ++colon;
195 int hnl = colon-h; 248 int hnl = colon-h;
196 int rb; 249 int rb;
197 for(rb = bytes-hnl-1;rb>0 && isspace(*hv);++hv,--rb); 250 for(rb = bytes-hnl-1;rb>0 && isspace(*hv);++hv,--rb);
198 while(rb>0 && isspace(hv[rb-1])) --rb; 251 while(rb>0 && isspace(hv[rb-1])) --rb;
199 if(rb) { 252 if(rb) {
200 if( (hnl>=sizeof(XRDS_HEADER)) 253 if( (hnl>=sizeof(XRDS_HEADER))
201 && !strncasecmp(h,XRDS_HEADER":", 254 && !strncasecmp(h,XRDS_HEADER":",
202 sizeof(XRDS_HEADER)) ) { 255 sizeof(XRDS_HEADER)) ) {
203 xrds_location.assign(hv,rb); 256 xrds_location.assign(hv,rb);
204 }else if( (hnl>=sizeof(CT_HEADER)) 257 }else if( (hnl>=sizeof(CT_HEADER))
205 && !strncasecmp(h,CT_HEADER":", 258 && !strncasecmp(h,CT_HEADER":",
206 sizeof(CT_HEADER)) ) { 259 sizeof(CT_HEADER)) ) {
207 const char *sc = (const char*)memchr( 260 const char *sc = (const char*)memchr(
208 hv,';',rb); 261 hv,';',rb);
209 http_content_type.assign(hv,sc?(sc-hv):rb); 262 http_content_type.assign(hv,sc?(sc-hv):rb);
210 } 263 }
211 } 264 }
212 } 265 }
213 return curl_t::header(p,s,nm); 266 return curl_t::header(p,s,nm);
214 } 267 }
215 268
216 void start_element(const XML_Char *n,const XML_Char **a) { 269 void start_element(const XML_Char *n,const XML_Char **a) {
217 if(skipping<0) return; 270 if(skipping<0) return;
218 if(skipping) { 271 if(skipping) {
219 if(xmode&xmode_html) 272 if(xmode&xmode_html)
220 html_start_element(n,a); 273 html_start_element(n,a);
221 ++skipping; return; 274 ++skipping; return;
222 } 275 }
223 if(pt_stack.empty()) { 276 if(pt_stack.empty()) {
224 if(is_qelement(n,NSURI_XRDS "\tXRDS")) 277 if(is_qelement(n,NSURI_XRDS "\tXRDS"))
225 return; 278 return;
226 if(is_qelement(n,NSURI_XRD "\tXRD")) { 279 if(is_qelement(n,NSURI_XRD "\tXRD")) {
227 assert(xrd); 280 assert(xrd);
228 xrd->clear(); 281 xrd->clear();
229 pt_stack.push_back(n); 282 pt_stack.push_back(n);
230 }else if(xmode&xmode_html) { 283 }else if(xmode&xmode_html) {