-rw-r--r-- | lib/discovery.cc | 446 |
1 files changed, 446 insertions, 0 deletions
diff --git a/lib/discovery.cc b/lib/discovery.cc new file mode 100644 index 0000000..d868308 --- a/dev/null +++ b/lib/discovery.cc | |||
@@ -0,0 +1,446 @@ | |||
1 | #include <list> | ||
2 | #include <opkele/curl.h> | ||
3 | #include <opkele/expat.h> | ||
4 | #include <opkele/uris.h> | ||
5 | #include <opkele/discovery.h> | ||
6 | #include <opkele/exception.h> | ||
7 | #include <opkele/util.h> | ||
8 | #include <opkele/tidy.h> | ||
9 | #include <opkele/debug.h> | ||
10 | |||
11 | #include "config.h" | ||
12 | |||
13 | #define XRDS_HEADER "X-XRDS-Location" | ||
14 | #define CT_HEADER "Content-Type" | ||
15 | |||
16 | namespace opkele { | ||
17 | using std::list; | ||
18 | using xrd::XRD_t; | ||
19 | using xrd::service_t; | ||
20 | |||
21 | static const char *whitespace = " \t\r\n"; | ||
22 | static const char *i_leaders = "=@+$!("; | ||
23 | static const size_t max_html = 16384; | ||
24 | |||
25 | static inline bool is_qelement(const XML_Char *n,const char *qen) { | ||
26 | return !strcasecmp(n,qen); | ||
27 | } | ||
28 | static inline bool is_element(const XML_Char *n,const char *en) { | ||
29 | if(!strcasecmp(n,en)) return true; | ||
30 | int nl = strlen(n), enl = strlen(en); | ||
31 | if( (nl>=(enl+1)) && n[nl-enl-1]=='\t' | ||
32 | && !strcasecmp(&n[nl-enl],en) ) | ||
33 | return true; | ||
34 | return false; | ||
35 | } | ||
36 | |||
37 | static long element_priority(const XML_Char **a) { | ||
38 | for(;*a;++a) | ||
39 | if(!strcasecmp(*(a++),"priority")) { | ||
40 | long rv; | ||
41 | return (sscanf(*a,"%ld",&rv)==1)?rv:-1; | ||
42 | } | ||
43 | return -1; | ||
44 | } | ||
45 | |||
46 | class idigger_t : public util::curl_t, public util::expat_t { | ||
47 | public: | ||
48 | string xri_proxy; | ||
49 | |||
50 | enum { | ||
51 | xmode_html = 1, xmode_xrd = 2 | ||
52 | }; | ||
53 | int xmode; | ||
54 | |||
55 | string xrds_location; | ||
56 | string http_content_type; | ||
57 | service_t html_openid1; | ||
58 | service_t html_openid2; | ||
59 | string cdata_buf; | ||
60 | long status_code; | ||
61 | string status_string; | ||
62 | |||
63 | typedef list<string> pt_stack_t; | ||
64 | pt_stack_t pt_stack; | ||
65 | int skipping; | ||
66 | bool parser_choked; | ||
67 | string save_html; | ||
68 | |||
69 | XRD_t *xrd; | ||
70 | service_t *xrd_service; | ||
71 | string* cdata; | ||
72 | |||
73 | idigger_t() | ||
74 | : util::curl_t(easy_init()), | ||
75 | util::expat_t(0), | ||
76 | xri_proxy(XRI_PROXY_URL) { | ||
77 | CURLcode r; | ||
78 | (r=misc_sets()) | ||
79 | || (r=set_write()) | ||
80 | || (r=set_header()) | ||
81 | ; | ||
82 | if(r) | ||
83 | throw exception_curl(OPKELE_CP_ "failed to set curly options",r); | ||
84 | } | ||
85 | ~idigger_t() throw() { } | ||
86 | |||
87 | void discover(idiscovery_t& result,const string& identity) { | ||
88 | result.clear(); | ||
89 | string::size_type fsc = identity.find_first_not_of(whitespace); | ||
90 | if(fsc==string::npos) | ||
91 | throw bad_input(OPKELE_CP_ "whtiespace-only identity"); | ||
92 | string::size_type lsc = identity.find_last_not_of(whitespace); | ||
93 | assert(lsc!=string::npos); | ||
94 | if(!strncasecmp(identity.c_str()+fsc,"xri://",sizeof("xri://")-1)) | ||
95 | fsc += sizeof("xri://")-1; | ||
96 | if((fsc+1)>=lsc) | ||
97 | throw bad_input(OPKELE_CP_ "not a character of importance in identity"); | ||
98 | string id(identity,fsc,lsc-fsc+1); | ||
99 | if(strchr(i_leaders,id[0])) { | ||
100 | result.normalized_id = id; | ||
101 | result.xri_identity = true; | ||
102 | /* TODO: further canonicalize xri identity? Like folding case or whatever... */ | ||
103 | discover_at( | ||
104 | result, | ||
105 | xri_proxy + util::url_encode(id)+ | ||
106 | "?_xrd_r=application/xrd+xml;sep=false", xmode_xrd); | ||
107 | if(status_code!=100) | ||
108 | throw failed_xri_resolution(OPKELE_CP_ | ||
109 | "XRI resolution failed with '"+status_string+"' message",status_code); | ||
110 | if(result.xrd.canonical_ids.empty()) | ||
111 | throw opkele::failed_discovery(OPKELE_CP_ "No CanonicalID for XRI identity found"); | ||
112 | result.canonicalized_id = result.xrd.canonical_ids.begin()->second; | ||
113 | }else{ | ||
114 | result.xri_identity = false; | ||
115 | if(id.find("://")==string::npos) | ||
116 | id.insert(0,"http://"); | ||
117 | string::size_type fp = id.find('#'); | ||
118 | if(fp!=string::npos) { | ||
119 | string::size_type qp = id.find('?'); | ||
120 | if(qp==string::npos || qp<fp) | ||
121 | id.erase(fp); | ||
122 | else if(qp>fp) | ||
123 | id.erase(fp,qp-fp); | ||
124 | } | ||
125 | result.normalized_id = util::rfc_3986_normalize_uri(id); | ||
126 | discover_at(result,id,xmode_html|xmode_xrd); | ||
127 | const char * eu = 0; | ||
128 | CURLcode r = easy_getinfo(CURLINFO_EFFECTIVE_URL,&eu); | ||
129 | if(r) | ||
130 | throw exception_curl(OPKELE_CP_ "failed to get CURLINFO_EFFECTIVE_URL",r); | ||
131 | result.canonicalized_id = util::rfc_3986_normalize_uri(eu); /* XXX: strip fragment part? */ | ||
132 | if(xrds_location.empty()) { | ||
133 | html2xrd(result.xrd); | ||
134 | }else{ | ||
135 | discover_at(result,xrds_location,xmode_xrd); | ||
136 | if(result.xrd.empty()) | ||
137 | html2xrd(result.xrd); | ||
138 | } | ||
139 | } | ||
140 | } | ||
141 | |||
142 | void discover_at(idiscovery_t& result,const string& url,int xm) { | ||
143 | CURLcode r = easy_setopt(CURLOPT_URL,url.c_str()); | ||
144 | if(r) | ||
145 | throw exception_curl(OPKELE_CP_ "failed to set culry urlie",r); | ||
146 | |||
147 | http_content_type.clear(); | ||
148 | xmode = xm; | ||
149 | prepare_to_parse(); | ||
150 | if(xmode&xmode_html) { | ||
151 | xrds_location.clear(); | ||
152 | save_html.clear(); | ||
153 | save_html.reserve(max_html); | ||
154 | } | ||
155 | xrd = &result.xrd; | ||
156 | |||
157 | r = easy_perform(); | ||
158 | if(r && r!=CURLE_WRITE_ERROR) | ||
159 | throw exception_curl(OPKELE_CP_ "failed to perform curly request",r); | ||
160 | |||
161 | if(!parser_choked) { | ||
162 | parse(0,0,true); | ||
163 | }else{ | ||
164 | /* TODO: do not bother if we've seen xml */ | ||
165 | try { | ||
166 | util::tidy_doc_t td = util::tidy_doc_t::create(); | ||
167 | if(!td) | ||
168 | throw exception_tidy(OPKELE_CP_ "failed to create htmltidy document"); | ||
169 | #ifndef NDEBUG | ||
170 | td.opt_set(TidyQuiet,false); | ||
171 | td.opt_set(TidyShowWarnings,false); | ||
172 | #endif /* NDEBUG */ | ||
173 | td.opt_set(TidyForceOutput,true); | ||
174 | td.opt_set(TidyXhtmlOut,true); | ||
175 | td.opt_set(TidyDoctypeMode,TidyDoctypeOmit); | ||
176 | td.opt_set(TidyMark,false); | ||
177 | if(td.parse_string(save_html)<=0) | ||
178 | throw exception_tidy(OPKELE_CP_ "tidy failed to parse document"); | ||
179 | if(td.clean_and_repair()<=0) | ||
180 | throw exception_tidy(OPKELE_CP_ "tidy failed to clean and repair"); | ||
181 | util::tidy_buf_t tide; | ||
182 | if(td.save_buffer(tide)<=0) | ||
183 | throw exception_tidy(OPKELE_CP_ "tidy failed to save buffer"); | ||
184 | prepare_to_parse(); | ||
185 | parse(tide.c_str(),tide.size(),true); | ||
186 | }catch(exception_tidy& et) { } | ||
187 | } | ||
188 | save_html.clear(); | ||
189 | } | ||
190 | |||
191 | void prepare_to_parse() { | ||
192 | (*(expat_t*)this) = parser_create_ns(); | ||
193 | set_user_data(); set_element_handler(); | ||
194 | set_character_data_handler(); | ||
195 | |||
196 | if(xmode&xmode_html) { | ||
197 | html_openid1.clear(); html_openid2.clear(); | ||
198 | parser_choked = false; | ||
199 | } | ||
200 | |||
201 | cdata = 0; xrd_service = 0; skipping = 0; | ||
202 | status_code = 100; status_string.clear(); | ||
203 | } | ||
204 | |||
205 | void html2xrd(XRD_t& x) { | ||
206 | if(!html_openid1.uris.empty()) { | ||
207 | html_openid1.types.insert(STURI_OPENID11); | ||
208 | x.services.add(-1,html_openid1); | ||
209 | } | ||
210 | if(!html_openid2.uris.empty()) { | ||
211 | html_openid2.types.insert(STURI_OPENID20); | ||
212 | x.services.add(-1,html_openid2); | ||
213 | } | ||
214 | } | ||
215 | |||
216 | size_t write(void *p,size_t s,size_t nm) { | ||
217 | /* TODO: limit total size */ | ||
218 | size_t bytes = s*nm; | ||
219 | const char *inbuf = (const char*)p; | ||
220 | if(xmode&xmode_html) { | ||
221 | size_t mbts = save_html.capacity()-save_html.size(); | ||
222 | size_t bts = 0; | ||
223 | if(mbts>0) { | ||
224 | bts = (bytes>mbts)?mbts:bytes; | ||
225 | save_html.append(inbuf,bts); | ||
226 | } | ||
227 | if(skipping<0) return bts; | ||
228 | } | ||
229 | if(skipping<0) return 0; | ||
230 | bool rp = parse(inbuf,bytes,false); | ||
231 | if(!rp) { | ||
232 | parser_choked = true; | ||
233 | skipping = -1; | ||
234 | if(!(xmode&xmode_html)) | ||
235 | bytes = 0; | ||
236 | } | ||
237 | return bytes; | ||
238 | } | ||
239 | size_t header(void *p,size_t s,size_t nm) { | ||
240 | size_t bytes = s*nm; | ||
241 | const char *h = (const char*)p; | ||
242 | const char *colon = (const char*)memchr(p,':',bytes); | ||
243 | const char *space = (const char*)memchr(p,' ',bytes); | ||
244 | if(space && ( (!colon) || space<colon ) ) { | ||
245 | xrds_location.clear(); http_content_type.clear(); | ||
246 | }else if(colon) { | ||
247 | const char *hv = ++colon; | ||
248 | int hnl = colon-h; | ||
249 | int rb; | ||
250 | for(rb = bytes-hnl-1;rb>0 && isspace(*hv);++hv,--rb); | ||
251 | while(rb>0 && isspace(hv[rb-1])) --rb; | ||
252 | if(rb) { | ||
253 | if( (hnl>=sizeof(XRDS_HEADER)) | ||
254 | && !strncasecmp(h,XRDS_HEADER":", | ||
255 | sizeof(XRDS_HEADER)) ) { | ||
256 | xrds_location.assign(hv,rb); | ||
257 | }else if( (hnl>=sizeof(CT_HEADER)) | ||
258 | && !strncasecmp(h,CT_HEADER":", | ||
259 | sizeof(CT_HEADER)) ) { | ||
260 | const char *sc = (const char*)memchr( | ||
261 | hv,';',rb); | ||
262 | http_content_type.assign(hv,sc?(sc-hv):rb); | ||
263 | } | ||
264 | } | ||
265 | } | ||
266 | return curl_t::header(p,s,nm); | ||
267 | } | ||
268 | |||
269 | void start_element(const XML_Char *n,const XML_Char **a) { | ||
270 | if(skipping<0) return; | ||
271 | if(skipping) { | ||
272 | if(xmode&xmode_html) | ||
273 | html_start_element(n,a); | ||
274 | ++skipping; return; | ||
275 | } | ||
276 | if(pt_stack.empty()) { | ||
277 | if(is_qelement(n,NSURI_XRDS "\tXRDS")) | ||
278 | return; | ||
279 | if(is_qelement(n,NSURI_XRD "\tXRD")) { | ||
280 | assert(xrd); | ||
281 | xrd->clear(); | ||
282 | pt_stack.push_back(n); | ||
283 | }else if(xmode&xmode_html) { | ||
284 | html_start_element(n,a); | ||
285 | }else{ | ||
286 | skipping = -1; | ||
287 | } | ||
288 | }else{ | ||
289 | int pt_s = pt_stack.size(); | ||
290 | if(pt_s==1) { | ||
291 | if(is_qelement(n,NSURI_XRD "\tCanonicalID")) { | ||
292 | assert(xrd); | ||
293 | cdata = &(xrd->canonical_ids.add(element_priority(a),string())); | ||
294 | }else if(is_qelement(n,NSURI_XRD "\tLocalID")) { | ||
295 | assert(xrd); | ||
296 | cdata = &(xrd->local_ids.add(element_priority(a),string())); | ||
297 | }else if(is_qelement(n,NSURI_XRD "\tProviderID")) { | ||
298 | assert(xrd); | ||
299 | cdata = &(xrd->provider_id); | ||
300 | }else if(is_qelement(n,NSURI_XRD "\tService")) { | ||
301 | assert(xrd); | ||
302 | xrd_service = &(xrd->services.add(element_priority(a), | ||
303 | service_t())); | ||
304 | pt_stack.push_back(n); | ||
305 | }else if(is_qelement(n,NSURI_XRD "\tStatus")) { | ||
306 | for(;*a;) { | ||
307 | if(!strcasecmp(*(a++),"code")) { | ||
308 | if(sscanf(*(a++),"%ld",&status_code)==1 && status_code!=100) { | ||
309 | cdata = &status_string; | ||
310 | pt_stack.push_back(n); | ||
311 | break; | ||
312 | } | ||
313 | } | ||
314 | } | ||
315 | }else if(is_qelement(n,NSURI_XRD "\tExpires")) { | ||
316 | assert(xrd); | ||
317 | cdata_buf.clear(); | ||
318 | cdata = &cdata_buf; | ||
319 | }else if(xmode&xmode_html) { | ||
320 | html_start_element(n,a); | ||
321 | }else{ | ||
322 | skipping = 1; | ||
323 | } | ||
324 | }else if(pt_s==2) { | ||
325 | if(is_qelement(pt_stack.back().c_str(), NSURI_XRD "\tService")) { | ||
326 | if(is_qelement(n,NSURI_XRD "\tType")) { | ||
327 | assert(xrd); assert(xrd_service); | ||
328 | cdata_buf.clear(); | ||
329 | cdata = &cdata_buf; | ||
330 | }else if(is_qelement(n,NSURI_XRD "\tURI")) { | ||
331 | assert(xrd); assert(xrd_service); | ||
332 | cdata = &(xrd_service->uris.add(element_priority(a),string())); | ||
333 | }else if(is_qelement(n,NSURI_XRD "\tLocalID") | ||
334 | || is_qelement(n,NSURI_OPENID10 "\tDelegate") ) { | ||
335 | assert(xrd); assert(xrd_service); | ||
336 | cdata = &(xrd_service->local_ids.add(element_priority(a),string())); | ||
337 | }else if(is_qelement(n,NSURI_XRD "\tProviderID")) { | ||
338 | assert(xrd); assert(xrd_service); | ||
339 | cdata = &(xrd_service->provider_id); | ||
340 | }else{ | ||
341 | skipping = 1; | ||
342 | } | ||
343 | }else | ||
344 | skipping = 1; | ||
345 | }else if(xmode&xmode_html) { | ||
346 | html_start_element(n,a); | ||
347 | }else{ | ||
348 | skipping = 1; | ||
349 | } | ||
350 | } | ||
351 | } | ||
352 | void end_element(const XML_Char *n) { | ||
353 | if(skipping<0) return; | ||
354 | if(skipping) { | ||
355 | --skipping; return; | ||
356 | } | ||
357 | if(is_qelement(n,NSURI_XRD "\tType")) { | ||
358 | assert(xrd); assert(xrd_service); assert(cdata==&cdata_buf); | ||
359 | xrd_service->types.insert(cdata_buf); | ||
360 | }else if(is_qelement(n,NSURI_XRD "\tService")) { | ||
361 | assert(xrd); assert(xrd_service); | ||
362 | assert(!pt_stack.empty()); | ||
363 | assert(pt_stack.back()==(NSURI_XRD "\tService")); | ||
364 | pt_stack.pop_back(); | ||
365 | xrd_service = 0; | ||
366 | }else if(is_qelement(n,NSURI_XRD "\tStatus")) { | ||
367 | assert(xrd); | ||
368 | if(is_qelement(pt_stack.back().c_str(),n)) { | ||
369 | assert(cdata==&status_string); | ||
370 | pt_stack.pop_back(); | ||
371 | if(status_code!=100) | ||
372 | skipping = -1; | ||
373 | } | ||
374 | }else if(is_qelement(n,NSURI_XRD "\tExpires")) { | ||
375 | assert(xrd); | ||
376 | xrd->expires = util::w3c_to_time(cdata_buf); | ||
377 | }else if((xmode&xmode_html) && is_element(n,"head")) { | ||
378 | skipping = -1; | ||
379 | } | ||
380 | cdata = 0; | ||
381 | } | ||
382 | void character_data(const XML_Char *s,int l) { | ||
383 | if(skipping) return; | ||
384 | if(cdata) cdata->append(s,l); | ||
385 | } | ||
386 | |||
387 | void html_start_element(const XML_Char *n,const XML_Char **a) { | ||
388 | if(is_element(n,"meta")) { | ||
389 | bool heq = false; | ||
390 | string l; | ||
391 | for(;*a;a+=2) { | ||
392 | if(!( strcasecmp(a[0],"http-equiv") | ||
393 | || strcasecmp(a[1],XRDS_HEADER) )) | ||
394 | heq = true; | ||
395 | else if(!strcasecmp(a[0],"content")) | ||
396 | l.assign(a[1]); | ||
397 | } | ||
398 | if(heq) | ||
399 | xrds_location = l; | ||
400 | }else if(is_element(n,"link")) { | ||
401 | string rels; | ||
402 | string href; | ||
403 | for(;*a;a+=2) { | ||
404 | if( !strcasecmp(a[0],"rel") ) { | ||
405 | rels.assign(a[1]); | ||
406 | }else if( !strcasecmp(a[0],"href") ) { | ||
407 | const char *ns = a[1]; | ||
408 | for(;*ns && isspace(*ns);++ns); | ||
409 | href.assign(ns); | ||
410 | string::size_type lns=href.find_last_not_of(whitespace); | ||
411 | href.erase(lns+1); | ||
412 | } | ||
413 | } | ||
414 | for(string::size_type ns=rels.find_first_not_of(whitespace); | ||
415 | ns!=string::npos; ns=rels.find_first_not_of(whitespace,ns)) { | ||
416 | string::size_type s = rels.find_first_of(whitespace,ns); | ||
417 | string rel; | ||
418 | if(s==string::npos) { | ||
419 | rel.assign(rels,ns,string::npos); | ||
420 | ns = string::npos; | ||
421 | }else{ | ||
422 | rel.assign(rels,ns,s-ns); | ||
423 | ns = s; | ||
424 | } | ||
425 | if(rel=="openid.server") | ||
426 | html_openid1.uris.add(-1,href); | ||
427 | else if(rel=="openid.delegate") | ||
428 | html_openid1.local_ids.add(-1,href); | ||
429 | else if(rel=="openid2.provider") | ||
430 | html_openid2.uris.add(-1,href); | ||
431 | else if(rel=="openid2.local_id") | ||
432 | html_openid2.local_ids.add(-1,href); | ||
433 | } | ||
434 | }else if(is_element(n,"body")) { | ||
435 | skipping = -1; | ||
436 | } | ||
437 | } | ||
438 | |||
439 | }; | ||
440 | |||
441 | void idiscover(idiscovery_t& result,const string& identity) { | ||
442 | idigger_t idigger; | ||
443 | idigger.discover(result,identity); | ||
444 | } | ||
445 | |||
446 | } | ||