summaryrefslogtreecommitdiffabout
path: root/lib/discovery.cc
authorMichael Krelin <hacker@klever.net>2008-01-05 21:47:04 (UTC)
committer Michael Krelin <hacker@klever.net>2008-01-05 22:03:59 (UTC)
commit76f52a8fd79dd12680752c017d67d4be01f0afbc (patch) (unidiff)
tree42d640112a0381707d36ac3d72e48937978d911e /lib/discovery.cc
parenta0719fb611507d8b9962b87c600855d8837fc266 (diff)
downloadlibopkele-76f52a8fd79dd12680752c017d67d4be01f0afbc.zip
libopkele-76f52a8fd79dd12680752c017d67d4be01f0afbc.tar.gz
libopkele-76f52a8fd79dd12680752c017d67d4be01f0afbc.tar.bz2
Made HTML discovery more robust by using htmltidy
Now, when parsing a document that we expect might be HTML, we also save the first 16K of the document to a buffer; if the parser chokes, we run the saved data through htmltidy and feed the output to the parser again. Signed-off-by: Michael Krelin <hacker@klever.net>
Diffstat (limited to 'lib/discovery.cc') (more/less context) (ignore whitespace changes)
-rw-r--r--lib/discovery.cc75
1 files changed, 64 insertions, 11 deletions
diff --git a/lib/discovery.cc b/lib/discovery.cc
index 8729cfb..a308b56 100644
--- a/lib/discovery.cc
+++ b/lib/discovery.cc
@@ -1,37 +1,39 @@
1#include <list> 1#include <list>
2#include <opkele/curl.h> 2#include <opkele/curl.h>
3#include <opkele/expat.h> 3#include <opkele/expat.h>
4#include <opkele/uris.h> 4#include <opkele/uris.h>
5#include <opkele/discovery.h> 5#include <opkele/discovery.h>
6#include <opkele/exception.h> 6#include <opkele/exception.h>
7#include <opkele/util.h> 7#include <opkele/util.h>
8#include <opkele/tidy.h>
8#include <opkele/debug.h> 9#include <opkele/debug.h>
9 10
10#include "config.h" 11#include "config.h"
11 12
12#define XRDS_HEADER "X-XRDS-Location" 13#define XRDS_HEADER "X-XRDS-Location"
13#define CT_HEADER "Content-Type" 14#define CT_HEADER "Content-Type"
14 15
15namespace opkele { 16namespace opkele {
16 using std::list; 17 using std::list;
17 using xrd::XRD_t; 18 using xrd::XRD_t;
18 using xrd::service_t; 19 using xrd::service_t;
19 20
20 static const char *whitespace = " \t\r\n"; 21 static const char *whitespace = " \t\r\n";
21 static const char *i_leaders = "=@+$!("; 22 static const char *i_leaders = "=@+$!(";
23 static const size_t max_html = 16384;
22 24
23 static inline bool is_qelement(const XML_Char *n,const char *qen) { 25 static inline bool is_qelement(const XML_Char *n,const char *qen) {
24 return !strcasecmp(n,qen); 26 return !strcasecmp(n,qen);
25 } 27 }
26 static inline bool is_element(const XML_Char *n,const char *en) { 28 static inline bool is_element(const XML_Char *n,const char *en) {
27 if(!strcasecmp(n,en)) return true; 29 if(!strcasecmp(n,en)) return true;
28 int nl = strlen(n), enl = strlen(en); 30 int nl = strlen(n), enl = strlen(en);
29 if( (nl>=(enl+1)) && n[nl-enl-1]=='\t' 31 if( (nl>=(enl+1)) && n[nl-enl-1]=='\t'
30 && !strcasecmp(&n[nl-enl],en) ) 32 && !strcasecmp(&n[nl-enl],en) )
31 return true; 33 return true;
32 return false; 34 return false;
33 } 35 }
34 36
35 static long element_priority(const XML_Char **a) { 37 static long element_priority(const XML_Char **a) {
36 for(;*a;++a) 38 for(;*a;++a)
37 if(!strcasecmp(*(a++),"priority")) { 39 if(!strcasecmp(*(a++),"priority")) {
@@ -48,32 +50,34 @@ namespace opkele {
48 enum { 50 enum {
49 xmode_html = 1, xmode_xrd = 2 51 xmode_html = 1, xmode_xrd = 2
50 }; 52 };
51 int xmode; 53 int xmode;
52 54
53 string xrds_location; 55 string xrds_location;
54 string http_content_type; 56 string http_content_type;
55 service_t html_openid1; 57 service_t html_openid1;
56 service_t html_openid2; 58 service_t html_openid2;
57 string cdata_buf; 59 string cdata_buf;
58 long status_code; 60 long status_code;
59 string status_string; 61 string status_string;
60 62
61 typedef list<string> pt_stack_t; 63 typedef list<string> pt_stack_t;
62 pt_stack_t pt_stack; 64 pt_stack_t pt_stack;
63 int skipping; 65 int skipping;
66 bool parser_choked;
67 string save_html;
64 68
65 XRD_t *xrd; 69 XRD_t *xrd;
66 service_t *xrd_service; 70 service_t *xrd_service;
67 string* cdata; 71 string* cdata;
68 72
69 idigger_t() 73 idigger_t()
70 : util::curl_t(easy_init()), 74 : util::curl_t(easy_init()),
71 util::expat_t(0), 75 util::expat_t(0),
72 xri_proxy(XRI_PROXY_URL) { 76 xri_proxy(XRI_PROXY_URL) {
73 CURLcode r; 77 CURLcode r;
74 (r=misc_sets()) 78 (r=misc_sets())
75 || (r=set_write()) 79 || (r=set_write())
76 || (r=set_header()) 80 || (r=set_header())
77 ; 81 ;
78 if(r) 82 if(r)
79 throw exception_curl(OPKELE_CP_ "failed to set curly options",r); 83 throw exception_curl(OPKELE_CP_ "failed to set curly options",r);
@@ -127,72 +131,121 @@ namespace opkele {
127 result.canonicalized_id = util::rfc_3986_normalize_uri(eu); /* XXX: strip fragment part? */ 131 result.canonicalized_id = util::rfc_3986_normalize_uri(eu); /* XXX: strip fragment part? */
128 if(xrds_location.empty()) { 132 if(xrds_location.empty()) {
129 html2xrd(result.xrd); 133 html2xrd(result.xrd);
130 }else{ 134 }else{
131 discover_at(result,xrds_location,xmode_xrd); 135 discover_at(result,xrds_location,xmode_xrd);
132 if(result.xrd.empty()) 136 if(result.xrd.empty())
133 html2xrd(result.xrd); 137 html2xrd(result.xrd);
134 } 138 }
135 } 139 }
136 } 140 }
137 141
138 void discover_at(idiscovery_t& result,const string& url,int xm) { 142 void discover_at(idiscovery_t& result,const string& url,int xm) {
139 CURLcode r = easy_setopt(CURLOPT_URL,url.c_str()); 143 CURLcode r = easy_setopt(CURLOPT_URL,url.c_str());
140 if(r) 144 if(r)
141 throw exception_curl(OPKELE_CP_ "failed to set culry urlie",r); 145 throw exception_curl(OPKELE_CP_ "failed to set culry urlie",r);
142 146
143 (*(expat_t*)this) = parser_create_ns();
144 set_user_data(); set_element_handler();
145 set_character_data_handler();
146
147 http_content_type.clear(); 147 http_content_type.clear();
148 xmode = xm; 148 xmode = xm;
149 prepare_to_parse();
149 if(xmode&xmode_html) { 150 if(xmode&xmode_html) {
150 xrds_location.clear(); 151 xrds_location.clear();
151 html_openid1.clear(); html_openid2.clear(); 152 save_html.clear();
153 save_html.reserve(max_html);
152 } 154 }
153 xrd = &result.xrd; 155 xrd = &result.xrd;
154 cdata = 0; xrd_service = 0; skipping = 0;
155 status_code = 100; status_string.clear();
156 156
157 r = easy_perform(); 157 r = easy_perform();
158 if(r && r!=CURLE_WRITE_ERROR) 158 if(r && r!=CURLE_WRITE_ERROR)
159 throw exception_curl(OPKELE_CP_ "failed to perform curly request",r); 159 throw exception_curl(OPKELE_CP_ "failed to perform curly request",r);
160 160
161 parse(0,0,true); 161 if(!parser_choked) {
162 parse(0,0,true);
163 }else{
164 /* TODO: do not bother if we've seen xml */
165 try {
166 util::tidy_doc_t td = util::tidy_doc_t::create();
167 if(!td)
168 throw exception_tidy(OPKELE_CP_ "failed to create htmltidy document");
169#ifndef NDEBUG
170 td.opt_set(TidyQuiet,false);
171 td.opt_set(TidyShowWarnings,false);
172 td.opt_set(TidyForceOutput,true);
173 td.opt_set(TidyXhtmlOut,true);
174 td.opt_set(TidyDoctypeMode,TidyDoctypeOmit);
175 td.opt_set(TidyMark,false);
176#endif /* NDEBUG */
177 if(td.parse_string(save_html)<=0)
178 throw exception_tidy(OPKELE_CP_ "tidy failed to parse document");
179 if(td.clean_and_repair()<=0)
180 throw exception_tidy(OPKELE_CP_ "tidy failed to clean and repair");
181 util::tidy_buf_t tide;
182 if(td.save_buffer(tide)<=0)
183 throw exception_tidy(OPKELE_CP_ "tidy failed to save buffer");
184 prepare_to_parse();
185 parse(tide.c_str(),tide.size(),true);
186 }catch(exception_tidy& et) { }
187 }
188 save_html.clear();
189 }
190
191 void prepare_to_parse() {
192 (*(expat_t*)this) = parser_create_ns();
193 set_user_data(); set_element_handler();
194 set_character_data_handler();
195
196 if(xmode&xmode_html) {
197 html_openid1.clear(); html_openid2.clear();
198 parser_choked = false;
199 }
200
201 cdata = 0; xrd_service = 0; skipping = 0;
202 status_code = 100; status_string.clear();
162 } 203 }
163 204
164 void html2xrd(XRD_t& x) { 205 void html2xrd(XRD_t& x) {
165 if(!html_openid1.uris.empty()) { 206 if(!html_openid1.uris.empty()) {
166 html_openid1.types.insert(STURI_OPENID11); 207 html_openid1.types.insert(STURI_OPENID11);
167 x.services.add(-1,html_openid1); 208 x.services.add(-1,html_openid1);
168 } 209 }
169 if(!html_openid2.uris.empty()) { 210 if(!html_openid2.uris.empty()) {
170 html_openid2.types.insert(STURI_OPENID20); 211 html_openid2.types.insert(STURI_OPENID20);
171 x.services.add(-1,html_openid2); 212 x.services.add(-1,html_openid2);
172 } 213 }
173 } 214 }
174 215
175 size_t write(void *p,size_t s,size_t nm) { 216 size_t write(void *p,size_t s,size_t nm) {
176 if(skipping<0) return 0;
177 /* TODO: limit total size */ 217 /* TODO: limit total size */
178 size_t bytes = s*nm; 218 size_t bytes = s*nm;
179 bool rp = parse((const char *)p,bytes,false); 219 const char *inbuf = (const char*)p;
220 if(xmode&xmode_html) {
221 size_t mbts = save_html.capacity()-save_html.size();
222 size_t bts = 0;
223 if(mbts>0) {
224 bts = (bytes>mbts)?mbts:bytes;
225 save_html.append(inbuf,bts);
226 }
227 if(skipping<0) return bts;
228 }
229 if(skipping<0) return 0;
230 bool rp = parse(inbuf,bytes,false);
180 if(!rp) { 231 if(!rp) {
232 parser_choked = true;
181 skipping = -1; 233 skipping = -1;
182 bytes = 0; 234 if(!(xmode&xmode_html))
235 bytes = 0;
183 } 236 }
184 return bytes; 237 return bytes;
185 } 238 }
186 size_t header(void *p,size_t s,size_t nm) { 239 size_t header(void *p,size_t s,size_t nm) {
187 size_t bytes = s*nm; 240 size_t bytes = s*nm;
188 const char *h = (const char*)p; 241 const char *h = (const char*)p;
189 const char *colon = (const char*)memchr(p,':',bytes); 242 const char *colon = (const char*)memchr(p,':',bytes);
190 const char *space = (const char*)memchr(p,' ',bytes); 243 const char *space = (const char*)memchr(p,' ',bytes);
191 if(space && ( (!colon) || space<colon ) ) { 244 if(space && ( (!colon) || space<colon ) ) {
192 xrds_location.clear(); http_content_type.clear(); 245 xrds_location.clear(); http_content_type.clear();
193 }else if(colon) { 246 }else if(colon) {
194 const char *hv = ++colon; 247 const char *hv = ++colon;
195 int hnl = colon-h; 248 int hnl = colon-h;
196 int rb; 249 int rb;
197 for(rb = bytes-hnl-1;rb>0 && isspace(*hv);++hv,--rb); 250 for(rb = bytes-hnl-1;rb>0 && isspace(*hv);++hv,--rb);
198 while(rb>0 && isspace(hv[rb-1])) --rb; 251 while(rb>0 && isspace(hv[rb-1])) --rb;