author | Michael Krelin <hacker@klever.net> | 2008-01-05 21:47:04 (UTC) |
---|---|---|
committer | Michael Krelin <hacker@klever.net> | 2008-01-05 22:03:59 (UTC) |
commit | 76f52a8fd79dd12680752c017d67d4be01f0afbc (patch) (unidiff) | |
tree | 42d640112a0381707d36ac3d72e48937978d911e /lib/discovery.cc | |
parent | a0719fb611507d8b9962b87c600855d8837fc266 (diff) | |
download | libopkele-76f52a8fd79dd12680752c017d67d4be01f0afbc.zip libopkele-76f52a8fd79dd12680752c017d67d4be01f0afbc.tar.gz libopkele-76f52a8fd79dd12680752c017d67d4be01f0afbc.tar.bz2 |
Made HTML discovery more robust by using htmltidy.
Now, when parsing a document that we expect might be HTML, we also save the first
16K of the document to a buffer; if the parser chokes, we run the saved data
through htmltidy and feed the output to the parser again.
Signed-off-by: Michael Krelin <hacker@klever.net>
-rw-r--r-- | lib/discovery.cc | 75 |
1 files changed, 64 insertions, 11 deletions
diff --git a/lib/discovery.cc b/lib/discovery.cc index 8729cfb..a308b56 100644 --- a/lib/discovery.cc +++ b/lib/discovery.cc | |||
@@ -1,230 +1,283 @@ | |||
1 | #include <list> | 1 | #include <list> |
2 | #include <opkele/curl.h> | 2 | #include <opkele/curl.h> |
3 | #include <opkele/expat.h> | 3 | #include <opkele/expat.h> |
4 | #include <opkele/uris.h> | 4 | #include <opkele/uris.h> |
5 | #include <opkele/discovery.h> | 5 | #include <opkele/discovery.h> |
6 | #include <opkele/exception.h> | 6 | #include <opkele/exception.h> |
7 | #include <opkele/util.h> | 7 | #include <opkele/util.h> |
8 | #include <opkele/tidy.h> | ||
8 | #include <opkele/debug.h> | 9 | #include <opkele/debug.h> |
9 | 10 | ||
10 | #include "config.h" | 11 | #include "config.h" |
11 | 12 | ||
12 | #define XRDS_HEADER "X-XRDS-Location" | 13 | #define XRDS_HEADER "X-XRDS-Location" |
13 | #define CT_HEADER "Content-Type" | 14 | #define CT_HEADER "Content-Type" |
14 | 15 | ||
15 | namespace opkele { | 16 | namespace opkele { |
16 | using std::list; | 17 | using std::list; |
17 | using xrd::XRD_t; | 18 | using xrd::XRD_t; |
18 | using xrd::service_t; | 19 | using xrd::service_t; |
19 | 20 | ||
20 | static const char *whitespace = " \t\r\n"; | 21 | static const char *whitespace = " \t\r\n"; |
21 | static const char *i_leaders = "=@+$!("; | 22 | static const char *i_leaders = "=@+$!("; |
23 | static const size_t max_html = 16384; | ||
22 | 24 | ||
23 | static inline bool is_qelement(const XML_Char *n,const char *qen) { | 25 | static inline bool is_qelement(const XML_Char *n,const char *qen) { |
24 | return !strcasecmp(n,qen); | 26 | return !strcasecmp(n,qen); |
25 | } | 27 | } |
26 | static inline bool is_element(const XML_Char *n,const char *en) { | 28 | static inline bool is_element(const XML_Char *n,const char *en) { |
27 | if(!strcasecmp(n,en)) return true; | 29 | if(!strcasecmp(n,en)) return true; |
28 | int nl = strlen(n), enl = strlen(en); | 30 | int nl = strlen(n), enl = strlen(en); |
29 | if( (nl>=(enl+1)) && n[nl-enl-1]=='\t' | 31 | if( (nl>=(enl+1)) && n[nl-enl-1]=='\t' |
30 | && !strcasecmp(&n[nl-enl],en) ) | 32 | && !strcasecmp(&n[nl-enl],en) ) |
31 | return true; | 33 | return true; |
32 | return false; | 34 | return false; |
33 | } | 35 | } |
34 | 36 | ||
35 | static long element_priority(const XML_Char **a) { | 37 | static long element_priority(const XML_Char **a) { |
36 | for(;*a;++a) | 38 | for(;*a;++a) |
37 | if(!strcasecmp(*(a++),"priority")) { | 39 | if(!strcasecmp(*(a++),"priority")) { |
38 | long rv; | 40 | long rv; |
39 | return (sscanf(*a,"%ld",&rv)==1)?rv:-1; | 41 | return (sscanf(*a,"%ld",&rv)==1)?rv:-1; |
40 | } | 42 | } |
41 | return -1; | 43 | return -1; |
42 | } | 44 | } |
43 | 45 | ||
44 | class idigger_t : public util::curl_t, public util::expat_t { | 46 | class idigger_t : public util::curl_t, public util::expat_t { |
45 | public: | 47 | public: |
46 | string xri_proxy; | 48 | string xri_proxy; |
47 | 49 | ||
48 | enum { | 50 | enum { |
49 | xmode_html = 1, xmode_xrd = 2 | 51 | xmode_html = 1, xmode_xrd = 2 |
50 | }; | 52 | }; |
51 | int xmode; | 53 | int xmode; |
52 | 54 | ||
53 | string xrds_location; | 55 | string xrds_location; |
54 | string http_content_type; | 56 | string http_content_type; |
55 | service_t html_openid1; | 57 | service_t html_openid1; |
56 | service_t html_openid2; | 58 | service_t html_openid2; |
57 | string cdata_buf; | 59 | string cdata_buf; |
58 | long status_code; | 60 | long status_code; |
59 | string status_string; | 61 | string status_string; |
60 | 62 | ||
61 | typedef list<string> pt_stack_t; | 63 | typedef list<string> pt_stack_t; |
62 | pt_stack_t pt_stack; | 64 | pt_stack_t pt_stack; |
63 | int skipping; | 65 | int skipping; |
66 | bool parser_choked; | ||
67 | string save_html; | ||
64 | 68 | ||
65 | XRD_t *xrd; | 69 | XRD_t *xrd; |
66 | service_t *xrd_service; | 70 | service_t *xrd_service; |
67 | string* cdata; | 71 | string* cdata; |
68 | 72 | ||
69 | idigger_t() | 73 | idigger_t() |
70 | : util::curl_t(easy_init()), | 74 | : util::curl_t(easy_init()), |
71 | util::expat_t(0), | 75 | util::expat_t(0), |
72 | xri_proxy(XRI_PROXY_URL) { | 76 | xri_proxy(XRI_PROXY_URL) { |
73 | CURLcode r; | 77 | CURLcode r; |
74 | (r=misc_sets()) | 78 | (r=misc_sets()) |
75 | || (r=set_write()) | 79 | || (r=set_write()) |
76 | || (r=set_header()) | 80 | || (r=set_header()) |
77 | ; | 81 | ; |
78 | if(r) | 82 | if(r) |
79 | throw exception_curl(OPKELE_CP_ "failed to set curly options",r); | 83 | throw exception_curl(OPKELE_CP_ "failed to set curly options",r); |
80 | } | 84 | } |
81 | ~idigger_t() throw() { } | 85 | ~idigger_t() throw() { } |
82 | 86 | ||
83 | void discover(idiscovery_t& result,const string& identity) { | 87 | void discover(idiscovery_t& result,const string& identity) { |
84 | result.clear(); | 88 | result.clear(); |
85 | string::size_type fsc = identity.find_first_not_of(whitespace); | 89 | string::size_type fsc = identity.find_first_not_of(whitespace); |
86 | if(fsc==string::npos) | 90 | if(fsc==string::npos) |
87 | throw bad_input(OPKELE_CP_ "whtiespace-only identity"); | 91 | throw bad_input(OPKELE_CP_ "whtiespace-only identity"); |
88 | string::size_type lsc = identity.find_last_not_of(whitespace); | 92 | string::size_type lsc = identity.find_last_not_of(whitespace); |
89 | assert(lsc!=string::npos); | 93 | assert(lsc!=string::npos); |
90 | if(!strncasecmp(identity.c_str()+fsc,"xri://",sizeof("xri://")-1)) | 94 | if(!strncasecmp(identity.c_str()+fsc,"xri://",sizeof("xri://")-1)) |
91 | fsc += sizeof("xri://")-1; | 95 | fsc += sizeof("xri://")-1; |
92 | if((fsc+1)>=lsc) | 96 | if((fsc+1)>=lsc) |
93 | throw bad_input(OPKELE_CP_ "not a character of importance in identity"); | 97 | throw bad_input(OPKELE_CP_ "not a character of importance in identity"); |
94 | string id(identity,fsc,lsc-fsc+1); | 98 | string id(identity,fsc,lsc-fsc+1); |
95 | if(strchr(i_leaders,id[0])) { | 99 | if(strchr(i_leaders,id[0])) { |
96 | result.normalized_id = id; | 100 | result.normalized_id = id; |
97 | result.xri_identity = true; | 101 | result.xri_identity = true; |
98 | /* TODO: further canonicalize xri identity? Like folding case or whatever... */ | 102 | /* TODO: further canonicalize xri identity? Like folding case or whatever... */ |
99 | discover_at( | 103 | discover_at( |
100 | result, | 104 | result, |
101 | xri_proxy + util::url_encode(id)+ | 105 | xri_proxy + util::url_encode(id)+ |
102 | "?_xrd_r=application/xrd+xml;sep=false", xmode_xrd); | 106 | "?_xrd_r=application/xrd+xml;sep=false", xmode_xrd); |
103 | if(status_code!=100) | 107 | if(status_code!=100) |
104 | throw failed_xri_resolution(OPKELE_CP_ | 108 | throw failed_xri_resolution(OPKELE_CP_ |
105 | "XRI resolution failed with '"+status_string+"' message",status_code); | 109 | "XRI resolution failed with '"+status_string+"' message",status_code); |
106 | if(result.xrd.canonical_ids.empty()) | 110 | if(result.xrd.canonical_ids.empty()) |
107 | throw opkele::failed_discovery(OPKELE_CP_ "No CanonicalID for XRI identity found"); | 111 | throw opkele::failed_discovery(OPKELE_CP_ "No CanonicalID for XRI identity found"); |
108 | result.canonicalized_id = result.xrd.canonical_ids.begin()->second; | 112 | result.canonicalized_id = result.xrd.canonical_ids.begin()->second; |
109 | }else{ | 113 | }else{ |
110 | result.xri_identity = false; | 114 | result.xri_identity = false; |
111 | if(id.find("://")==string::npos) | 115 | if(id.find("://")==string::npos) |
112 | id.insert(0,"http://"); | 116 | id.insert(0,"http://"); |
113 | string::size_type fp = id.find('#'); | 117 | string::size_type fp = id.find('#'); |
114 | if(fp!=string::npos) { | 118 | if(fp!=string::npos) { |
115 | string::size_type qp = id.find('?'); | 119 | string::size_type qp = id.find('?'); |
116 | if(qp==string::npos || qp<fp) | 120 | if(qp==string::npos || qp<fp) |
117 | id.erase(fp); | 121 | id.erase(fp); |
118 | else if(qp>fp) | 122 | else if(qp>fp) |
119 | id.erase(fp,qp-fp); | 123 | id.erase(fp,qp-fp); |
120 | } | 124 | } |
121 | result.normalized_id = util::rfc_3986_normalize_uri(id); | 125 | result.normalized_id = util::rfc_3986_normalize_uri(id); |
122 | discover_at(result,id,xmode_html|xmode_xrd); | 126 | discover_at(result,id,xmode_html|xmode_xrd); |
123 | const char * eu = 0; | 127 | const char * eu = 0; |
124 | CURLcode r = easy_getinfo(CURLINFO_EFFECTIVE_URL,&eu); | 128 | CURLcode r = easy_getinfo(CURLINFO_EFFECTIVE_URL,&eu); |
125 | if(r) | 129 | if(r) |
126 | throw exception_curl(OPKELE_CP_ "failed to get CURLINFO_EFFECTIVE_URL",r); | 130 | throw exception_curl(OPKELE_CP_ "failed to get CURLINFO_EFFECTIVE_URL",r); |
127 | result.canonicalized_id = util::rfc_3986_normalize_uri(eu); /* XXX: strip fragment part? */ | 131 | result.canonicalized_id = util::rfc_3986_normalize_uri(eu); /* XXX: strip fragment part? */ |
128 | if(xrds_location.empty()) { | 132 | if(xrds_location.empty()) { |
129 | html2xrd(result.xrd); | 133 | html2xrd(result.xrd); |
130 | }else{ | 134 | }else{ |
131 | discover_at(result,xrds_location,xmode_xrd); | 135 | discover_at(result,xrds_location,xmode_xrd); |
132 | if(result.xrd.empty()) | 136 | if(result.xrd.empty()) |
133 | html2xrd(result.xrd); | 137 | html2xrd(result.xrd); |
134 | } | 138 | } |
135 | } | 139 | } |
136 | } | 140 | } |
137 | 141 | ||
138 | void discover_at(idiscovery_t& result,const string& url,int xm) { | 142 | void discover_at(idiscovery_t& result,const string& url,int xm) { |
139 | CURLcode r = easy_setopt(CURLOPT_URL,url.c_str()); | 143 | CURLcode r = easy_setopt(CURLOPT_URL,url.c_str()); |
140 | if(r) | 144 | if(r) |
141 | throw exception_curl(OPKELE_CP_ "failed to set culry urlie",r); | 145 | throw exception_curl(OPKELE_CP_ "failed to set culry urlie",r); |
142 | 146 | ||
143 | (*(expat_t*)this) = parser_create_ns(); | ||
144 | set_user_data(); set_element_handler(); | ||
145 | set_character_data_handler(); | ||
146 | |||
147 | http_content_type.clear(); | 147 | http_content_type.clear(); |
148 | xmode = xm; | 148 | xmode = xm; |
149 | prepare_to_parse(); | ||
149 | if(xmode&xmode_html) { | 150 | if(xmode&xmode_html) { |
150 | xrds_location.clear(); | 151 | xrds_location.clear(); |
151 | html_openid1.clear(); html_openid2.clear(); | 152 | save_html.clear(); |
153 | save_html.reserve(max_html); | ||
152 | } | 154 | } |
153 | xrd = &result.xrd; | 155 | xrd = &result.xrd; |
154 | cdata = 0; xrd_service = 0; skipping = 0; | ||
155 | status_code = 100; status_string.clear(); | ||
156 | 156 | ||
157 | r = easy_perform(); | 157 | r = easy_perform(); |
158 | if(r && r!=CURLE_WRITE_ERROR) | 158 | if(r && r!=CURLE_WRITE_ERROR) |
159 | throw exception_curl(OPKELE_CP_ "failed to perform curly request",r); | 159 | throw exception_curl(OPKELE_CP_ "failed to perform curly request",r); |
160 | 160 | ||
161 | parse(0,0,true); | 161 | if(!parser_choked) { |
162 | parse(0,0,true); | ||
163 | }else{ | ||
164 | /* TODO: do not bother if we've seen xml */ | ||
165 | try { | ||
166 | util::tidy_doc_t td = util::tidy_doc_t::create(); | ||
167 | if(!td) | ||
168 | throw exception_tidy(OPKELE_CP_ "failed to create htmltidy document"); | ||
169 | #ifndef NDEBUG | ||
170 | td.opt_set(TidyQuiet,false); | ||
171 | td.opt_set(TidyShowWarnings,false); | ||
172 | td.opt_set(TidyForceOutput,true); | ||
173 | td.opt_set(TidyXhtmlOut,true); | ||
174 | td.opt_set(TidyDoctypeMode,TidyDoctypeOmit); | ||
175 | td.opt_set(TidyMark,false); | ||
176 | #endif /* NDEBUG */ | ||
177 | if(td.parse_string(save_html)<=0) | ||
178 | throw exception_tidy(OPKELE_CP_ "tidy failed to parse document"); | ||
179 | if(td.clean_and_repair()<=0) | ||
180 | throw exception_tidy(OPKELE_CP_ "tidy failed to clean and repair"); | ||
181 | util::tidy_buf_t tide; | ||
182 | if(td.save_buffer(tide)<=0) | ||
183 | throw exception_tidy(OPKELE_CP_ "tidy failed to save buffer"); | ||
184 | prepare_to_parse(); | ||
185 | parse(tide.c_str(),tide.size(),true); | ||
186 | }catch(exception_tidy& et) { } | ||
187 | } | ||
188 | save_html.clear(); | ||
189 | } | ||
190 | |||
191 | void prepare_to_parse() { | ||
192 | (*(expat_t*)this) = parser_create_ns(); | ||
193 | set_user_data(); set_element_handler(); | ||
194 | set_character_data_handler(); | ||
195 | |||
196 | if(xmode&xmode_html) { | ||
197 | html_openid1.clear(); html_openid2.clear(); | ||
198 | parser_choked = false; | ||
199 | } | ||
200 | |||
201 | cdata = 0; xrd_service = 0; skipping = 0; | ||
202 | status_code = 100; status_string.clear(); | ||
162 | } | 203 | } |
163 | 204 | ||
164 | void html2xrd(XRD_t& x) { | 205 | void html2xrd(XRD_t& x) { |
165 | if(!html_openid1.uris.empty()) { | 206 | if(!html_openid1.uris.empty()) { |
166 | html_openid1.types.insert(STURI_OPENID11); | 207 | html_openid1.types.insert(STURI_OPENID11); |
167 | x.services.add(-1,html_openid1); | 208 | x.services.add(-1,html_openid1); |
168 | } | 209 | } |
169 | if(!html_openid2.uris.empty()) { | 210 | if(!html_openid2.uris.empty()) { |
170 | html_openid2.types.insert(STURI_OPENID20); | 211 | html_openid2.types.insert(STURI_OPENID20); |
171 | x.services.add(-1,html_openid2); | 212 | x.services.add(-1,html_openid2); |
172 | } | 213 | } |
173 | } | 214 | } |
174 | 215 | ||
175 | size_t write(void *p,size_t s,size_t nm) { | 216 | size_t write(void *p,size_t s,size_t nm) { |
176 | if(skipping<0) return 0; | ||
177 | /* TODO: limit total size */ | 217 | /* TODO: limit total size */ |
178 | size_t bytes = s*nm; | 218 | size_t bytes = s*nm; |
179 | bool rp = parse((const char *)p,bytes,false); | 219 | const char *inbuf = (const char*)p; |
220 | if(xmode&xmode_html) { | ||
221 | size_t mbts = save_html.capacity()-save_html.size(); | ||
222 | size_t bts = 0; | ||
223 | if(mbts>0) { | ||
224 | bts = (bytes>mbts)?mbts:bytes; | ||
225 | save_html.append(inbuf,bts); | ||
226 | } | ||
227 | if(skipping<0) return bts; | ||
228 | } | ||
229 | if(skipping<0) return 0; | ||
230 | bool rp = parse(inbuf,bytes,false); | ||
180 | if(!rp) { | 231 | if(!rp) { |
232 | parser_choked = true; | ||
181 | skipping = -1; | 233 | skipping = -1; |
182 | bytes = 0; | 234 | if(!(xmode&xmode_html)) |
235 | bytes = 0; | ||
183 | } | 236 | } |
184 | return bytes; | 237 | return bytes; |
185 | } | 238 | } |
186 | size_t header(void *p,size_t s,size_t nm) { | 239 | size_t header(void *p,size_t s,size_t nm) { |
187 | size_t bytes = s*nm; | 240 | size_t bytes = s*nm; |
188 | const char *h = (const char*)p; | 241 | const char *h = (const char*)p; |
189 | const char *colon = (const char*)memchr(p,':',bytes); | 242 | const char *colon = (const char*)memchr(p,':',bytes); |
190 | const char *space = (const char*)memchr(p,' ',bytes); | 243 | const char *space = (const char*)memchr(p,' ',bytes); |
191 | if(space && ( (!colon) || space<colon ) ) { | 244 | if(space && ( (!colon) || space<colon ) ) { |
192 | xrds_location.clear(); http_content_type.clear(); | 245 | xrds_location.clear(); http_content_type.clear(); |
193 | }else if(colon) { | 246 | }else if(colon) { |
194 | const char *hv = ++colon; | 247 | const char *hv = ++colon; |
195 | int hnl = colon-h; | 248 | int hnl = colon-h; |
196 | int rb; | 249 | int rb; |
197 | for(rb = bytes-hnl-1;rb>0 && isspace(*hv);++hv,--rb); | 250 | for(rb = bytes-hnl-1;rb>0 && isspace(*hv);++hv,--rb); |
198 | while(rb>0 && isspace(hv[rb-1])) --rb; | 251 | while(rb>0 && isspace(hv[rb-1])) --rb; |
199 | if(rb) { | 252 | if(rb) { |
200 | if( (hnl>=sizeof(XRDS_HEADER)) | 253 | if( (hnl>=sizeof(XRDS_HEADER)) |
201 | && !strncasecmp(h,XRDS_HEADER":", | 254 | && !strncasecmp(h,XRDS_HEADER":", |
202 | sizeof(XRDS_HEADER)) ) { | 255 | sizeof(XRDS_HEADER)) ) { |
203 | xrds_location.assign(hv,rb); | 256 | xrds_location.assign(hv,rb); |
204 | }else if( (hnl>=sizeof(CT_HEADER)) | 257 | }else if( (hnl>=sizeof(CT_HEADER)) |
205 | && !strncasecmp(h,CT_HEADER":", | 258 | && !strncasecmp(h,CT_HEADER":", |
206 | sizeof(CT_HEADER)) ) { | 259 | sizeof(CT_HEADER)) ) { |
207 | const char *sc = (const char*)memchr( | 260 | const char *sc = (const char*)memchr( |
208 | hv,';',rb); | 261 | hv,';',rb); |
209 | http_content_type.assign(hv,sc?(sc-hv):rb); | 262 | http_content_type.assign(hv,sc?(sc-hv):rb); |
210 | } | 263 | } |
211 | } | 264 | } |
212 | } | 265 | } |
213 | return curl_t::header(p,s,nm); | 266 | return curl_t::header(p,s,nm); |
214 | } | 267 | } |
215 | 268 | ||
216 | void start_element(const XML_Char *n,const XML_Char **a) { | 269 | void start_element(const XML_Char *n,const XML_Char **a) { |
217 | if(skipping<0) return; | 270 | if(skipping<0) return; |
218 | if(skipping) { | 271 | if(skipping) { |
219 | if(xmode&xmode_html) | 272 | if(xmode&xmode_html) |
220 | html_start_element(n,a); | 273 | html_start_element(n,a); |
221 | ++skipping; return; | 274 | ++skipping; return; |
222 | } | 275 | } |
223 | if(pt_stack.empty()) { | 276 | if(pt_stack.empty()) { |
224 | if(is_qelement(n,NSURI_XRDS "\tXRDS")) | 277 | if(is_qelement(n,NSURI_XRDS "\tXRDS")) |
225 | return; | 278 | return; |
226 | if(is_qelement(n,NSURI_XRD "\tXRD")) { | 279 | if(is_qelement(n,NSURI_XRD "\tXRD")) { |
227 | assert(xrd); | 280 | assert(xrd); |
228 | xrd->clear(); | 281 | xrd->clear(); |
229 | pt_stack.push_back(n); | 282 | pt_stack.push_back(n); |
230 | }else if(xmode&xmode_html) { | 283 | }else if(xmode&xmode_html) { |