summaryrefslogtreecommitdiffabout
authorMichael Krelin <hacker@klever.net>2007-09-14 22:52:21 (UTC)
committer Michael Krelin <hacker@klever.net>2007-09-14 22:52:21 (UTC)
commit5f1d69ac7753243b93761944e9444f01d8a7e5dd (patch) (unidiff)
tree43ded5d2da27a54f3fa806ddc6f054970d3cb6fc
parent429c48d3d08e6c2f6c385d8975f7b5bf5e67acf3 (diff)
downloadlibopkele-5f1d69ac7753243b93761944e9444f01d8a7e5dd.zip
libopkele-5f1d69ac7753243b93761944e9444f01d8a7e5dd.tar.gz
libopkele-5f1d69ac7753243b93761944e9444f01d8a7e5dd.tar.bz2
robustness improvements in link discovery mechanisms
accept stray spaces in link tags and inside relevant attribute values. Thanks, Gen, for pointing it out. Signed-off-by: Michael Krelin <hacker@klever.net>
Diffstat (more/less context) (ignore whitespace changes)
-rw-r--r--lib/consumer.cc8
1 files changed, 4 insertions, 4 deletions
diff --git a/lib/consumer.cc b/lib/consumer.cc
index 299b3bc..ff5da91 100644
--- a/lib/consumer.cc
+++ b/lib/consumer.cc
@@ -278,28 +278,28 @@ namespace opkele {
278 CURLcode r; 278 CURLcode r;
279 (r=curl_misc_sets(curl)) 279 (r=curl_misc_sets(curl))
280 || (r=curl_easy_setopt(curl,CURLOPT_URL,url.c_str())) 280 || (r=curl_easy_setopt(curl,CURLOPT_URL,url.c_str()))
281 || (r=curl_easy_setopt(curl,CURLOPT_WRITEFUNCTION,_curl_tostring)) 281 || (r=curl_easy_setopt(curl,CURLOPT_WRITEFUNCTION,_curl_tostring))
282 || (r=curl_easy_setopt(curl,CURLOPT_WRITEDATA,&html)) 282 || (r=curl_easy_setopt(curl,CURLOPT_WRITEDATA,&html))
283 ; 283 ;
284 if(r) 284 if(r)
285 throw exception_curl(OPKELE_CP_ "failed to curl_easy_setopt()",r); 285 throw exception_curl(OPKELE_CP_ "failed to curl_easy_setopt()",r);
286 r = curl_easy_perform(curl); 286 r = curl_easy_perform(curl);
287 if(r && r!=CURLE_WRITE_ERROR) 287 if(r && r!=CURLE_WRITE_ERROR)
288 throw exception_curl(OPKELE_CP_ "failed to curl_easy_perform()",r); 288 throw exception_curl(OPKELE_CP_ "failed to curl_easy_perform()",r);
289 // strip out everything past body 289 // strip out everything past body
290 static const char *re_hdre = "<head[^>]*>", 290 static const char *re_hdre = "<\\s*head[^>]*>",
291 *re_lre = "<link\\b([^>]+)>", 291 *re_lre = "<\\s*link\\b([^>]+)>",
292 *re_rre = "\\brel=['\"]([^'\"]+)['\"]", 292 *re_rre = "\\brel\\s*=\\s*['\"]\\s*([^'\"\\s]+)\\s*['\"]",
293 *re_hre = "\\bhref=['\"]([^'\"]+)['\"]"; 293 *re_hre = "\\bhref\\s*=\\s*['\"]\\s*([^'\"\\s]+)\\s*['\"]";
294#if defined(USE_LIBPCRECPP) 294#if defined(USE_LIBPCRECPP)
295 static pcrecpp::RE_Options ro(PCRE_CASELESS|PCRE_DOTALL); 295 static pcrecpp::RE_Options ro(PCRE_CASELESS|PCRE_DOTALL);
296 static pcrecpp::RE 296 static pcrecpp::RE
297 bre("<body\\b.*",ro), hdre(re_hdre,ro), 297 bre("<body\\b.*",ro), hdre(re_hdre,ro),
298 lre(re_lre,ro), rre(re_rre), hre(re_hre,ro); 298 lre(re_lre,ro), rre(re_rre), hre(re_hre,ro);
299 bre.Replace("",&html); 299 bre.Replace("",&html);
300 pcrecpp::StringPiece hpiece(html); 300 pcrecpp::StringPiece hpiece(html);
301 if(!hdre.FindAndConsume(&hpiece)) 301 if(!hdre.FindAndConsume(&hpiece))
302 throw bad_input(OPKELE_CP_ "failed to find head"); 302 throw bad_input(OPKELE_CP_ "failed to find head");
303 string attrs; 303 string attrs;
304 while(lre.FindAndConsume(&hpiece,&attrs)) { 304 while(lre.FindAndConsume(&hpiece,&attrs)) {
305 pcrecpp::StringPiece rel, href; 305 pcrecpp::StringPiece rel, href;