author | Michael Krelin <hacker@klever.net> | 2007-09-14 22:52:21 (UTC) |
---|---|---|
committer | Michael Krelin <hacker@klever.net> | 2007-09-14 22:52:21 (UTC) |
commit | 5f1d69ac7753243b93761944e9444f01d8a7e5dd (patch) (side-by-side diff) | |
tree | 43ded5d2da27a54f3fa806ddc6f054970d3cb6fc | |
parent | 429c48d3d08e6c2f6c385d8975f7b5bf5e67acf3 (diff) | |
download | libopkele-5f1d69ac7753243b93761944e9444f01d8a7e5dd.zip libopkele-5f1d69ac7753243b93761944e9444f01d8a7e5dd.tar.gz libopkele-5f1d69ac7753243b93761944e9444f01d8a7e5dd.tar.bz2 |
robustness improvements in link discovery mechanisms
accept stray spaces in link tags and inside relevant attribute values.
Thanks, Gen, for pointing it out.
Signed-off-by: Michael Krelin <hacker@klever.net>
-rw-r--r-- | lib/consumer.cc | 8 |
1 files changed, 4 insertions, 4 deletions
diff --git a/lib/consumer.cc b/lib/consumer.cc index 299b3bc..ff5da91 100644 --- a/lib/consumer.cc +++ b/lib/consumer.cc @@ -278,28 +278,28 @@ namespace opkele { CURLcode r; (r=curl_misc_sets(curl)) || (r=curl_easy_setopt(curl,CURLOPT_URL,url.c_str())) || (r=curl_easy_setopt(curl,CURLOPT_WRITEFUNCTION,_curl_tostring)) || (r=curl_easy_setopt(curl,CURLOPT_WRITEDATA,&html)) ; if(r) throw exception_curl(OPKELE_CP_ "failed to curl_easy_setopt()",r); r = curl_easy_perform(curl); if(r && r!=CURLE_WRITE_ERROR) throw exception_curl(OPKELE_CP_ "failed to curl_easy_perform()",r); // strip out everything past body - static const char *re_hdre = "<head[^>]*>", - *re_lre = "<link\\b([^>]+)>", - *re_rre = "\\brel=['\"]([^'\"]+)['\"]", - *re_hre = "\\bhref=['\"]([^'\"]+)['\"]"; + static const char *re_hdre = "<\\s*head[^>]*>", + *re_lre = "<\\s*link\\b([^>]+)>", + *re_rre = "\\brel\\s*=\\s*['\"]\\s*([^'\"\\s]+)\\s*['\"]", + *re_hre = "\\bhref\\s*=\\s*['\"]\\s*([^'\"\\s]+)\\s*['\"]"; #if defined(USE_LIBPCRECPP) static pcrecpp::RE_Options ro(PCRE_CASELESS|PCRE_DOTALL); static pcrecpp::RE bre("<body\\b.*",ro), hdre(re_hdre,ro), lre(re_lre,ro), rre(re_rre), hre(re_hre,ro); bre.Replace("",&html); pcrecpp::StringPiece hpiece(html); if(!hdre.FindAndConsume(&hpiece)) throw bad_input(OPKELE_CP_ "failed to find head"); string attrs; while(lre.FindAndConsume(&hpiece,&attrs)) { pcrecpp::StringPiece rel, href; |