summaryrefslogtreecommitdiffabout
authorMichael Krelin <hacker@klever.net>2007-09-14 22:52:21 (UTC)
committer Michael Krelin <hacker@klever.net>2007-09-14 22:52:21 (UTC)
commit5f1d69ac7753243b93761944e9444f01d8a7e5dd (patch) (side-by-side diff)
tree43ded5d2da27a54f3fa806ddc6f054970d3cb6fc
parent429c48d3d08e6c2f6c385d8975f7b5bf5e67acf3 (diff)
downloadlibopkele-5f1d69ac7753243b93761944e9444f01d8a7e5dd.zip
libopkele-5f1d69ac7753243b93761944e9444f01d8a7e5dd.tar.gz
libopkele-5f1d69ac7753243b93761944e9444f01d8a7e5dd.tar.bz2
robustness improvements in link discovery mechanisms
accept stray spaces in link tags and inside relevant attribute values. Thanks, Gen, for pointing it out. Signed-off-by: Michael Krelin <hacker@klever.net>
Diffstat (more/less context) (ignore whitespace changes)
-rw-r--r-- lib/consumer.cc | 8
1 files changed, 4 insertions, 4 deletions
diff --git a/lib/consumer.cc b/lib/consumer.cc
index 299b3bc..ff5da91 100644
--- a/lib/consumer.cc
+++ b/lib/consumer.cc
@@ -278,28 +278,28 @@ namespace opkele {
CURLcode r;
(r=curl_misc_sets(curl))
|| (r=curl_easy_setopt(curl,CURLOPT_URL,url.c_str()))
|| (r=curl_easy_setopt(curl,CURLOPT_WRITEFUNCTION,_curl_tostring))
|| (r=curl_easy_setopt(curl,CURLOPT_WRITEDATA,&html))
;
if(r)
throw exception_curl(OPKELE_CP_ "failed to curl_easy_setopt()",r);
r = curl_easy_perform(curl);
if(r && r!=CURLE_WRITE_ERROR)
throw exception_curl(OPKELE_CP_ "failed to curl_easy_perform()",r);
// strip out everything past body
- static const char *re_hdre = "<head[^>]*>",
- *re_lre = "<link\\b([^>]+)>",
- *re_rre = "\\brel=['\"]([^'\"]+)['\"]",
- *re_hre = "\\bhref=['\"]([^'\"]+)['\"]";
+ static const char *re_hdre = "<\\s*head[^>]*>",
+ *re_lre = "<\\s*link\\b([^>]+)>",
+ *re_rre = "\\brel\\s*=\\s*['\"]\\s*([^'\"\\s]+)\\s*['\"]",
+ *re_hre = "\\bhref\\s*=\\s*['\"]\\s*([^'\"\\s]+)\\s*['\"]";
#if defined(USE_LIBPCRECPP)
static pcrecpp::RE_Options ro(PCRE_CASELESS|PCRE_DOTALL);
static pcrecpp::RE
bre("<body\\b.*",ro), hdre(re_hdre,ro),
lre(re_lre,ro), rre(re_rre), hre(re_hre,ro);
bre.Replace("",&html);
pcrecpp::StringPiece hpiece(html);
if(!hdre.FindAndConsume(&hpiece))
throw bad_input(OPKELE_CP_ "failed to find head");
string attrs;
while(lre.FindAndConsume(&hpiece,&attrs)) {
pcrecpp::StringPiece rel, href;