author | Michael Krelin <hacker@klever.net> | 2008-01-05 21:47:04 (UTC) |
---|---|---|
committer | Michael Krelin <hacker@klever.net> | 2008-01-05 22:03:59 (UTC) |
commit | 76f52a8fd79dd12680752c017d67d4be01f0afbc (patch) (side-by-side diff) | |
tree | 42d640112a0381707d36ac3d72e48937978d911e | |
parent | a0719fb611507d8b9962b87c600855d8837fc266 (diff) | |
download | libopkele-76f52a8fd79dd12680752c017d67d4be01f0afbc.zip libopkele-76f52a8fd79dd12680752c017d67d4be01f0afbc.tar.gz libopkele-76f52a8fd79dd12680752c017d67d4be01f0afbc.tar.bz2 |
Made HTML discovery more robust by using htmltidy
Now, when parsing a document that we expect might be HTML, we also save the
first 16K of the document to a buffer, and if the parser chokes we run the
saved data through htmltidy and feed the output to the parser again.
Signed-off-by: Michael Krelin <hacker@klever.net>
-rw-r--r-- | lib/Makefile.am | 4 | ||||
-rw-r--r-- | lib/discovery.cc | 71 |
2 files changed, 64 insertions, 11 deletions
diff --git a/lib/Makefile.am b/lib/Makefile.am index 9c73e96..989de28 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -8,10 +8,10 @@ INCLUDES = \ ${OPENSSL_CFLAGS} \ ${LIBCURL_CPPFLAGS} \ - ${PCRE_CFLAGS} ${EXPAT_CFLAGS} + ${PCRE_CFLAGS} ${EXPAT_CFLAGS} ${TIDY_CFLAGS} libopkele_la_LIBADD = \ ${LIBCURL} \ ${PCRE_LIBS} ${EXPAT_LIBS} \ ${OPENSSL_LIBS} \ - ${KONFORKA_LIBS} + ${KONFORKA_LIBS} ${TIDY_LIBS} libopkele_la_SOURCES = \ diff --git a/lib/discovery.cc b/lib/discovery.cc index 8729cfb..a308b56 100644 --- a/lib/discovery.cc +++ b/lib/discovery.cc @@ -6,4 +6,5 @@ #include <opkele/exception.h> #include <opkele/util.h> +#include <opkele/tidy.h> #include <opkele/debug.h> @@ -20,4 +21,5 @@ namespace opkele { static const char *whitespace = " \t\r\n"; static const char *i_leaders = "=@+$!("; + static const size_t max_html = 16384; static inline bool is_qelement(const XML_Char *n,const char *qen) { @@ -62,4 +64,6 @@ namespace opkele { pt_stack_t pt_stack; int skipping; + bool parser_choked; + string save_html; XRD_t *xrd; @@ -141,17 +145,13 @@ namespace opkele { throw exception_curl(OPKELE_CP_ "failed to set culry urlie",r); - (*(expat_t*)this) = parser_create_ns(); - set_user_data(); set_element_handler(); - set_character_data_handler(); - http_content_type.clear(); xmode = xm; + prepare_to_parse(); if(xmode&xmode_html) { xrds_location.clear(); - html_openid1.clear(); html_openid2.clear(); + save_html.clear(); + save_html.reserve(max_html); } xrd = &result.xrd; - cdata = 0; xrd_service = 0; skipping = 0; - status_code = 100; status_string.clear(); r = easy_perform(); @@ -159,5 +159,46 @@ namespace opkele { throw exception_curl(OPKELE_CP_ "failed to perform curly request",r); + if(!parser_choked) { parse(0,0,true); + }else{ + /* TODO: do not bother if we've seen xml */ + try { + util::tidy_doc_t td = util::tidy_doc_t::create(); + if(!td) + throw exception_tidy(OPKELE_CP_ "failed to create htmltidy document"); +#ifndef NDEBUG + td.opt_set(TidyQuiet,false); + 
td.opt_set(TidyShowWarnings,false); + td.opt_set(TidyForceOutput,true); + td.opt_set(TidyXhtmlOut,true); + td.opt_set(TidyDoctypeMode,TidyDoctypeOmit); + td.opt_set(TidyMark,false); +#endif /* NDEBUG */ + if(td.parse_string(save_html)<=0) + throw exception_tidy(OPKELE_CP_ "tidy failed to parse document"); + if(td.clean_and_repair()<=0) + throw exception_tidy(OPKELE_CP_ "tidy failed to clean and repair"); + util::tidy_buf_t tide; + if(td.save_buffer(tide)<=0) + throw exception_tidy(OPKELE_CP_ "tidy failed to save buffer"); + prepare_to_parse(); + parse(tide.c_str(),tide.size(),true); + }catch(exception_tidy& et) { } + } + save_html.clear(); + } + + void prepare_to_parse() { + (*(expat_t*)this) = parser_create_ns(); + set_user_data(); set_element_handler(); + set_character_data_handler(); + + if(xmode&xmode_html) { + html_openid1.clear(); html_openid2.clear(); + parser_choked = false; + } + + cdata = 0; xrd_service = 0; skipping = 0; + status_code = 100; status_string.clear(); } @@ -174,10 +215,22 @@ namespace opkele { size_t write(void *p,size_t s,size_t nm) { - if(skipping<0) return 0; /* TODO: limit total size */ size_t bytes = s*nm; - bool rp = parse((const char *)p,bytes,false); + const char *inbuf = (const char*)p; + if(xmode&xmode_html) { + size_t mbts = save_html.capacity()-save_html.size(); + size_t bts = 0; + if(mbts>0) { + bts = (bytes>mbts)?mbts:bytes; + save_html.append(inbuf,bts); + } + if(skipping<0) return bts; + } + if(skipping<0) return 0; + bool rp = parse(inbuf,bytes,false); if(!rp) { + parser_choked = true; skipping = -1; + if(!(xmode&xmode_html)) bytes = 0; } |