-rw-r--r-- | lib/Makefile.am | 4 | ||||
-rw-r--r-- | lib/discovery.cc | 75 |
2 files changed, 66 insertions, 13 deletions
diff --git a/lib/Makefile.am b/lib/Makefile.am index 9c73e96..989de28 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -8,10 +8,10 @@ INCLUDES = \ ${OPENSSL_CFLAGS} \ ${LIBCURL_CPPFLAGS} \ - ${PCRE_CFLAGS} ${EXPAT_CFLAGS} + ${PCRE_CFLAGS} ${EXPAT_CFLAGS} ${TIDY_CFLAGS} libopkele_la_LIBADD = \ ${LIBCURL} \ ${PCRE_LIBS} ${EXPAT_LIBS} \ ${OPENSSL_LIBS} \ - ${KONFORKA_LIBS} + ${KONFORKA_LIBS} ${TIDY_LIBS} libopkele_la_SOURCES = \ diff --git a/lib/discovery.cc b/lib/discovery.cc index 8729cfb..a308b56 100644 --- a/lib/discovery.cc +++ b/lib/discovery.cc @@ -6,4 +6,5 @@ #include <opkele/exception.h> #include <opkele/util.h> +#include <opkele/tidy.h> #include <opkele/debug.h> @@ -20,4 +21,5 @@ namespace opkele { static const char *whitespace = " \t\r\n"; static const char *i_leaders = "=@+$!("; + static const size_t max_html = 16384; static inline bool is_qelement(const XML_Char *n,const char *qen) { @@ -62,4 +64,6 @@ namespace opkele { pt_stack_t pt_stack; int skipping; + bool parser_choked; + string save_html; XRD_t *xrd; @@ -141,17 +145,13 @@ namespace opkele { throw exception_curl(OPKELE_CP_ "failed to set culry urlie",r); - (*(expat_t*)this) = parser_create_ns(); - set_user_data(); set_element_handler(); - set_character_data_handler(); - http_content_type.clear(); xmode = xm; + prepare_to_parse(); if(xmode&xmode_html) { xrds_location.clear(); - html_openid1.clear(); html_openid2.clear(); + save_html.clear(); + save_html.reserve(max_html); } xrd = &result.xrd; - cdata = 0; xrd_service = 0; skipping = 0; - status_code = 100; status_string.clear(); r = easy_perform(); @@ -159,5 +159,46 @@ namespace opkele { throw exception_curl(OPKELE_CP_ "failed to perform curly request",r); - parse(0,0,true); + if(!parser_choked) { + parse(0,0,true); + }else{ + /* TODO: do not bother if we've seen xml */ + try { + util::tidy_doc_t td = util::tidy_doc_t::create(); + if(!td) + throw exception_tidy(OPKELE_CP_ "failed to create htmltidy document"); +#ifndef NDEBUG + td.opt_set(TidyQuiet,false); + td.opt_set(TidyShowWarnings,false); + td.opt_set(TidyForceOutput,true); + td.opt_set(TidyXhtmlOut,true); + td.opt_set(TidyDoctypeMode,TidyDoctypeOmit); + td.opt_set(TidyMark,false); +#endif /* NDEBUG */ + if(td.parse_string(save_html)<=0) + throw exception_tidy(OPKELE_CP_ "tidy failed to parse document"); + if(td.clean_and_repair()<=0) + throw exception_tidy(OPKELE_CP_ "tidy failed to clean and repair"); + util::tidy_buf_t tide; + if(td.save_buffer(tide)<=0) + throw exception_tidy(OPKELE_CP_ "tidy failed to save buffer"); + prepare_to_parse(); + parse(tide.c_str(),tide.size(),true); + }catch(exception_tidy& et) { } + } + save_html.clear(); + } + + void prepare_to_parse() { + (*(expat_t*)this) = parser_create_ns(); + set_user_data(); set_element_handler(); + set_character_data_handler(); + + if(xmode&xmode_html) { + html_openid1.clear(); html_openid2.clear(); + parser_choked = false; + } + + cdata = 0; xrd_service = 0; skipping = 0; + status_code = 100; status_string.clear(); } @@ -174,11 +215,23 @@ namespace opkele { size_t write(void *p,size_t s,size_t nm) { - if(skipping<0) return 0; /* TODO: limit total size */ size_t bytes = s*nm; - bool rp = parse((const char *)p,bytes,false); + const char *inbuf = (const char*)p; + if(xmode&xmode_html) { + size_t mbts = save_html.capacity()-save_html.size(); + size_t bts = 0; + if(mbts>0) { + bts = (bytes>mbts)?mbts:bytes; + save_html.append(inbuf,bts); + } + if(skipping<0) return bts; + } + if(skipping<0) return 0; + bool rp = parse(inbuf,bytes,false); if(!rp) { + parser_choked = true; skipping = -1; - bytes = 0; + if(!(xmode&xmode_html)) + bytes = 0; } return bytes; |