目录
#if USE_HTML_PARSER_SGML
/// <summary>
/// Converts an HTML string into an <see cref="XmlDocument"/> by feeding it through an SgmlReader.
/// </summary>
/// <param name="html">The HTML source text to convert. Must not be null.</param>
/// <returns>
/// A new <see cref="XmlDocument"/> loaded from the SgmlReader output, created with
/// PreserveWhitespace enabled and no XmlResolver.
/// </returns>
/// <exception cref="System.ArgumentNullException">Thrown when <paramref name="html"/> is null.</exception>
public XmlDocument htmlToXmlDoc(string html)
{
    // Fail early with the caller-visible parameter name; without this guard the
    // StringReader constructor below would throw for a null string anyway.
    if (html == null)
    {
        throw new System.ArgumentNullException("html");
    }

    // NOTE(review): entities are decoded BEFORE parsing, so escaped markup such as
    // "&lt;b&gt;" reaches the parser as a real tag. Kept as-is because existing callers
    // may rely on it -- confirm whether this is intended.
    string decodedHtml = HttpUtility.HtmlDecode(html);

    // Both readers are disposable; release them once the document has been fully loaded.
    using (StringReader htmlReader = new StringReader(decodedHtml))
    using (SgmlReader sgmlReader = new SgmlReader())
    {
        // setup SgmlReader
        sgmlReader.DocType = "HTML";
        sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
        sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
        sgmlReader.InputStream = htmlReader;

        // create document
        XmlDocument xmlDoc = new XmlDocument();
        xmlDoc.PreserveWhitespace = true;
        // No resolver: the document will not try to fetch external resources (e.g. a DTD).
        xmlDoc.XmlResolver = null;
        xmlDoc.Load(sgmlReader);
        return xmlDoc;
    }
}
#endif
例 13.1. htmlToXmlDoc 的使用范例
//(1) source page with xmlns: fetch the HTML, then convert it to an XmlDocument
string searchPageUrl = "http://fiverr.com/gigs/search?utf8=%E2%9C%93&query=seo&x=15&y=13&page=2";
string searchPageHtml = getUrlRespHtml(searchPageUrl);
XmlDocument xmlDocWithNs = htmlToXmlDoc(searchPageHtml);
另外，贴出完整的示例代码：
//example code for html parse
void _demoHtmlParse()
{
#if USE_HTML_PARSER_SGML
//Method 1: use htmlToXmlDoc
//(1) with xmlns
string withXmlnsUrl = "http://fiverr.com/gigs/search?utf8=%E2%9C%93&query=seo&x=15&y=13&page=2";
string withXmlnsHtml = getUrlRespHtml(withXmlnsUrl);
XmlDocument xmlDocWithNs = htmlToXmlDoc(withXmlnsHtml);
//<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
//<html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en" >
// <head>
// ...
XmlNamespaceManager m = new XmlNamespaceManager(xmlDocWithNs.NameTable);
m.AddNamespace("w3org", "http://www.w3.org/1999/xhtml");
XmlNode titleNode = xmlDocWithNs.SelectSingleNode("//w3org:h1[@itemprop='name']", m);
string title = titleNode.InnerText;
//(2) without xmlns
string withoutXmlnsUrl = "http://www.amazon.com/gp/new-releases/appliances/ref=zg_bsnr_nav_0";
//<!DOCTYPE html>
//<html>
//<head>
//...
string withoutXmlnsHtml = getUrlRespHtml(withoutXmlnsUrl);
XmlDocument xmlDocNoNs = htmlToXmlDoc(withoutXmlnsHtml);
XmlNodeList pageNodeList = xmlDocNoNs.SelectNodes("//ol[@class='zg_pagination']/li[@class]");
#endif
//common part
//how to use Attributes
//XmlNodeList pageNodeList = xmlDoc.SelectNodes("//ol[@class='zg_pagination']/li[@class]");
//if (pageNodeList != null)
//{
// for (int pageIdx = 1; pageIdx < pageNodeList.Count; pageIdx++)
// {
// XmlNode curPageNode = pageNodeList[pageIdx];
// //<li class="zg_page " id="zg_page2"><a page="2" ajaxUrl="http://www.amazon.com/gp/new-releases/appliances/ref=zg_bsnr_appliances_pg_2/191-0874592-3518518?ie=UTF8&pg=2&ajax=1" href="http://www.amazon.com/gp/new-releases/appliances/ref=zg_bsnr_appliances_pg_2/191-0874592-3518518?ie=UTF8&pg=2">21-40</a></li>
// XmlNode ajaxUrlNode = curPageNode.SelectSingleNode(".//a[@href]");
// string pageUrl = ajaxUrlNode.Attributes["href"].Value;
// }
//}
#if USE_HTML_PARSER_HTMLAGILITYPACK
//Method 2: use htmlToHtmlDoc
string testUrlWithXmlns = "http://sd.csdn.net/";
string respHtml = getUrlRespHtml(testUrlWithXmlns);
//<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
//<html xmlns="http://www.w3.org/1999/xhtml">
//<head>
HtmlAgilityPack.HtmlDocument htmlDoc = htmlToHtmlDoc(respHtml);
//<div class="tabcontent" id="sc1">
// <ul>
// <li><a href="http://www.csdn.net/article/tag/%E4%BA%A7%E5%93%81" target="_blank">产品</a></li>
// ......
// <li><a href="http://www.csdn.net/article/tag/%E8%AE%BE%E8%AE%A1" target="_blank">设计</a></li>
// </ul>
//</div>
//...
//<div class="tabcontent" id="sc4">
// <ul>
// ...
// <li><a href="http://www.csdn.net/article/tag/%E6%95%B0%E6%8D%AE%E5%BA%93" target="_blank">数据库</a></li>
// </ul>
//</div>
//here, no need to take care the html xmlns
//is better than SGMLReader
HtmlNode rootHtmlNode = htmlDoc.DocumentNode;
HtmlNodeCollection htmlNodes = rootHtmlNode.SelectNodes("//div[@class='tabcontent']");
foreach (HtmlNode link in htmlNodes)
{
HtmlAttribute att = link.Attributes["id"];
string idHref = att.Value;
}