我有以下使用HTMLAgilityPack分析某些示例HTML的C#控制台应用程序代码:
static void Main(string[] args)
{
string input = @"
<span style=""font-style: italic"">This is the title</span>.
This is the introductory text:
<ol>
<li>List Item One</li>
<li>List Item Two</li>
<li>List Item Three</li>
<li>This list item is nested:
<ol>
<li>List Item Four A.</li>
<li>List Item Four B.</li>
</ol>
Yes it is.
</li>
<li>List Item Five</li>
</ol>
This is the footer text. Last updated: July 20, 2014
";
HtmlDocument doc = new HtmlDocument();
try
{
doc.LoadHtml(input);
}
catch (Exception e)
{
LogIt("ERROR: " + e.Message);
return;
}
HtmlNode get_title = doc.DocumentNode.SelectSingleNode("//span");
if (get_title != null)
{
LogIt("Title: '" + get_title.InnerHtml + "'");
}
HtmlNodeCollection get_outer_lists = doc.DocumentNode.SelectNodes("//ol//li");
if (get_outer_lists != null)
{
foreach (HtmlNode hn_outer in get_outer_lists)
{
LogIt("Begin outer for");
LogIt("outer HTML: '" + hn_outer.OuterHtml + "'");
// Now fetch inner list, the text above the inner list, and the
// text below the inner list.
HtmlNodeCollection get_inner_lists = doc.DocumentNode.SelectNodes("//ol//li//ol//li");
if (get_inner_lists != null)
{
foreach (HtmlNode hn_inner in get_inner_lists)
{
LogIt("\tinner HTML: '" + hn_inner.OuterHtml + "'");
}
}
else
{
LogIt("ERROR: Could not get inner list");
}
}
}
else
{
LogIt("ERROR: Could not select //ol//li");
Console.Read();
return;
}
Console.Read();
return;
}
private static void LogIt(string str)
{
Console.WriteLine(str);
return;
}
...这是输出:
Title: 'This is the title'
Begin outer for
outer HTML: '<li>List Item One</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>List Item Two</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>List Item Three</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>This list item is nested:
<ol>
<li>List Item Four A.</li>
<li>List Item Four B.</li>
</ol>
Yes it is.
</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>List Item Four B.</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
Begin outer for
outer HTML: '<li>List Item Five</li>'
inner HTML: '<li>List Item Four A.</li>'
inner HTML: '<li>List Item Four B.</li>'
关于第一个问题,您可以text()
用来选择文本节点(HTML示例中的介绍性文本和页脚):
var result = doc.DocumentNode.SelectNodes("/text()");
foreach (HtmlNode r in result)
{
LogIt(r.InnerText);
}
更新 :
对于第二个问题(如果我理解正确的话),对于外循环XPath,您可以获取<ol>
具有另一个<ol>
节点后代的节点,然后<li>
从前面<ol>
提到的前者获取直接子节点,如下所示:
//ol[.//ol]/li
本文收集自互联网,转载请注明来源。
如有侵权,请联系[email protected] 删除。
我来说两句