XPath, the XML Path Language, is a query language for selecting nodes from an XML document. The following code illustrates how to select HTML nodes with XPath on the fly, using HtmlAgilityPack and a WebClient.
You need to add a reference for HtmlAgilityPack; I've used version 1.4.0.1.
You can refer to http://htmlagilitypack.codeplex.com/releases/view/44954 to download the .dll
In this article we have used it to get multiple nodes using SelectNodesByPattern, which is an extension method.
We opened this URL (http://htmlagilitypack.codeplex.com/releases/view/44954) in the browser and found the value "1.4.0 Stable".
We've tried to fetch the value of every "h1" heading tag having class='page_title'; kindly refer to the following code:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using HtmlAgilityPack;
using System.Net;
using System.IO;
namespace DescendantUsingXPath
{
/// <summary>
/// Console sample that downloads a web page, parses it with HtmlAgilityPack and
/// prints the inner text of the nodes selected by a semicolon-separated XPath pattern.
/// Also hosts the <c>SelectNodesByPattern</c> extension method used for that selection.
/// </summary>
public static class Program
{
    /// <summary>
    /// Downloads the release page, loads it into an HtmlDocument and writes the inner
    /// text of every h1 node with class 'page_title' to the console.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        // Create a new HtmlAgilityPack.HtmlDocument to hold the parsed page.
        HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();

        // WebClient and the stream are disposable, so release them as soon as the page is loaded.
        using (WebClient client = new WebClient())
        {
            // Download the given URL as a byte array using DownloadData.
            byte[] byteArray = client.DownloadData(new Uri("http://htmlagilitypack.codeplex.com/releases/view/44954"));

            // Wrap the byte array in a stream and load it into the HTML document.
            using (Stream stream = new MemoryStream(byteArray))
            {
                htmlDoc.Load(stream);
            }
        }

        // Starting from a descendant form, select all h1 headings having class=page_title.
        // The XPath is kept on a single line: a line break inside the verbatim literal
        // previously turned the element name into "h" instead of "h1".
        HtmlNodeCollection inputNodes = htmlDoc.DocumentNode.SelectNodesByPattern("descendant::form;//h1[@class='page_title']");

        // SelectNodesByPattern may return null, so guard before iterating.
        if (inputNodes != null)
        {
            foreach (HtmlNode node in inputNodes)
            {
                string strValue = node.InnerText;
                Console.WriteLine(strValue);
            }
        }
        else
        {
            Console.WriteLine("No matching nodes were found.");
        }

        // Pause once at the end so the console output stays visible.
        Console.ReadLine();
    }

    /// <summary>
    /// Returns an HtmlNodeCollection for a pattern made of XPath expressions separated by ';'.
    /// Every expression except the last is applied with SelectSingleNode to step to a single
    /// node; the last expression is applied to that node with SelectNodes.
    /// Example: "//body;//form;//table[3];//input"
    /// </summary>
    /// <param name="node">HtmlAgilityPack.HtmlNode the first expression is evaluated against.</param>
    /// <param name="pattern">Semicolon-separated XPath expressions; must contain at least one ';'.</param>
    /// <returns>
    /// The result of SelectNodes for the last expression, returned unchanged; or null when
    /// <paramref name="node"/> or <paramref name="pattern"/> is null, when the pattern
    /// contains no ';', or when any intermediate expression selects no node.
    /// </returns>
    public static HtmlNodeCollection SelectNodesByPattern(this HtmlNode node, string pattern)
    {
        // A null pattern is treated like a null node instead of throwing NullReferenceException.
        if (node == null || pattern == null)
        {
            return null;
        }

        // Patterns without a separator are not handled by this helper.
        if (!pattern.Contains(";"))
        {
            return null;
        }

        string[] expressions = pattern.Trim().Split(';');
        int lastIndex = expressions.Length - 1;

        // Walk down one node per intermediate expression; stop as soon as a step finds nothing.
        HtmlNode current = node;
        for (int i = 0; i < lastIndex; i++)
        {
            current = current.SelectSingleNode(expressions[i]);
            if (current == null)
            {
                return null;
            }
        }

        // The final expression yields the collection handed back to the caller.
        return current.SelectNodes(expressions[lastIndex]);
    }
}
}
Output