Hello, I am trying to scrape a website (it's public data) and store the results in Excel or some other database. I am new to all this. I was able to download the HTML source to a text or Excel file, but the data is very disorganized. Basically, I want to organize the data into a readable format. These are the things I am trying to do:
1) Get data from the website's <div id="container"> element; there are links within it, so I want to follow each of those links and fetch data from the pages they point to.
2) The collected data should be readable and well formatted.
I could get the contents of the first page, but could not get into the links. Could you please suggest how I should proceed? I looked it up and found that HtmlAgilityPack is one way to do this, but I have never used it before and I am stuck. I have included the code I have written so far.
Thank you
This is my form class:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Net;
using System.IO;
namespace tryScrape1
{
public partial class Form1 : Form
{
    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Downloads the page at the URL typed into textBox1, keeps only the
    /// region between the "paddingbig" and "321,820" markers, and writes
    /// it to a timestamped file under c:\temp\.
    /// </summary>
    /// <param name="sender">The clicked button (unused).</param>
    /// <param name="e">Click event data (unused).</param>
    private void button1_Click(object sender, EventArgs e)
    {
        try
        {
            // URL entered by the user.
            string url = textBox1.Text;

            // Destination for the scraped data. The timestamp placeholder was
            // missing from the original format string, so DateTime.Now was
            // silently ignored and every run overwrote the same file.
            string directory = @"c:\temp\";
            string filename = String.Format("scrapped_data_{0:yyyyMMdd_HHmmss}.xls", DateTime.Now);
            string path = Path.Combine(directory, filename);
            Directory.CreateDirectory(directory); // no-op if it already exists

            // Download the raw HTML of the page.
            string sourceCode = GetSource.getSourceCode(url);

            // Markers delimiting the region of the page to keep.
            // NOTE(review): "paddingbig" / "321,820" look page-specific — confirm
            // they exist on every page you intend to scrape.
            int startIndex = sourceCode.IndexOf("paddingbig");
            int endIndex = sourceCode.IndexOf("321,820");
            if (startIndex < 0 || endIndex < startIndex)
            {
                // Guard: Substring would throw ArgumentOutOfRangeException
                // if either marker is absent.
                MessageBox.Show("Could not find the expected markers in the page.");
                return;
            }

            // Keep only the text between the two markers.
            sourceCode = sourceCode.Substring(startIndex, endIndex - startIndex);

            // 'using' guarantees the file is flushed and closed even if
            // Write throws. (The original also created an HttpWebRequest
            // here that was never used — removed as dead code.)
            using (StreamWriter sWriter = new StreamWriter(path))
            {
                sWriter.Write(sourceCode);
            }

            MessageBox.Show("Contents have been scraped!");
            textBox1.Clear();
        }
        catch (Exception ex)
        {
            // Covers a blank/invalid URL, network failure, and file I/O errors.
            // Report what actually happened instead of always blaming the URL.
            MessageBox.Show("Scraping failed: " + ex.Message);
        }
    }
}
}
GetSource Class:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.IO;
namespace tryScrape1
{
class GetSource
{
    /// <summary>
    /// Downloads and returns the full HTML source of the page at the given URL.
    /// </summary>
    /// <param name="url">Absolute HTTP/HTTPS URL of the page to fetch.</param>
    /// <returns>The response body decoded as text.</returns>
    /// <exception cref="WebException">Thrown when the request fails.</exception>
    /// <exception cref="UriFormatException">Thrown when the URL is malformed.</exception>
    public static string getSourceCode(string url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

        // 'using' disposes the response and reader even if ReadToEnd throws
        // mid-download; the original's manual Close() calls leaked both on
        // any exception.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader streamReader = new StreamReader(response.GetResponseStream()))
        {
            return streamReader.ReadToEnd();
        }
    }
}
}